def h_index_over_time():
    """Plot the summed h-index of all gendered researchers for each year.

    For every year in 1980..2017 the h-index of each document returned by
    ``researcher.find({'gender': {'$exists': 1}})`` is computed from the
    publications dated up to and including that year, and the per-year sums
    are drawn as a bar chart saved to
    ``charts_path + '/h_index_over_time.eps'``.

    ``researcher``, ``Chart`` and ``charts_path`` are module-level names
    defined outside this function.
    """

    def calculate_h_index(pubs, year):
        """Return the h-index of ``pubs`` counting only papers up to ``year``.

        Publications whose ``'year'`` or ``'citation'`` field is not a digit
        string are ignored. The h-index is the largest ``h`` such that ``h``
        of the remaining papers have at least ``h`` citations each.
        """
        citations = sorted(
            (int(pub['citation']) for pub in pubs
             if pub['year'].isdigit() and pub['citation'].isdigit()
             and int(pub['year']) <= year),
            reverse=True)
        h_index = 0
        # Ranks are 1-based: the paper at rank r counts towards the h-index
        # only if it has at least r citations. Sorting first makes the early
        # exit valid regardless of how the publications are stored.
        for rank, citation in enumerate(citations, start=1):
            if citation < rank:
                break
            h_index = rank
        return h_index

    start = 1980
    end = 2018
    years = range(start, end)

    # One pass over the collection instead of one query per year: every
    # document contributes its h-index to each year's running total.
    h_index_arr = [0] * len(years)
    for doc in researcher.find({'gender': {'$exists': 1}}):
        for slot, year in enumerate(years):
            h_index_arr[slot] += calculate_h_index(doc['pubs'], year)

    # Progress output: one summed h-index per year, in chronological order.
    for total in h_index_arr:
        print(total)

    chart = Chart(30, 30)
    labels = [str(year) for year in years]
    # The two trailing positional flags are passed through unchanged; their
    # meaning is defined by Chart.bar, which is not visible here.
    chart.bar(h_index_arr, end - start, labels, 'h-index change over time',
              'time', 'cumulative h_index', True, False)
    chart.save(charts_path + '/h_index_over_time.eps')
    chart.clear()
    chart.close()
def gender_favorite(top_k, sex='M'):
    """Plot the ``top_k`` most frequent disciplines for one gender.

    Documents in ``table`` whose ``gender`` equals ``sex`` are unwound on
    their ``labels`` field, grouped per label and sorted by descending
    count; the first ``top_k`` groups are drawn as a bar chart and saved to
    ``chart_path + '/<sex>_favorite_<top_k>'`` in EPS format.

    Args:
        top_k: Number of leading disciplines to plot.
        sex: Value matched against the ``gender`` field (default ``'M'``).

    ``table``, ``Chart`` and ``chart_path`` are module-level names defined
    outside this function.
    """
    ranked = table.aggregate([
        {'$match': {'gender': sex}},
        {'$unwind': '$labels'},
        {'$group': {'_id': {'label': '$labels'}, 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ])
    top_docs = list(ranked)[:top_k]
    count_arr = [doc['count'] for doc in top_docs]
    labels = [doc['_id']['label'] for doc in top_docs]

    # The title used to say "females'" unconditionally, which mislabelled the
    # default sex='M' chart. Unknown codes fall back to the raw value.
    group_noun = {'M': 'males', 'F': 'females'}.get(sex, str(sex))
    title = "The Top {0} {1}' favorite disciplines".format(top_k, group_noun)

    chart = Chart(100, 180)
    # NOTE(review): top_k is passed as the second argument even when fewer
    # than top_k disciplines were found -- confirm Chart.bar tolerates that.
    chart.bar(count_arr, top_k, labels, title, 'discipline',
              'researcher number', True, log=False, fontsize=120)
    chart.save(chart_path + '/{1}_favorite_{0}'.format(top_k, sex),
               format='eps')
    chart.clear()
def year_distribution(start, end):
    """Plot the total citation count of publications per publication year.

    Every publication of every document in ``copy`` is bucketed by its
    ``'year'`` field; publications outside ``[start, end)`` or with an
    unparsable year are skipped, and ``'citation'`` values that are not digit
    strings contribute nothing. The totals are drawn as a bar chart, saved
    as ``'pub_per_year'`` (PNG) and shown.

    Args:
        start: First year of the range (inclusive).
        end: Last year of the range (exclusive).

    ``copy``, ``Chart`` and ``chart_path`` are module-level names defined
    outside this function.
    """
    # Create the chart directory if it is missing.
    os.makedirs(chart_path, exist_ok=True)

    span = end - start
    citation_per_year = [0] * span
    year_labels = [str(year) for year in range(start, end)]

    for doc in copy.find({}, {'pubs': 1}):
        for pub in doc['pubs']:
            # Empty or otherwise non-numeric years are skipped rather than
            # aborting the whole run.
            try:
                year = int(pub['year'])
            except (TypeError, ValueError):
                continue
            if not start <= year < end:
                continue
            if pub['citation'].isdigit():
                citation_per_year[year - start] += int(pub['citation'])

    chart = Chart(50, 50)
    # NOTE(review): the title and the output file name say "publication"
    # while the plotted series is the citation total per year -- confirm
    # which of the two is intended.
    chart.bar(citation_per_year, span, year_labels,
              'publication number per year', 'year', 'number',
              log=False, fontsize=10)
    chart.save('pub_per_year', format='png')
    chart.show()
def gender_ratio(top_k):
    """Plot the male/female researcher ratio of the most popular disciplines.

    The ``top_k`` most frequent labels among documents that have a
    ``gender`` field are taken from ``researcher``; for each label the
    number of ``'M'`` documents is divided by the number of ``'F'``
    documents. Two EPS charts are written under ``charts_path``: one in
    popularity order (``gender_ratio_top_<top_k>.eps``) and one sorted by
    descending ratio (``ranked_gender_ratio_top_<top_k>``).

    Disciplines without any female researcher have no defined ratio; they
    are reported on stdout and left out of both charts.

    Args:
        top_k: Number of leading disciplines to consider.

    ``researcher``, ``Chart`` and ``charts_path`` are module-level names
    defined outside this function.
    """
    ranked = researcher.aggregate([
        {'$match': {'gender': {'$exists': 1}}},
        {'$unwind': '$labels'},
        {'$group': {'_id': {'label': '$labels'}, 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ])
    top_labels = [row['_id']['label'] for row in ranked][:top_k]

    label_ratios = []
    for discipline in top_labels:
        male_size = researcher.count({'gender': 'M', 'labels': discipline})
        female_size = researcher.count({'gender': 'F', 'labels': discipline})
        if female_size == 0:
            # Dividing would raise ZeroDivisionError and abort both charts.
            print('skipping {0}: no female researchers, ratio undefined'
                  .format(discipline))
            continue
        label_ratios.append((discipline, male_size / female_size))

    labels = [label for label, _ in label_ratios]
    ratios = [ratio for _, ratio in label_ratios]
    # The second argument of Chart.bar appears to be the number of bars
    # (every call site in view passes the data length), so it follows the
    # data actually plotted rather than the requested top_k.
    bar_count = len(ratios)

    # Chart 1: disciplines in popularity order.
    chart = Chart(top_k / 3, top_k / 2)
    chart.bar(ratios, bar_count, labels,
              'Gender Ratio in Top %s Disciplines' % top_k,
              'discipline', 'ratio', True, log=False, fontsize=top_k / 2)
    chart.save(charts_path + '/gender_ratio_top_{0}.eps'.format(top_k))
    chart.clear()
    chart.close()

    # Chart 2: the same disciplines ordered by descending ratio.
    by_ratio = sorted(label_ratios, key=lambda pair: pair[1], reverse=True)
    labels = [label for label, _ in by_ratio]
    ratios = [ratio for _, ratio in by_ratio]
    chart = Chart(100, 180)
    chart.bar(ratios, bar_count, labels,
              'Ranked Gender Ratio in Top %s Disciplines' % top_k,
              'discipline', 'ratio', True, log=False, fontsize=120)
    chart.save(charts_path + '/ranked_gender_ratio_top_{0}'.format(top_k),
               format='eps')
    chart.clear()
    chart.close()
def discipline_proportion(top_k):
    """Plot the cumulative share of researchers in the top disciplines.

    Labels of documents in ``table`` that have a ``gender`` field are
    counted and sorted by descending count. Each of the first ``top_k``
    counts is divided by the number of documents with a ``gender`` field,
    and the running sum of those shares is drawn as a bar chart saved to
    ``chart_path + '/cumulative_<top_k>'`` in EPS format.

    Args:
        top_k: Number of leading disciplines to plot.

    ``table``, ``Chart`` and ``chart_path`` are module-level names defined
    outside this function.
    """
    pipeline = [
        {'$match': {'gender': {'$exists': 1}}},
        {'$unwind': '$labels'},
        {'$group': {'_id': {'label': '$labels'}, 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ]
    ranked = list(table.aggregate(pipeline))
    total = table.count({'gender': {'$exists': 1}})

    # Walk the leading disciplines once, collecting the label and the
    # running sum of per-discipline shares side by side.
    labels = []
    cumulative_arr = []
    running = 0
    for entry in ranked[:top_k]:
        running += entry['count'] / total
        cumulative_arr.append(running)
        labels.append(entry['_id']['label'])

    chart = Chart(100, 150)
    # NOTE(review): 'propotion' is misspelled in the rendered title and axis
    # label, and top_k is passed as the second argument even if fewer labels
    # exist; both are kept exactly as they were.
    chart.bar(cumulative_arr, top_k, labels,
              'Cumulative propotion of most popular disciplines',
              'discipline', 'propotion', True, log=False, fontsize=100)
    chart.save(chart_path + '/cumulative_{0}'.format(top_k), format='eps')
    chart.clear()