def h_index_over_time():
    """Chart the summed h-index of all gendered researchers per year.

    For every year in ``range(1980, 2018)`` the h-index of each researcher
    document that has a ``gender`` field is computed from the publications
    dated that year or earlier, and the per-researcher values are summed.
    The yearly totals are printed, drawn as a bar chart and saved to
    ``charts_path + '/h_index_over_time.eps'``.
    """

    def calculate_h_index(pubs, year):
        """Return the h-index of ``pubs`` restricted to ``year`` or earlier.

        Publications whose ``year`` or ``citation`` field is not a digit
        string are ignored.
        """
        citations = sorted(
            (
                int(pub['citation'])
                for pub in pubs
                if pub['year'].isdigit()
                and pub['citation'].isdigit()
                and int(pub['year']) <= year
            ),
            reverse=True,
        )
        # The h-index is the largest rank h (1-based) such that the h-th most
        # cited paper has at least h citations.  Ranks must therefore start
        # at 1 and the citations must be in descending order; counting from 0
        # under-reports every non-empty profile by one.
        h_index = 0
        for rank, citation in enumerate(citations, start=1):
            if citation < rank:
                break
            h_index = rank
        return h_index

    start = 1980
    end = 2018

    # The query does not depend on the year, so run it once instead of once
    # per year and keep only the publication lists in memory.
    pubs_per_researcher = [
        doc['pubs']
        for doc in researcher.find({'gender': {'$exists': 1}}, {'pubs': 1})
    ]

    h_index_arr = []
    for year in range(start, end):
        yearly_total = sum(
            calculate_h_index(pubs, year) for pubs in pubs_per_researcher
        )
        print(yearly_total)
        h_index_arr.append(yearly_total)

    labels = [str(year) for year in range(start, end)]
    chart = Chart(30, 30)
    chart.bar(h_index_arr, end - start, labels, 'h-index change over time',
              'time', 'cumulative h_index', True, False)
    chart.save(charts_path + '/h_index_over_time.eps')
    chart.clear()
    chart.close()
def gender_favorite(top_k, sex='M'):
    """Chart the ``top_k`` most frequent labels among researchers of one gender.

    Args:
        top_k: number of labels to keep, taken from the top of the list
            sorted by descending researcher count.
        sex: value matched against the ``gender`` field (``'M'`` or ``'F'``
            elsewhere in this file).

    The bar chart is saved as ``chart_path + '/<sex>_favorite_<top_k>'`` in
    EPS format.
    """
    pipeline = [
        {'$match': {'gender': sex}},
        {'$unwind': '$labels'},
        {'$group': {'_id': {'label': '$labels'}, 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ]
    top_docs = list(table.aggregate(pipeline))[:top_k]
    count_arr = [doc['count'] for doc in top_docs]
    labels = [doc['_id']['label'] for doc in top_docs]

    # The title used to say "females'" unconditionally, which mislabelled the
    # chart for the default sex='M'.  Derive the wording from `sex` instead.
    group_name = {'M': 'males', 'F': 'females'}.get(
        sex, '{0} researchers'.format(sex))
    title = "The Top {0} {1}' favorite disciplines".format(top_k, group_name)

    chart = Chart(100, 180)
    chart.bar(count_arr, top_k, labels, title, 'discipline',
              'researcher number', True, log=False, fontsize=120)
    chart.save(chart_path + '/{1}_favorite_{0}'.format(top_k, sex),
               format='eps')
    chart.clear()
    # Release the figure like the other chart functions in this file do.
    chart.close()
def declare_prefrence_distribution(discipline='all researchers'):
    """Draw one pie chart per gender: researchers with vs. without labels.

    A researcher counts as "declared" when ``labels.0`` exists, i.e. the
    ``labels`` array has at least one element.

    Args:
        discipline: only used as the sub-directory name under
            ``charts_path`` where the two charts are saved; the counts are
            always taken over the whole collection.
    """
    import os

    out_dir = charts_path + '/{0}'.format(discipline)
    # Saving fails if the per-discipline directory has not been created yet
    # (previously this relied on another function having been run first).
    os.makedirs(out_dir, exist_ok=True)

    # (gender value, chart title, output file name).  Both titles now use
    # the same wording; the male one used to be garbled.
    groups = (
        ('F', "Females' preference in declaration of research domain",
         'female_declare_preference'),
        ('M', "Males' preference in declaration of research domain",
         'male_declare_preference'),
    )
    for gender, title, file_name in groups:
        # NOTE(review): Collection.count() was removed in PyMongo 4; switch
        # to count_documents() when the driver is upgraded.
        declared = researcher.count(
            {'gender': gender, 'labels.0': {'$exists': 1}})
        not_declared = researcher.count(
            {'gender': gender, 'labels.0': {'$exists': 0}})

        chart = Chart()
        chart.pie([not_declared, declared], title,
                  ['not declared', 'declared'])
        chart.save(out_dir + '/' + file_name)
        chart.clear()
        chart.close()
def year_distribution(start, end):
    """Chart the total citation count per publication year.

    Every publication of every document in the ``copy`` collection is
    visited; its citation count is added to the bucket of its year when the
    year lies in ``[start, end)``.

    Args:
        start: first year (inclusive) shown on the chart.
        end: last year (exclusive) shown on the chart.

    The chart is saved as ``pub_per_year`` (PNG) and then shown.
    """
    if not os.path.exists(chart_path):
        os.makedirs(chart_path)

    span = end - start
    citation_per_year = [0] * span
    year_labels = [str(year) for year in range(start, end)]

    # `copy` is expected to be a collection object defined elsewhere in this
    # file (it is queried with find(), so it is not the stdlib module).
    for doc in copy.find({}, {'pubs': 1}):
        for pub in doc['pubs']:
            # Skip empty *and* otherwise non-numeric years; int() used to
            # raise ValueError on anything but '' or a number.
            year_text = str(pub['year']).strip()
            if not year_text.isdigit():
                continue
            index = int(year_text) - start
            if not 0 <= index < span:
                continue
            citation = pub['citation']
            if citation.isdigit():
                citation_per_year[index] += int(citation)

    chart = Chart(50, 50)
    # The plotted series is the citation total, so the title says so (it
    # used to read 'publication number per year').  The output file name is
    # left unchanged so existing consumers keep working.
    chart.bar(citation_per_year, span, year_labels,
              'citation number per year', 'year', 'number',
              log=False, fontsize=10)
    chart.save('pub_per_year', format='png')
    chart.show()
def gender_ratio(top_k):
    """Chart the male/female researcher ratio of the ``top_k`` largest labels.

    Labels are ranked by the number of gendered researchers carrying them.
    Two EPS charts are written under ``charts_path``: one in popularity
    order (``gender_ratio_top_<top_k>.eps``) and one sorted by descending
    ratio (``ranked_gender_ratio_top_<top_k>``).

    Args:
        top_k: number of most frequent labels to consider.
    """
    pipeline = [
        {'$match': {'gender': {'$exists': 1}}},
        {'$unwind': '$labels'},
        {'$group': {'_id': {'label': '$labels'}, 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ]
    top_labels = [
        doc['_id']['label'] for doc in researcher.aggregate(pipeline)
    ][:top_k]

    label_ratios = []
    for discipline in top_labels:
        male_size = researcher.count({'gender': 'M', 'labels': discipline})
        female_size = researcher.count({'gender': 'F', 'labels': discipline})
        if female_size == 0:
            # male/female is undefined here; dividing used to abort the whole
            # run with ZeroDivisionError.
            print('skipping {0}: no female researchers, ratio undefined'
                  .format(discipline))
            continue
        label_ratios.append((discipline, male_size / female_size))

    # Pass the real number of bars: it can be smaller than top_k when labels
    # were skipped or fewer than top_k labels exist.  (Assumes the second
    # argument of Chart.bar is the bar count, which matches every call site
    # in this file.)
    bar_count = len(label_ratios)

    labels = [label for label, _ in label_ratios]
    ratios = [ratio for _, ratio in label_ratios]
    chart = Chart(top_k / 3, top_k / 2)
    chart.bar(ratios, bar_count, labels,
              'Gender Ratio in Top %s Disciplines' % top_k,
              'discipline', 'ratio', True, log=False, fontsize=top_k / 2)
    chart.save(charts_path + '/gender_ratio_top_{0}.eps'.format(top_k))
    chart.clear()
    chart.close()

    ranked = sorted(label_ratios, key=lambda pair: pair[1], reverse=True)
    labels = [label for label, _ in ranked]
    ratios = [ratio for _, ratio in ranked]
    chart = Chart(100, 180)
    chart.bar(ratios, bar_count, labels,
              'Ranked Gender Ratio in Top %s Disciplines' % top_k,
              'discipline', 'ratio', True, log=False, fontsize=120)
    chart.save(charts_path + '/ranked_gender_ratio_top_{0}'.format(top_k),
               format='eps')
    chart.clear()
    chart.close()
def pubs_distribution(discipline='all researchers'):
    """Chart per-researcher publication counts, male vs. female.

    Writes a histogram and an unnormalized CDF (both PNG) into
    ``charts_path/<discipline>``, creating that directory when missing.

    Args:
        discipline: label to restrict the researchers to.  The default
            ``'all researchers'`` disables the label filter.
    """
    out_dir = charts_path + '/' + discipline
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    def pubs_counts(gender):
        """Return the ascending publication counts for one gender."""
        match = {'gender': gender}
        if discipline != 'all researchers':
            match['labels'] = discipline
        cursor = researcher.aggregate([
            {'$match': match},
            {'$project': {'_id': '$_id', 'pubs_count': {'$size': '$pubs'}}},
            {'$sort': {'pubs_count': 1}},
        ])
        return [doc['pubs_count'] for doc in cursor]

    male = pubs_counts('M')
    female = pubs_counts('F')

    chart = Chart()
    chart.histogram(male, female,
                    "publication distribution of {0} ".format(discipline),
                    "personal publication number", "research count", 15)
    chart.save(out_dir + '/publication_histogram', format='png')
    chart.clear()
    # Close this chart before creating the next one; it used to be dropped
    # without being closed.
    chart.close()

    chart = Chart(15, 10)
    chart.unnormalized_CDF(
        male, female,
        "cumulative publication number of {0}".format(discipline),
        "personal publication number", "cumulative publication", 15)
    chart.save(out_dir + '/unnormalized_publication_cdf', format='png')
    chart.clear()
    chart.close()
def discipline_proportion(top_k):
    """Chart the cumulative share of the ``top_k`` most frequent labels.

    Each label's share is its researcher count divided by the number of
    documents that have a ``gender`` field; the chart shows the running sum
    of those shares in descending-count order and is saved as
    ``chart_path + '/cumulative_<top_k>'`` (EPS).

    Args:
        top_k: number of most frequent labels to include.
    """
    pipeline = [
        {'$match': {'gender': {'$exists': 1}}},
        {'$unwind': '$labels'},
        {'$group': {'_id': {'label': '$labels'}, 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ]
    top_docs = list(table.aggregate(pipeline))[:top_k]
    total = table.count({'gender': {'$exists': 1}})

    labels = [doc['_id']['label'] for doc in top_docs]

    # Running sum of the per-label proportions.
    cumulative_arr = []
    running = 0
    for doc in top_docs:
        running += doc['count'] / total
        cumulative_arr.append(running)

    chart = Chart(100, 150)
    chart.bar(cumulative_arr, top_k, labels,
              'Cumulative propotion of most popular disciplines',
              'discipline', 'propotion', True, log=False, fontsize=100)
    chart.save(chart_path + '/cumulative_{0}'.format(top_k), format='eps')
    chart.clear()
    # Release the figure like the other chart functions in this file do.
    chart.close()
def citation_and_index_distribution(discipline='all researchers'):
    """Chart total citations and h-index per researcher, male vs. female.

    For every researcher the digit-only ``citation`` values of their
    publications are summed, and the profile's ``index`` field is read as
    the h-index.  Four charts are saved into ``charts_path/<discipline>``:
    unnormalized CDFs and histograms of both quantities.

    Args:
        discipline: label to restrict the researchers to.  The default
            ``'all researchers'`` disables the label filter.
    """
    import os

    out_dir = charts_path + '/{0}'.format(discipline)
    # Saving fails if the per-discipline directory does not exist yet.
    os.makedirs(out_dir, exist_ok=True)

    query = {'gender': {'$exists': 1}}
    if discipline != 'all researchers':
        query['labels'] = discipline
    docs = researcher.find(query, {'gender': 1, 'pubs': 1, 'index': 1})

    citations = {'M': [], 'F': []}
    indexes = {'M': [], 'F': []}
    pubs_without_citation = 0
    profiles_without_index = 0
    profiles_other_gender = 0

    for doc in docs:
        gender = doc['gender']
        if gender not in citations:
            # Anything that was not 'M' used to be counted as female.
            profiles_other_gender += 1
            continue

        citation_sum = 0
        for pub in doc['pubs']:
            if pub['citation'].isdigit():
                citation_sum += int(pub['citation'])
            else:
                pubs_without_citation += 1
        citations[gender].append(citation_sum)

        # A missing or non-numeric index used to abort the whole run.
        try:
            indexes[gender].append(int(doc['index']))
        except (KeyError, TypeError, ValueError):
            profiles_without_index += 1

    if pubs_without_citation or profiles_without_index or profiles_other_gender:
        print('skipped: {0} pubs without citation, {1} profiles without '
              'index, {2} profiles with other gender'.format(
                  pubs_without_citation, profiles_without_index,
                  profiles_other_gender))

    male_citation, female_citation = citations['M'], citations['F']
    male_index, female_index = indexes['M'], indexes['F']

    chart = Chart()
    chart.unnormalized_CDF(
        male_citation, female_citation,
        "cumulative citation number of {0}".format(discipline),
        "personal citation number", "cumulative number")
    chart.save(out_dir + '/unormalized_citation_cdf')
    chart.clear()
    # Close this chart before creating the next one; it used to be dropped
    # without being closed.
    chart.close()

    chart = Chart(15, 10)
    chart.unnormalized_CDF(
        male_index, female_index,
        "cumulative h-index number of {0}".format(discipline),
        "personal h-index", "cumulative number")
    chart.save(out_dir + '/unnormalized_h-index_cdf')
    chart.clear()

    chart.histogram(
        male_citation, female_citation,
        "citation distribution of {0}".format(discipline),
        "personal citation number", "number of researcher")
    chart.save(out_dir + '/citation_histogram')
    chart.clear()

    chart.histogram(
        male_index, female_index,
        "h-index distribution of {0}".format(discipline),
        "personal h-index", "number of researcher")
    chart.save(out_dir + '/h-index_histogram')
    chart.clear()
    chart.close()