コード例 #1
0
def _h_index_up_to_year(pubs, year):
    """Return the h-index of ``pubs`` using only papers published by ``year``.

    The h-index is the largest ``h`` such that ``h`` papers have at least
    ``h`` citations each.

    Args:
        pubs: iterable of dicts with string fields ``'year'`` and
            ``'citation'``; entries whose year or citation is not a plain
            digit string are ignored.
        year: last publication year (inclusive) to take into account.

    Returns:
        The h-index as an int (0 when no publication qualifies).
    """
    citations = sorted(
        (int(pub['citation']) for pub in pubs
         if pub['year'].isdigit() and pub['citation'].isdigit()
         and int(pub['year']) <= year),
        reverse=True)
    h_index = 0
    # Ranks are 1-based: the paper at rank r counts only if it has >= r
    # citations. The list is sorted, so the first failure ends the scan.
    for rank, citation in enumerate(citations, start=1):
        if citation < rank:
            break
        h_index = rank
    return h_index


def h_index_over_time():
    """Chart the summed h-index of all gendered researchers for 1980-2017.

    For every year in ``range(1980, 2018)`` the h-index of each researcher
    document that has a ``gender`` field is computed from the publications
    dated up to that year, and the per-researcher values are added together.
    Each yearly total is printed, then all totals are drawn with
    ``Chart.bar`` and saved to ``charts_path + '/h_index_over_time.eps'``.
    """
    start = 1980
    end = 2018
    years = range(start, end)

    # The query does not depend on the year, so run it once and reuse the
    # publication lists instead of hitting the database on every iteration.
    pubs_per_researcher = [
        doc['pubs']
        for doc in researcher.find({'gender': {'$exists': 1}}, {'pubs': 1})
    ]

    h_index_arr = []
    for year in years:
        yearly_total = sum(
            _h_index_up_to_year(pubs, year) for pubs in pubs_per_researcher)
        print(yearly_total)
        h_index_arr.append(yearly_total)

    chart = Chart(30, 30)
    labels = [str(year) for year in years]
    chart.bar(h_index_arr, end - start, labels, 'h-index change over time',
              'time', 'cumulative h_index', True, False)
    chart.save(charts_path + '/h_index_over_time.eps')
    chart.clear()
    chart.close()
コード例 #2
0
def gender_favorite(top_k, sex='M'):
    """Plot the ``top_k`` most frequent research labels for one gender.

    Researchers whose ``gender`` equals ``sex`` are selected, their
    ``labels`` arrays are unwound, and the labels are ranked by how many
    times they occur. The leading ``top_k`` counts are drawn with
    ``Chart.bar`` and saved as EPS under ``chart_path``.

    Args:
        top_k: number of top-ranked labels to plot.
        sex: value of the ``gender`` field to match (``'M'`` or ``'F'``).
    """
    pipeline = [
        {'$match': {'gender': sex}},
        {'$unwind': '$labels'},
        {'$group': {'_id': {'label': '$labels'}, 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ]
    top_docs = list(table.aggregate(pipeline))[:top_k]
    count_arr = [doc['count'] for doc in top_docs]
    labels = [doc['_id']['label'] for doc in top_docs]

    # The title used to say "females'" regardless of `sex`; name the group
    # that was actually queried (unknown values fall back to the raw value).
    group_name = {'M': 'males', 'F': 'females'}.get(sex, sex)

    chart = Chart(100, 180)
    chart.bar(count_arr,
              # Fewer than top_k labels may exist; keep the bar count in
              # step with the data that is really plotted.
              len(count_arr),
              labels,
              "The Top {0} {1}' favorite disciplines".format(
                  top_k, group_name),
              'discipline',
              'researcher number',
              True,
              log=False,
              fontsize=120)
    chart.save(chart_path + '/{1}_favorite_{0}'.format(top_k, sex),
               format='eps')
    chart.clear()
コード例 #3
0
def declare_prefrence_distribution(discipline='all researchers'):
	"""Draw one pie chart per gender: declared vs. undeclared research domain.

	A researcher counts as "declared" when the ``labels.0`` field exists,
	and as "not declared" otherwise. The charts are saved as
	``female_declare_preference`` and ``male_declare_preference`` inside
	``charts_path/<discipline>``.

	Args:
		discipline: only used to pick the output sub-directory; the counts
			themselves are not filtered by it.
	"""
	import os

	# Create the target directory up front so saving cannot fail on a
	# missing folder (pubs_distribution does the same before it saves).
	output_dir = charts_path + '/{0}'.format(discipline)
	os.makedirs(output_dir, exist_ok=True)

	# (gender value, file-name prefix, chart title)
	groups = (
		('F', 'female', 'Females\' preference in declaration of research domain'),
		('M', 'male', 'Males\' preference in declaration of research domain'),
	)
	for gender, file_prefix, title in groups:
		# NOTE(review): Collection.count() was removed in PyMongo 4; switch
		# to count_documents() if the driver is upgraded.
		declared = researcher.count({'gender': gender, 'labels.0': {'$exists': 1}})
		not_declared = researcher.count({'gender': gender, 'labels.0': {'$exists': 0}})

		chart = Chart()
		chart.pie([not_declared, declared], title, ['not declared', 'declared'])
		chart.save(output_dir + '/{0}_declare_preference'.format(file_prefix))
		chart.clear()
		chart.close()
コード例 #4
0
def _tally_pubs_by_year(docs, start, end):
    """Count publications and sum citations per year over ``[start, end)``.

    Args:
        docs: iterable of dicts, each with a ``'pubs'`` list whose items
            carry ``'year'`` and ``'citation'`` fields.
        start: first year of the window (inclusive).
        end: last year of the window (exclusive).

    Returns:
        ``(pubs_per_year, citation_per_year)``: two lists of length
        ``end - start``, indexed by ``year - start``.
    """
    span = end - start
    pubs_per_year = [0] * span
    citation_per_year = [0] * span
    for doc in docs:
        for pub in doc['pubs']:
            try:
                year = int(pub['year'])
            except (TypeError, ValueError):
                # Empty or otherwise non-numeric year: nothing to bucket.
                continue
            index = year - start
            if not 0 <= index < span:
                continue
            pubs_per_year[index] += 1
            if pub['citation'].isdigit():
                citation_per_year[index] += int(pub['citation'])
    return pubs_per_year, citation_per_year


def year_distribution(start, end):
    """Chart the per-year citation totals for years in ``[start, end)``.

    Reads the ``pubs`` field of every document in the ``copy`` collection,
    tallies publications and citations per year, draws the citation totals
    with ``Chart.bar``, saves the figure as ``pub_per_year`` (PNG) and shows
    it. ``chart_path`` is created if it does not exist yet.

    Args:
        start: first year to include.
        end: year at which to stop (exclusive).
    """
    os.makedirs(chart_path, exist_ok=True)
    year_labels = [str(year) for year in range(start, end)]
    # pubs_per_year is kept available for plotting the publication counts
    # instead of the citation totals.
    pubs_per_year, citation_per_year = _tally_pubs_by_year(
        copy.find({}, {'pubs': 1}), start, end)

    chart = Chart(50, 50)
    # NOTE(review): the citation totals are plotted under a title and file
    # name that refer to publication counts - confirm which series is
    # intended.
    chart.bar(citation_per_year,
              end - start,
              year_labels,
              'publication number per year',
              'year',
              'number',
              log=False,
              fontsize=10)
    chart.save('pub_per_year', format='png')
    chart.show()
コード例 #5
0
def gender_ratio(top_k):
	"""Chart the male/female researcher ratio of the most common disciplines.

	The ``top_k`` most frequent labels among researchers that have a
	``gender`` field are looked up, and for each one the number of 'M'
	researchers is divided by the number of 'F' researchers. Two EPS bar
	charts are written under ``charts_path``: one in popularity order and
	one sorted by descending ratio.

	Args:
		top_k: number of top-ranked disciplines to consider.
	"""
	ranked_labels = researcher.aggregate([
		{'$match':{'gender':{'$exists':1}}},
		{'$unwind':'$labels'},
		{'$group':{
		'_id':{'label':'$labels'},
		'count':{'$sum':1}
		}},
		{'$sort':{'count':-1}}])
	top_n = [x['_id']['label'] for x in ranked_labels][:top_k]

	label_ratios = []
	for discipline in top_n:
		male_size = researcher.count({'gender':'M','labels':discipline})
		female_size = researcher.count({'gender':'F','labels':discipline})
		if female_size == 0:
			# male/female is undefined here; dividing would raise
			# ZeroDivisionError and abort the whole run.
			print('skipping {0}: no female researchers'.format(discipline))
			continue
		label_ratios.append((discipline, male_size/female_size))

	labels = [label for label, _ in label_ratios]
	ratios = [ratio for _, ratio in label_ratios]

	# The bar count follows the data: a discipline may have been skipped
	# above, and fewer than top_k labels may exist.
	chart = Chart(top_k/3,top_k/2)
	chart.bar(ratios,len(ratios),labels,'Gender Ratio in Top %s Disciplines'%top_k,'discipline','ratio',True,log=False,fontsize=top_k/2)
	chart.save(charts_path+'/gender_ratio_top_{0}.eps'.format(top_k))
	chart.clear()
	chart.close()

	# Second chart: same data, highest ratio first.
	ranked = sorted(label_ratios, key=lambda x: x[1], reverse=True)
	labels = [label for label, _ in ranked]
	ratios = [ratio for _, ratio in ranked]

	chart = Chart(100,180)
	chart.bar(ratios,len(ratios),labels,'Ranked Gender Ratio in Top %s Disciplines'%top_k,'discipline','ratio',True,log=False,fontsize=120)
	chart.save(charts_path+'/ranked_gender_ratio_top_{0}'.format(top_k),format='eps')
	chart.clear()
	chart.close()
コード例 #6
0
def pubs_distribution(discipline='all researchers'):
	"""Chart how many publications male and female researchers have.

	Creates ``charts_path/<discipline>`` when it is missing, collects the
	size of each researcher's ``pubs`` array per gender (restricted to
	``discipline`` unless it is ``'all researchers'``), and saves a
	histogram and an unnormalized CDF as PNG files in that directory.

	Args:
		discipline: label to filter on, or ``'all researchers'`` for no
			label filter.
	"""
	output_dir = charts_path+'/'+discipline
	if not os.path.exists(output_dir):
		os.mkdir(output_dir)

	def pubs_count_cursor(gender):
		# Same pipeline for both genders; only the $match stage varies.
		match_stage = {'gender':gender}
		if discipline != 'all researchers':
			match_stage['labels'] = discipline
		return researcher.aggregate([
			{'$match':match_stage},
			{'$project':{'_id':'$_id','pubs_count':{'$size':'$pubs'}}},
			{'$sort':{'pubs_count':1}},
		])

	male_cursor = pubs_count_cursor('M')
	female_cursor = pubs_count_cursor('F')
	male = [row['pubs_count'] for row in male_cursor]
	female = [row['pubs_count'] for row in female_cursor]

	histogram_title = "publication distribution of {0} ".format(discipline)
	chart = Chart()
	chart.histogram(male,female,histogram_title,"personal publication number","research count",15)
	chart.save(output_dir+'/publication_histogram',format='png')
	chart.clear()

	cdf_title = "cumulative publication number of {0}".format(discipline)
	chart = Chart(15,10)
	chart.unnormalized_CDF(male,female,cdf_title,"personal publication number","cumulative publication",15)
	chart.save(output_dir+'/unnormalized_publication_cdf',format='png')
	chart.clear()

	chart.close()
コード例 #7
0
def discipline_proportion(top_k):
    """Chart the cumulative share of researchers in the top disciplines.

    Labels of all documents that have a ``gender`` field are ranked by
    frequency. Each of the ``top_k`` leading counts is divided by the number
    of gendered documents, the shares are accumulated in rank order, and the
    running totals are drawn with ``Chart.bar`` and saved as EPS under
    ``chart_path``.

    Args:
        top_k: number of top-ranked labels to include.
    """
    pipeline = [
        {'$match': {'gender': {'$exists': 1}}},
        {'$unwind': '$labels'},
        {'$group': {'_id': {'label': '$labels'}, 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ]
    ranked = list(table.aggregate(pipeline))
    total = table.count({'gender': {'$exists': 1}})
    leading = ranked[:top_k]

    # Running sum of each label's share of all gendered documents.
    cumulative_arr = []
    running = 0
    for entry in leading:
        running += entry['count'] / total
        cumulative_arr.append(running)

    labels = [entry['_id']['label'] for entry in leading]

    chart = Chart(100, 150)
    chart.bar(
        cumulative_arr,
        top_k,
        labels,
        'Cumulative propotion of most popular disciplines',
        'discipline',
        'propotion',
        True,
        log=False,
        fontsize=100,
    )
    output_file = chart_path + '/cumulative_{0}'.format(top_k)
    chart.save(output_file, format='eps')
    chart.clear()
コード例 #8
0
def _total_citations(pubs):
	"""Add up the ``'citation'`` fields of ``pubs`` that are digit strings."""
	return sum(int(pub['citation']) for pub in pubs if pub['citation'].isdigit())


def citation_and_index_distribution(discipline='all researchers'):
	"""Chart citation totals and h-index values of male vs. female researchers.

	For every researcher with a ``gender`` field (restricted to
	``discipline`` unless it is ``'all researchers'``) the citations of all
	publications are summed and the stored ``index`` value is read as an
	int. Four figures are saved under ``charts_path/<discipline>``: an
	unnormalized CDF and a histogram for the citation totals, and the same
	pair for the index values.

	Args:
		discipline: label to filter on, or ``'all researchers'`` for no
			label filter.
	"""
	query = {'gender':{'$exists':1}}
	if discipline != 'all researchers':
		query['labels'] = discipline
	docs = researcher.find(query,{'gender':1,'pubs':1,'index':1})

	citations = {'M': [], 'F': []}
	indexes = {'M': [], 'F': []}
	for doc in docs:
		gender = doc['gender']
		if gender not in citations:
			# Only 'M' and 'F' are charted. Any other stored value used to
			# be counted as female, which skewed the female series.
			continue
		citations[gender].append(_total_citations(doc['pubs']))
		indexes[gender].append(int(doc['index']))

	male_citation, female_citation = citations['M'], citations['F']
	male_index, female_index = indexes['M'], indexes['F']

	chart = Chart()
	chart.unnormalized_CDF(male_citation,female_citation,"cumulative citation number of {0}".format(discipline),"personal citation number","cumulative number")
	chart.save(charts_path+'/{0}/unormalized_citation_cdf'.format(discipline))
	chart.clear()

	chart = Chart(15,10)
	chart.unnormalized_CDF(male_index,female_index,"cumulative h-index number of {0}".format(discipline),"personal h-index","cumulative number")
	chart.save(charts_path+'/{0}/unnormalized_h-index_cdf'.format(discipline))
	chart.clear()

	chart.histogram(male_citation,female_citation,"citation distribution of {0}".format(discipline),"personal citation number","number of researcher")
	chart.save(charts_path+'/{0}/citation_histogram'.format(discipline))
	chart.clear()

	chart.histogram(male_index,female_index,"h-index distribution of {0}".format(discipline),"personal h-index","number of researcher")
	chart.save(charts_path+'/{0}/h-index_histogram'.format(discipline))
	chart.clear()

	chart.close()