Example #1
def analyze_indicies():
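	# Sweep vocab_use_pct from 0.0 to 1.0 and, for each setting, record the average
	# absolute difference and the average estimated/true ratio of the h- and e-indices;
	# results are cached to validation/indicies_accuracy.py for plot_indicies().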
	X = [float(x)/10. for x in range(11)]
	Yh_delta = []
	Ye_delta = []
	Yh_ratio = []
	Ye_ratio = []
	
	for i,x in enumerate(X):
		print i,x
		R = validate(BEST_FILTERS,vocab_use_pct=x)
		his,true_his = R[3]
		eis,true_eis = R[4]

		Yh_delta.append(sum([float(abs(e-t)) for e,t in zip(his,true_his)]) / float(len(his)))
		Ye_delta.append(sum([float(abs(e-t)) for e,t in zip(eis,true_eis)]) / float(len(eis)))
		
		Yh_ratio.append(sum([float(e)/float(t) if t > 0 else 0 for e,t in zip(his,true_his)]) / float(len(his)))
		Ye_ratio.append(sum([float(e)/float(t) if t > 0 else 0 for e,t in zip(eis,true_eis)]) / float(len(eis)))

	fh = rb().open_data(['validation','indicies_accuracy.py'],'w')
	fh.write('X = %s\n' % str(X))
	fh.write('Yh_delta = %s\n' % str(Yh_delta))
	fh.write('Ye_delta = %s\n' % str(Ye_delta))
	fh.write('Yh_ratio = %s\n' % str(Yh_ratio))
	fh.write('Ye_ratio = %s\n' % str(Ye_ratio))
	fh.close()
Example #2
def plot_indicies():
	G = {}
	execfile(rb().data_path('validation','indicies_accuracy.py'),G)
	X = G['X']
	Yh_delta = G['Yh_delta']
	Ye_delta = G['Ye_delta']
	Yh_ratio = G['Yh_ratio']
	Ye_ratio = G['Ye_ratio']

	pl.figure()
	pl.plot(X,Yh_delta,'-ok',linewidth=2)
	pl.plot(X,Ye_delta,'--sk',linewidth=2)
	pl.grid(True)
	pl.title('Estimated-True Citation Indices Difference')
	pl.ylabel('Avg difference between estimated and true citation indices')
	pl.xlabel('% of vocabulary used',fontsize=14)
	pl.legend(['h-index','e-index'])
	
	pl.figure()
	pl.plot(X,Yh_ratio,'-ok',linewidth=2)
	pl.plot(X,Ye_ratio,'--sk',linewidth=2)
	bounds = pl.axis()
	#pl.axis([0.1,bounds[1],1,bounds[3]])
	pl.grid(True)
	pl.title('Citation Indices Fraction')
	pl.ylabel('Avg ratio of estimated/true citation indices')
	pl.xlabel('% of vocabulary used',fontsize=14)
	pl.legend(['h-index','e-index'],'lower right')
	
	pl.show()
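Examples #1 and #2 are meant to be run as a pair: analyze_indicies() caches its sweep to validation/indicies_accuracy.py, and plot_indicies() loads that file back with execfile. A minimal driver sketch, assuming both functions live in the same module (the __main__ guard is an assumption, not part of the original code):

if __name__ == '__main__':
	# hypothetical driver: the slow sweep runs once, the cached file makes re-plotting cheap
	analyze_indicies()   # calls validate() once per vocab_use_pct value and writes the cache file
	plot_indicies()      # reads the cache back and draws the difference and ratio figures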
Example #3
	def replace_fxn(name,url,dfile,top_pct):
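		# Build replacement page text from the true publication list in dfile:
		# rows are sorted by year (most recent first) and the top_pct fraction of
		# most recent publications is dropped before re-joining into text.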
		content = [(x[0],int(x[1])) for x in csv.reader(open(rb().data_path('validation',dfile),'r'),delimiter='\t')]
		content.sort(cmp=lambda x,y: -cmp(x[1],y[1]))
		top_num = int(math.floor(top_pct * float(len(content))))
		content = content[top_num:]
		
		return '\n'.join(['%s %d' % (x[0],x[1]) for x in content])
Example #4
def analyze_filter_correctness():
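	# Run validate() on each filter combination and dump the raw results to
	# validation/filter_correctness.py for plot_filter_correctness().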
	filter_combos = [ [],[YEAR_FILTER,DUPLICATE_FILTER],[NAME_FILTER,DUPLICATE_FILTER],[VOCAB_FILTER,DUPLICATE_FILTER],[YEAR_FILTER,VOCAB_FILTER,NAME_FILTER,DUPLICATE_FILTER] ]
	fh = rb().open_data(['validation','filter_correctness.py'],'w')
	fh.write('result = [\n')
	for fc in filter_combos:
		print '=== Computing %s ===' % str(fc)
		R = validate(fc)
		fh.write('[%s,%s],\n' % (str(tuple(fc)),str(R)))
	fh.write(']')
	fh.close()
Example #5
def analyze_filter_indicies():
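	# Same sweep as analyze_filter_correctness, but the cached file
	# (validation/filter_indicies.py) is read by plot_filter_indicies() to compare
	# estimated against true citation indices per filter combination.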
	filter_combos = [ [],[YEAR_FILTER,DUPLICATE_FILTER],[NAME_FILTER,DUPLICATE_FILTER],[VOCAB_FILTER,DUPLICATE_FILTER],[YEAR_FILTER,VOCAB_FILTER,NAME_FILTER,DUPLICATE_FILTER] ]
	fh = rb().open_data(['validation','filter_indicies.py'],'w')
	fh.write('result = [\n')
	for fc in filter_combos:
		print '=== Computing %s ===' % str(fc)
		R = validate(fc)
		# his,true_his = R[3]
		# eis,true_eis = R[4]
		# 
		# h_ratio = sum([float(y)/float(x) if x > 0 else 0 for x,y in zip(his,true_his)]) / float(len(his))
		# e_ratio = sum([float(y)/float(x) if x > 0 else 0 for x,y in zip(eis,true_eis)]) / float(len(eis))
		fh.write('[%s,%s],\n' % (str(tuple(fc)),str(R)))
	fh.write(']')
	fh.close()
Example #6
def plot_min_vocab_match_size():
	G = {}
	execfile(rb().data_path('validation','min_vocab_match_size.py'),G)
	X = G['X']
	Ysense = G['Ysense']
	Yspec = G['Yspec']

	pl.figure()
	pl.plot(X,Ysense,'-ok',linewidth=2)
	pl.plot(X,Yspec,'--sk',linewidth=2)
	pl.grid(True)
	pl.title('Effect of Min Vocab Match Size')
	pl.xlabel('Min # of vocab-pub matches',fontsize=14)
	pl.legend(['Sensitivity','Specificity'],'lower right')
	pl.show()
Example #7
def plot_pubs_coverage():
	G = {}
	execfile(rb().data_path('validation','pubs_coverage.py'),G)
	X = G['X']
	Ysense = G['Ysense']
	Yspec = G['Yspec']
	
	pl.figure()
	pl.plot(X,Ysense,'-ok',linewidth=2)
	pl.plot(X,Yspec,'--sk',linewidth=2)
	pl.grid(True)
	pl.title('Effect of Vocabulary Size')
	pl.xlabel('% of vocabulary used',fontsize=14)
	pl.legend(['Sensitivity','Specificity'],'lower right')
	pl.show()
Example #8
def analyze_min_vocab_match_size():
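	# Sweep min_vocab_match_size from 0 to 10 and record the average sensitivity
	# and specificity at each setting; cached to validation/min_vocab_match_size.py.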
	X = range(11)
	Ysense = []
	Yspec = []

	for i,x in enumerate(X):
		print i,x
		R = validate(BEST_FILTERS,min_vocab_match_size=x)
		Ysense.append(R[0][0])
		Yspec.append(R[1][0])

	fh = rb().open_data(['validation','min_vocab_match_size.py'],'w')
	fh.write('X = %s\n' % str(X))
	fh.write('Ysense = %s\n' % str(Ysense))
	fh.write('Yspec = %s\n' % str(Yspec))
	fh.close()
Example #9
def analyze_pubs_coverage():
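	# Sweep vocab_use_pct from 0.0 to 1.0 and record the average sensitivity and
	# specificity at each setting; cached to validation/pubs_coverage.py.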
	X = [float(x)/10. for x in range(11)]
	Ysense = []
	Yspec = []
	
	for i,x in enumerate(X):
		print i,x
		R = validate(BEST_FILTERS,vocab_use_pct=x)
		Ysense.append(R[0][0])
		Yspec.append(R[1][0])
		
	fh = rb().open_data(['validation','pubs_coverage.py'],'w')
	fh.write('X = %s\n' % str(X))
	fh.write('Ysense = %s\n' % str(Ysense))
	fh.write('Yspec = %s\n' % str(Yspec))
	fh.close()
Example #10
def plot_true_pub_vocabulary():
	G = {}
	execfile(rb().data_path('validation','filter_true_pub_vocab.py'),G)
	R = G['result']
	X = [x[0] for x in R]
	Ysense = [x[1][0][0] for x in R]
	Yspec = [x[1][1][0] for x in R]

	pl.figure()
	pl.plot(X,Ysense,'-ok',linewidth=2)
	pl.plot(X,Yspec,'--sk',linewidth=2)
	pl.grid(True)
	pl.title('Effect of Recent Publication Loss on Accuracy')
	pl.xlabel('% most recent publications removed',fontsize=14)
	pl.legend(['Sensitivity','Specificity'],'lower right')
	pl.show()
Example #11
def analyze_true_pub_vocabulary():
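	# Re-run validate() with each individual's page replaced by the true publication
	# list minus its most recent top_pct fraction, to measure how missing recent
	# publications affects accuracy; results go to validation/filter_true_pub_vocab.py.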
	def replace_fxn(name,url,dfile,top_pct):
		content = [(x[0],int(x[1])) for x in csv.reader(open(rb().data_path('validation',dfile),'r'),delimiter='\t')]
		content.sort(cmp=lambda x,y: -cmp(x[1],y[1]))
		top_num = int(math.floor(top_pct * float(len(content))))
		content = content[top_num:]
		
		return '\n'.join(['%s %d' % (x[0],x[1]) for x in content])
		
		
	fh = rb().open_data(['validation','filter_true_pub_vocab.py'],'w')
	fh.write('result = [\n')
	X = [float(x)/10. for x in range(10)]
	for x in X:
		print '=== Computing %s ===' % str(x)
		R = validate(replace_url_fxn=lambda name,url,dfile: replace_fxn(name,url,dfile,x))
		fh.write('[%s,%s],\n' % (str(x),str(R)))
	fh.write(']')
	fh.close()	
Example #12
def plot_filter_indicies():
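	# For each cached filter combination, plot the average relative error
	# |true - estimated| / true of the h- and e-indices as a bar chart.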
	G = {}
	execfile(rb().data_path('validation','filter_indicies.py'),G)
	result = G['result']

	xticks = [''] + [''.join([y[0] for y in x[0]]) for x in result] + ['']

	h_ratios = []
	e_ratios = []
	
	for fc,R in result:
		his,true_his = R[3]
		eis,true_eis = R[4]
	
		h_ratios.append( sum([float(abs(y-x))/float(y) if y > 0 else 0 for x,y in zip(his,true_his)]) / float(len(his)) )
		e_ratios.append( sum([float(abs(y-x))/float(y) if y > 0 else 0 for x,y in zip(eis,true_eis)]) / float(len(eis)) )

	# plot the sensitivity
	pl.figure()
	pl.grid(True)	
	#pl.bar(range(len(result)),[x[1][0] for x in result],ecolor='k',align='center') #,yerr=[x[1][0][1] for x in result])
	pl.bar(range(len(result)),h_ratios,ecolor='k',align='center') #,yerr=[x[1][0][1] for x in result])
	#pl.axis([-1,len(xticks)-1,0.70,1.0])
	pl.xticks(range(-1,len(xticks)),xticks)
	pl.title('H-index')
	pl.ylabel('% difference between estimated and true index',fontsize=14)

	# plot the specificity
	pl.figure()
	pl.grid(True)
	#pl.bar(range(len(result)),[x[1][1] for x in result],ecolor='k',align='center') #,yerr=[x[1][1][1] for x in result])
	pl.bar(range(len(result)),e_ratios,ecolor='k',align='center') #,yerr=[x[1][1][1] for x in result])
	#pl.axis([-1,len(xticks)-1,0.70,1.0])
	pl.xticks(range(-1,len(xticks)),xticks)
	pl.title('E-index')
	pl.ylabel('% difference between estimated and true index',fontsize=14)

	pl.show()
Example #13
def plot_filter_correctness():
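	# Bar charts of average sensitivity and specificity (with error bars) for each
	# filter combination cached by analyze_filter_correctness().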
	G = {}
	execfile(rb().data_path('validation','filter_correctness.py'),G)
	result = G['result']
	
	xticks = [''] + [''.join([y[0] for y in x[0]]) for x in result] + ['']
	
	# plot the sensitivity
	pl.figure()
	pl.grid(True)	
	pl.bar(range(len(result)),[x[1][0][0] for x in result],yerr=[x[1][0][1] for x in result],ecolor='k',align='center')
	pl.axis([-1,len(xticks)-1,0.95,1.0])
	pl.xticks(range(-1,len(xticks)),xticks)
	pl.title('Sensitivity')
	
	# plot the specificity
	pl.figure()
	pl.grid(True)
	pl.bar(range(len(result)),[x[1][1][0] for x in result],yerr=[x[1][1][1] for x in result],ecolor='k',align='center')
	pl.xticks(range(-1,len(xticks)),xticks)
	pl.title('Specificity')
	
	pl.show()
Example #14
def compute_test_case_stats(name,url,dfile,filters=BEST_FILTERS,min_vocab_match_size=2,vocab_use_pct=1.0):
	"""
	Results:
		# of total pubs found,# of found pubs,# true pubs,TP: # of matching pubs,FP, FN: # of unaccepted matching pubs
	"""
	use_initials = False
	if name.startswith('^'):
		use_initials = True
		name = name[1:]
		
	# load the true pubs
	true_pubs = []
	reader = csv.reader(open(rb().data_path('validation',dfile),'r'),delimiter='\t')
	for data in reader:
		if len(data) != 2:
			print 'ERROR:',data
		true_pubs.append(tuple([data[0],int(data[1])]))
	
	tps1995 = filter(lambda x: x[1] >= 1995,true_pubs)
	
	# get the publications for the individual
	all_pubs = obtain_individual_pubs(name,use_initials)
	pubs = list(all_pubs)
	duplicates = []
	
	# apply filters
	if YEAR_FILTER in filters:	
		pubs = year_filter(1950,pubs)
	if VOCAB_FILTER in filters:
		pubs = vocab_filter(url,pubs,min_vocab_match_size,vocab_use_pct)
	if NAME_FILTER in filters:
		pubs = name_filter(name,pubs)
	if CONFLICT_FILTER in filters:
		pubs = name_conflict_filter(name,pubs)
	if DUPLICATE_FILTER in filters:
		pubs,duplicates = duplicate_filter(pubs)
	
	rejected_pubs = filter(lambda x: x not in pubs and x not in duplicates,all_pubs)
	
	# get statistics for comparison of our method against true data
	num_found_pubs = len(pubs)
	tp = len(filter(lambda x: len(filter(lambda y: similar_titles(x.title,y[0]), true_pubs)) > 0,pubs))
	fp_list = filter(lambda x: len(filter(lambda y: similar_titles(x.title,y[0]), true_pubs)) == 0,pubs)
	#print '\n'.join(['%s, %s' % (x.title,','.join(['%s %s' % (y[0],y[1]) for y in x.authors])) for x in fp_list])
	fp = len(fp_list)
	fn_list = filter(lambda x: len(filter(lambda y: similar_titles(x.title,y[0],3), true_pubs)) > 0,rejected_pubs)
	fn = len(fn_list)
	#print '\n'.join(['%s, %s' % (x.title,','.join(['%s %s' % (y[0],y[1]) for y in x.authors])) for x in fn_list])
	
	# make the true pubs
	match_pubs = list(all_pubs)
	tpubs = []
	for p in true_pubs:
		hits = filter(lambda x: similar_titles(x.title,p[0]),match_pubs)
		hits.sort(cmp=lambda x,y: -cmp(x.cites,y.cites))
		
		if len(hits) > 0:
			hit = hits[0]
			match_pubs.remove(hit)
			tpubs.append(Publication(title=p[0],year=p[1],cites=hit.cites))
	
	#print '%d (%d) %d (%d) %d (%d)' % (pubstats.h_index(tpubs),len(tpubs),pubstats.h_index(pubs),len(pubs),pubstats.h_index(all_pubs),len(all_pubs))
	
	return all_pubs,pubs,tpubs,len(all_pubs),num_found_pubs,len(true_pubs),len(tps1995),tp,fp,fn
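A minimal usage sketch showing how the 10-element return value is unpacked (this mirrors the call inside validate() in the next example); the name, URL, and data file below are placeholders, not values from the original test data:

# hedged sketch: 'J Smith', the URL and 'jsmith_pubs.txt' are hypothetical placeholders
all_pubs, est_pubs, true_pubs, num_pubs, num_est, num_true, num_true_1995, tp, fp, fn = \
	compute_test_case_stats('J Smith', 'http://example.org/~jsmith', 'jsmith_pubs.txt')
print 'kept %d of %d scraped pubs: TP=%d FP=%d FN=%d' % (num_est, num_pubs, tp, fp, fn)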
Example #15
def validate(filters=BEST_FILTERS,min_vocab_match_size=2,vocab_use_pct=1.0,replace_url_fxn=None):
	"""
	Run Topp on the test data using the filters and parameters specified (filters, min_vocab_match_size,vocab_use_pct).
	
	replace_url_fxn(name,url,data) is a function that produces text content that should be used in place of the individual's URL.
	The output should be a string.
	"""
	tmp_path = rb().data_path('validation','__TMP__REPLACE_CONTENT__.txt')
	tmp_url = 'file://%s' % tmp_path
	
	reader = csv.reader(rb().open_data(['validation','batch.txt'],'r'),delimiter=',')
	
	senses = []
	specs = []
	fp_pcts = []
	hindicies = []
	true_hindicies = []
	eindicies = []
	true_eindicies = []
	
	#print 'All_pubs Est_Pubs True_Pubs True_Pubs_1995 TP FP FN'
	#print 'Name\tSense\t\tSpec\t\t%FP'
	num_records = 0
	for data in reader:
		num_records += 1
		if replace_url_fxn is None:
			all_pubs,est_pubs,true_pubs,num_pubs,num_est_pubs,num_true_pubs,num_true_1995pubs,tp,fp,fn = compute_test_case_stats(*(list(data) + [filters,min_vocab_match_size,vocab_use_pct]))
		else:
			# get the new content
			content = replace_url_fxn(*data)
			
			# write the content to the temporary file
			fh = open(tmp_path,'w')
			fh.write(content)
			fh.close()
			
			all_pubs,est_pubs,true_pubs,num_pubs,num_est_pubs,num_true_pubs,num_true_1995pubs,tp,fp,fn = compute_test_case_stats(*([data[0],tmp_url,data[2]] + [filters,min_vocab_match_size,vocab_use_pct]))
			
		hindicies.append(pubstats.h_index(est_pubs))
		eindicies.append(pubstats.e_index(est_pubs))
		true_hindicies.append(pubstats.h_index(true_pubs))
		true_eindicies.append(pubstats.e_index(true_pubs))
		tn = num_pubs - tp - fp - fn
		sense = sensitivity(tp,fn)
		senses.append(sense)
		spec = specificity(tn,fp)
		specs.append(spec)
		fp_pct = float(fp) / float(tp + fp) if (tp + fp) != 0 else float('infinity')
		fp_pcts.append(fp_pct)
		
		names = data[0].split()
		#print names[0][0] + ' ' + names[-1][:4],'\t','%f\t%f\t%f' % (sense,spec,fp_pct)
	
	# population means and standard deviations (the SDEV values printed and returned below)
	avg_sense = sum(senses) / float(num_records)
	sdv_sense = math.sqrt(sum([math.pow(x-avg_sense,2) for x in senses]) / float(num_records))
	avg_spec = sum(specs) / float(num_records)
	sdv_spec = math.sqrt(sum([math.pow(x-avg_spec,2) for x in specs]) / float(num_records))
	avg_fp_pct = sum(fp_pcts) / float(num_records)
	sdv_fp_pct = math.sqrt(sum([math.pow(x-avg_fp_pct,2) for x in fp_pcts]) / float(num_records))
	
	print '='*80
	print 'AVG\t%f\t%f\t%f' % (avg_sense,avg_spec,avg_fp_pct)
	print 'SDEV\t%f\t%f\t%f' % (sdv_sense,sdv_spec,sdv_fp_pct)
	
	return (avg_sense,sdv_sense),(avg_spec,sdv_spec),(avg_fp_pct,sdv_fp_pct),(hindicies,true_hindicies),(eindicies,true_eindicies)
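For reference, a minimal sketch of how the returned tuple is consumed by the analysis and plotting functions above (the vocab_use_pct value here is arbitrary):

# hedged sketch: R[0] and R[1] are (mean, spread) pairs, R[3] and R[4] pair estimated with true indices
R = validate(BEST_FILTERS, vocab_use_pct=0.5)
avg_sense, sdv_sense = R[0]
avg_spec, sdv_spec = R[1]
his, true_his = R[3]
eis, true_eis = R[4]
print 'sensitivity %.3f  specificity %.3f  (%d test cases)' % (avg_sense, avg_spec, len(his))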