def analyze_indicies(): X = [float(x)/10. for x in range(11)] Yh_delta = [] Ye_delta = [] Yh_ratio = [] Ye_ratio = [] for i,x in enumerate(X): print i,x R = validate(BEST_FILTERS,vocab_use_pct=x) his,true_his = R[3] eis,true_eis = R[4] Yh_delta.append(sum([float(abs(x-y)) for x,y in zip(his,true_his)]) / float(len(his))) Ye_delta.append(sum([float(abs(x-y)) for x,y in zip(eis,true_eis)]) / float(len(eis))) Yh_ratio.append(sum([float(x)/float(y) if y > 0 else 0 for x,y in zip(his,true_his)]) / float(len(his))) Ye_ratio.append(sum([float(x)/float(y) if y > 0 else 0 for x,y in zip(eis,true_eis)]) / float(len(eis))) fh = rb().open_data(['validation','indicies_accuracy.py'],'w') fh.write('X = %s\n' % str(X)) fh.write('Yh_delta = %s\n' % str(Yh_delta)) fh.write('Ye_delta = %s\n' % str(Ye_delta)) fh.write('Yh_ratio = %s\n' % str(Yh_ratio)) fh.write('Ye_ratio = %s\n' % str(Ye_ratio)) fh.close()
def plot_indicies():
    """Plot the index-accuracy curves saved by analyze_indicies().

    Figure 1: average absolute difference between estimated and true
    citation indices. Figure 2: average estimated/true ratio.
    """
    env = {}
    execfile(rb().data_path('validation','indicies_accuracy.py'), env)
    X = env['X']
    Yh_delta = env['Yh_delta']
    Ye_delta = env['Ye_delta']
    Yh_ratio = env['Yh_ratio']
    Ye_ratio = env['Ye_ratio']
    # figure 1: absolute differences
    pl.figure()
    pl.plot(X,Yh_delta,'-ok',linewidth=2)
    pl.plot(X,Ye_delta,'--sk',linewidth=2)
    pl.grid(True)
    pl.title('Estimated-True Citation Indicies Difference')
    pl.ylabel('Avg difference between estimated and true citation indicies')
    pl.xlabel('% of vocabulary used',fontsize=14)
    pl.legend(['h-index','e-index'])
    # figure 2: estimated/true ratios
    pl.figure()
    pl.plot(X,Yh_ratio,'-ok',linewidth=2)
    pl.plot(X,Ye_ratio,'--sk',linewidth=2)
    bounds = pl.axis()  # queried but axis limits left at the defaults
    pl.grid(True)
    pl.title('Citation Indicies Fraction')
    pl.ylabel('Avg ratio of estimated/true citation indicies')
    pl.xlabel('% of vocabulary used',fontsize=14)
    pl.legend(['h-index','e-index'],'lower right')
    pl.show()
def replace_fxn(name, url, dfile, top_pct):
    """Return replacement page content built from the validation data file.

    Reads tab-separated (title, integer) rows from dfile, sorts them by the
    integer column descending (presumably the publication year; the top of
    the sort is the most recent entries), drops the top top_pct fraction,
    and returns the remainder as 'title value' lines joined by newlines.
    name and url are accepted for signature compatibility but unused.
    """
    rows = [(r[0], int(r[1])) for r in csv.reader(open(rb().data_path('validation',dfile),'r'),delimiter='\t')]
    rows.sort(cmp=lambda a, b: -cmp(a[1], b[1]))
    cutoff = int(math.floor(top_pct * float(len(rows))))
    return '\n'.join(['%s %d' % (title, value) for title, value in rows[cutoff:]])
def analyze_filter_correctness(): filter_combos = [ [],[YEAR_FILTER,DUPLICATE_FILTER],[NAME_FILTER,DUPLICATE_FILTER],[VOCAB_FILTER,DUPLICATE_FILTER],[YEAR_FILTER,VOCAB_FILTER,NAME_FILTER,DUPLICATE_FILTER] ] fh = rb().open_data(['validation','filter_correctness.py'],'w') fh.write('result = [\n') for fc in filter_combos: print '=== Computing %s ===' % str(fc) R = validate(fc) fh.write('[%s,%s],\n' % (str(tuple(fc)),str(R))) fh.write(']') fh.close()
def analyze_filter_indicies(): filter_combos = [ [],[YEAR_FILTER,DUPLICATE_FILTER],[NAME_FILTER,DUPLICATE_FILTER],[VOCAB_FILTER,DUPLICATE_FILTER],[YEAR_FILTER,VOCAB_FILTER,NAME_FILTER,DUPLICATE_FILTER] ] fh = rb().open_data(['validation','filter_indicies.py'],'w') fh.write('result = [\n') for fc in filter_combos: print '=== Computing %s ===' % str(fc) R = validate(fc) # his,true_his = R[3] # eis,true_eis = R[4] # # h_ratio = sum([float(y)/float(x) if x > 0 else 0 for x,y in zip(his,true_his)]) / float(len(his)) # e_ratio = sum([float(y)/float(x) if x > 0 else 0 for x,y in zip(eis,true_eis)]) / float(len(eis)) fh.write('[%s,%s],\n' % (str(tuple(fc)),str(R))) fh.write(']') fh.close()
def plot_min_vocab_match_size():
    """Plot sensitivity and specificity against the minimum number of
    vocab-publication matches (data from analyze_min_vocab_match_size)."""
    env = {}
    execfile(rb().data_path('validation','min_vocab_match_size.py'), env)
    X = env['X']
    sense = env['Ysense']
    spec = env['Yspec']
    pl.figure()
    pl.plot(X,sense,'-ok',linewidth=2)
    pl.plot(X,spec,'--sk',linewidth=2)
    pl.grid(True)
    pl.title('Effect of Min Vocab Match Size')
    pl.xlabel('Min # of vocab-pub matches',fontsize=14)
    pl.legend(['Sensitivity','Specificity'],'lower right')
    pl.show()
def plot_pubs_coverage():
    """Plot sensitivity and specificity against the fraction of vocabulary
    used (data from analyze_pubs_coverage)."""
    env = {}
    execfile(rb().data_path('validation','pubs_coverage.py'), env)
    X = env['X']
    sense = env['Ysense']
    spec = env['Yspec']
    pl.figure()
    pl.plot(X,sense,'-ok',linewidth=2)
    pl.plot(X,spec,'--sk',linewidth=2)
    pl.grid(True)
    pl.title('Effect of Vocabulary Size')
    pl.xlabel('% of vocabulary used',fontsize=14)
    pl.legend(['Sensitivity','Specificity'],'lower right')
    pl.show()
def analyze_min_vocab_match_size(): X = range(11) Ysense = [] Yspec = [] for i,x in enumerate(X): print i,x R = validate(BEST_FILTERS,min_vocab_match_size=x) Ysense.append(R[0][0]) Yspec.append(R[1][0]) fh = rb().open_data(['validation','min_vocab_match_size.py'],'w') fh.write('X = %s\n' % str(X)) fh.write('Ysense = %s\n' % str(Ysense)) fh.write('Yspec = %s\n' % str(Yspec)) fh.close()
def analyze_pubs_coverage(): X = [float(x)/10. for x in range(11)] Ysense = [] Yspec = [] for i,x in enumerate(X): print i,x R = validate(BEST_FILTERS,vocab_use_pct=x) Ysense.append(R[0][0]) Yspec.append(R[1][0]) fh = rb().open_data(['validation','pubs_coverage.py'],'w') fh.write('X = %s\n' % str(X)) fh.write('Ysense = %s\n' % str(Ysense)) fh.write('Yspec = %s\n' % str(Yspec)) fh.close()
def plot_true_pub_vocabulary():
    """Plot accuracy as the most recent publications are removed from the
    true data (data from analyze_true_pub_vocabulary)."""
    env = {}
    execfile(rb().data_path('validation','filter_true_pub_vocab.py'), env)
    rows = env['result']
    X = [row[0] for row in rows]
    sense = [row[1][0][0] for row in rows]  # mean sensitivity per removal pct
    spec = [row[1][1][0] for row in rows]   # mean specificity per removal pct
    pl.figure()
    pl.plot(X,sense,'-ok',linewidth=2)
    pl.plot(X,spec,'--sk',linewidth=2)
    pl.grid(True)
    pl.title('Effect of Recent Publication Loss on Accuracy')
    pl.xlabel('% most recent publications removed',fontsize=14)
    pl.legend(['Sensitivity','Specificity'],'lower right')
    pl.show()
def analyze_true_pub_vocabulary(): def replace_fxn(name,url,dfile,top_pct): content = [(x[0],int(x[1])) for x in csv.reader(open(rb().data_path('validation',dfile),'r'),delimiter='\t')] content.sort(cmp=lambda x,y: -cmp(x[1],y[1])) top_num = int(math.floor(top_pct * float(len(content)))) content = content[top_num:] return '\n'.join(['%s %d' % (x[0],x[1]) for x in content]) fh = rb().open_data(['validation','filter_true_pub_vocab.py'],'w') fh.write('result = [\n') X = [float(x)/10. for x in range(10)] for x in X: print '=== Computing %s ===' % str(x) R = validate(replace_url_fxn=lambda name,url,dfile: replace_fxn(name,url,dfile,x)) fh.write('[%s,%s],\n' % (str(x),str(R))) fh.write(']') fh.close()
def plot_filter_indicies():
    """Bar-plot the mean relative h-/e-index error for each filter combo
    recorded in validation/filter_indicies.py.

    The relative error per individual is |true - estimated| / true
    (0 when the true index is 0); x tick labels are the concatenated
    first letters of the filters in each combination.
    """
    env = {}
    execfile(rb().data_path('validation','filter_indicies.py'), env)
    result = env['result']
    # one tick label per combo, padded with blanks at both ends
    xticks = [''] + [''.join([f[0] for f in combo]) for combo, _ in result] + ['']
    h_ratios = []
    e_ratios = []
    for combo, R in result:
        his, true_his = R[3]
        eis, true_eis = R[4]
        h_ratios.append(sum(float(abs(true - est)) / float(true) if true > 0 else 0
                            for est, true in zip(his, true_his)) / float(len(his)))
        e_ratios.append(sum(float(abs(true - est)) / float(true) if true > 0 else 0
                            for est, true in zip(eis, true_eis)) / float(len(eis)))
    # h-index figure
    pl.figure()
    pl.grid(True)
    pl.bar(range(len(result)),h_ratios,ecolor='k',align='center')
    pl.xticks(range(-1,len(xticks)),xticks)
    pl.title('H-index')
    pl.ylabel('% difference between estimated and true index',fontsize=14)
    # e-index figure
    pl.figure()
    pl.grid(True)
    pl.bar(range(len(result)),e_ratios,ecolor='k',align='center')
    pl.xticks(range(-1,len(xticks)),xticks)
    pl.title('E-index')
    pl.ylabel('% difference between estimated and true index',fontsize=14)
    pl.show()
def plot_filter_correctness():
    """Bar-plot mean sensitivity and specificity (with error bars) for each
    filter combination recorded in validation/filter_correctness.py."""
    env = {}
    execfile(rb().data_path('validation','filter_correctness.py'), env)
    result = env['result']
    # one tick label per combo, padded with blanks at both ends
    xticks = [''] + [''.join([f[0] for f in combo]) for combo, _ in result] + ['']
    positions = range(len(result))
    # sensitivity figure (y-axis clamped to the interesting range)
    pl.figure()
    pl.grid(True)
    pl.bar(positions, [r[1][0][0] for r in result], yerr=[r[1][0][1] for r in result], ecolor='k', align='center')
    pl.axis([-1,len(xticks)-1,0.95,1.0])
    pl.xticks(range(-1,len(xticks)),xticks)
    pl.title('Sensitivity')
    # specificity figure
    pl.figure()
    pl.grid(True)
    pl.bar(positions, [r[1][1][0] for r in result], yerr=[r[1][1][1] for r in result], ecolor='k', align='center')
    pl.xticks(range(-1,len(xticks)),xticks)
    pl.title('Specificity')
    pl.show()
def compute_test_case_stats(name,url,dfile,filters=BEST_FILTERS,min_vocab_match_size=2,vocab_use_pct=1.0):
    """
    Run the full filter pipeline for one test individual and compare the
    surviving publications against the hand-curated true list.

    Parameters:
        name  -- individual's name; a leading '^' means "match on initials"
                 and is stripped before use
        url   -- individual's page URL, passed to the vocabulary filter
        dfile -- tab-separated validation file of (title, year) true pubs
        filters, min_vocab_match_size, vocab_use_pct -- filter configuration

    Results: # of total pubs found,# of found pubs,# true pubs,TP: # of matching pubs,FP, FN: # of unaccepted matching pubs
    (full return: all_pubs, filtered pubs, reconstructed true pubs, then the counts above)
    """
    use_initials = False
    if name.startswith('^'):
        use_initials = True
        name = name[1:]
    # load the true pubs
    true_pubs = []
    reader = csv.reader(open(rb().data_path('validation',dfile),'r'),delimiter='\t')
    for data in reader:
        # malformed rows are reported but still appended below -- a bad row
        # will raise on int(data[1]); NOTE(review): possibly intended to skip
        if len(data) != 2:
            print 'ERROR:',data
        true_pubs.append(tuple([data[0],int(data[1])]))
    # subset of true pubs from 1995 onward (second column assumed to be year)
    tps1995 = filter(lambda x: x[1] >= 1995,true_pubs)
    # get the publications for the individual
    all_pubs = obtain_individual_pubs(name,use_initials)
    pubs = list(all_pubs)
    duplicates = []
    # apply filters -- order matters: year, vocab, name, conflict, duplicate
    if YEAR_FILTER in filters:
        pubs = year_filter(1950,pubs)
    if VOCAB_FILTER in filters:
        pubs = vocab_filter(url,pubs,min_vocab_match_size,vocab_use_pct)
    if NAME_FILTER in filters:
        pubs = name_filter(name,pubs)
    if CONFLICT_FILTER in filters:
        pubs = name_conflict_filter(name,pubs)
    if DUPLICATE_FILTER in filters:
        pubs,duplicates = duplicate_filter(pubs)
    # everything the filters rejected (excluding detected duplicates)
    rejected_pubs = filter(lambda x: x not in pubs and x not in duplicates,all_pubs)
    # get statistics for comparison of our method against true data
    num_found_pubs = len(pubs)
    # TP: accepted pubs whose title matches some true pub
    tp = len(filter(lambda x: len(filter(lambda y: similar_titles(x.title,y[0]), true_pubs)) > 0,pubs))
    # FP: accepted pubs matching no true pub
    fp_list = filter(lambda x: len(filter(lambda y: similar_titles(x.title,y[0]), true_pubs)) == 0,pubs)
    #print '\n'.join(['%s, %s' % (x.title,','.join(['%s %s' % (y[0],y[1]) for y in x.authors])) for x in fp_list])
    fp = len(fp_list)
    # FN: rejected pubs that match a true pub (note the extra argument 3 --
    # presumably a stricter/looser similarity threshold; confirm in similar_titles)
    fn_list = filter(lambda x: len(filter(lambda y: similar_titles(x.title,y[0],3), true_pubs)) > 0,rejected_pubs)
    fn = len(fn_list)
    #print '\n'.join(['%s, %s' % (x.title,','.join(['%s %s' % (y[0],y[1]) for y in x.authors])) for x in fn_list])
    # make the true pubs: reconstruct Publication objects for the true list,
    # borrowing the citation count from the best (most-cited) title match;
    # each fetched pub can be consumed at most once (removed after matching)
    match_pubs = list(all_pubs)
    tpubs = []
    for p in true_pubs:
        hits = filter(lambda x: similar_titles(x.title,p[0]),match_pubs)
        hits.sort(cmp=lambda x,y: -cmp(x.cites,y.cites))
        if len(hits) > 0:
            hit = hits[0]
            match_pubs.remove(hit)
            tpubs.append(Publication(title=p[0],year=p[1],cites=hit.cites))
    #print '%d (%d) %d (%d) %d (%d)' % (pubstats.h_index(tpubs),len(tpubs),pubstats.h_index(pubs),len(pubs),pubstats.h_index(all_pubs),len(all_pubs))
    return all_pubs,pubs,tpubs,len(all_pubs),num_found_pubs,len(true_pubs),len(tps1995),tp,fp,fn
def validate(filters=BEST_FILTERS,min_vocab_match_size=2,vocab_use_pct=1.0,replace_url_fxn=None): """ Run Topp on the test data using the filters and parameters specified (filters, min_vocab_match_size,vocab_use_pct). replace_url_fxn(name,url,data) is a function that produces text content that should be used in place of the individual's URL. The output should be a string. """ tmp_path = rb().data_path('validation','__TMP__REPLACE_CONTENT__.txt') tmp_url = 'file://%s' % tmp_path reader = csv.reader(rb().open_data(['validation','batch.txt'],'r'),delimiter=',') senses = [] specs = [] fp_pcts = [] hindicies = [] true_hindicies = [] eindicies = [] true_eindicies = [] #print 'All_pubs Est_Pubs True_Pubs True_Pubs_1995 TP FP FN' #print 'Name\tSense\t\tSpec\t\t%FP' num_records = 0 for data in reader: num_records += 1 if replace_url_fxn is None: all_pubs,est_pubs,true_pubs,num_pubs,num_est_pubs,num_true_pubs,num_true_1995pubs,tp,fp,fn = compute_test_case_stats(*(list(data) + [filters,min_vocab_match_size,vocab_use_pct])) else: # get the new content content = replace_url_fxn(*data) # write the content to the temporary file fh = open(tmp_path,'w') fh.write(content) fh.close() all_pubs,est_pubs,true_pubs,num_pubs,num_est_pubs,num_true_pubs,num_true_1995pubs,tp,fp,fn = compute_test_case_stats(*([data[0],tmp_url,data[2]] + [filters,min_vocab_match_size,vocab_use_pct])) hindicies.append(pubstats.h_index(est_pubs)) eindicies.append(pubstats.e_index(est_pubs)) true_hindicies.append(pubstats.h_index(true_pubs)) true_eindicies.append(pubstats.e_index(true_pubs)) tn = num_pubs - tp - fp - fn sense = sensitivity(tp,fn) senses.append(sense) spec = specificity(tn,fp) specs.append(spec) fp_pct = float(fp) / float(tp + fp) if (tp + fp) != 0 else float('infinity') fp_pcts.append(fp_pct) names = data[0].split() #print names[0][0] + ' ' + names[-1][:4],'\t','%f\t%f\t%f' % (sense,spec,fp_pct) avg_sense = sum(senses) / float(num_records) sdv_sense = sum([math.pow(x-avg_sense,2) for x in 
senses]) / float(num_records) avg_spec = sum(specs) / float(num_records) sdv_spec = sum([math.pow(x-avg_spec,2) for x in specs]) / float(num_records) avg_fp_pct = sum(fp_pcts) / float(num_records) sdv_fp_pct = sum([math.pow(x-avg_fp_pct,2) for x in fp_pcts]) / float(num_records) print '='*80 print 'AVG\t%f\t%f\t%f' % (avg_sense,avg_spec,avg_fp_pct) print 'SDEV\t%f\t%f\t%f' % (sdv_sense,sdv_spec,sdv_fp_pct) return (avg_sense,sdv_sense),(avg_spec,sdv_spec),(avg_fp_pct,sdv_fp_pct),(hindicies,true_hindicies),(eindicies,true_eindicies)