import os
import shutil

import numpy
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVR


def bc3_eval():
    corpus = 'bc3/bc3corpus.1.0/corpus.xml'
    annotation = 'bc3/bc3corpus.1.0/annotation.xml'
    mails = parse_file(corpus)
    mails = parse_anno(annotation, mails)

    sample_vector = []
    target_vector = []
    precision_vector = []
    recall_vector = []
    F_measure_vector = []
    imp = 0
    ti_s = 0
    to_s = 0
    train = 20  # number of mails used for training
    index = 0

    # Pass 1: for each training mail, score every sentence against the human
    # summaries with ROUGE and collect one feature vector per sentence.
    for i in mails:
        if index >= train:
            break
        index += 1
        vec, important, title_sim = process_mail(i)
        topic_similarity = lda_process(vec)
        phrase_list, vec_phrase_index = phrase_extraction(vec)
        word_list = word_score_calculate(phrase_list)
        vec_phrase_score = important_word(vec, word_list, phrase_list, vec_phrase_index)
        cue_word_score = cue_word(vec)
        standard_summary = [anno(i)]
        standard_name = [[]]
        for j in range(len(standard_summary[0])):
            standard_name[0].append('PythonROUGE/' + i.name + '/' + str(j) + '_standard.txt')
        newpath = 'PythonROUGE/' + i.name
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        for j in range(len(vec)):
            produce_name = ['PythonROUGE/' + i.name + '/' + str(j) + '.txt']
            produce_summary = [[vec[j].sentence]]
            sample_vector.append([important[j], title_sim[j], topic_similarity[j],
                                  vec_phrase_score[j], cue_word_score[j]])
            imp += important[j]
            ti_s += title_sim[j]
            to_s += topic_similarity[j]
            recall, precision, F_measure = summary_eval(standard_summary, standard_name,
                                                        produce_summary, produce_name)
            target_vector.append(recall[0] * precision[0] * F_measure[0])
            recall_vector.append(recall[0])
            precision_vector.append(precision[0])
            F_measure_vector.append(F_measure[0])
        shutil.rmtree(newpath)

    # Normalise the importance, title-similarity and topic-similarity features
    # by their sums over the training sentences.
    for i in sample_vector:
        i[0] = i[0] / imp
        i[1] = i[1] / ti_s
        i[2] = i[2] / to_s

    # Held-out splits (sklearn.cross_validation was removed in scikit-learn
    # 0.20; train_test_split now lives in sklearn.model_selection).  Note the
    # splits are never used below: the regressors are fitted on all samples.
    x_recall_train, x_recall_test, y_recall_train, y_recall_test = train_test_split(
        sample_vector, recall_vector, test_size=0.2, random_state=0)
    x_precision_train, x_precision_test, y_precision_train, y_precision_test = train_test_split(
        sample_vector, precision_vector, test_size=0.2, random_state=0)
    x_fmeasure_train, x_fmeasure_test, y_fmeasure_train, y_fmeasure_test = train_test_split(
        sample_vector, F_measure_vector, test_size=0.2, random_state=0)

    # Grid-search one SVR per ROUGE target.
    tuned_parameters = [{'kernel': ['rbf'],
                         'gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
                         'C': [1, 10, 100, 1000]},
                        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    r_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters, cv=5)
    p_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters, cv=5)
    f_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters, cv=5)
    r_clf.fit(sample_vector, recall_vector)
    p_clf.fit(sample_vector, precision_vector)
    f_clf.fit(sample_vector, F_measure_vector)

    # Pass 2: summarise the remaining mails with the F-measure regressor and
    # compare against a plain sentence-importance baseline ("lex").
    index = 0
    produce_summary = []
    produce_name = []
    standard_summary = []
    standard_name = []
    lex_summary = []
    lex_name = []
    for i in mails:
        if index < train:
            index += 1
            continue
        if i.name == 'Re:_StarOffice' or i.name == 'Try_Unsubscribing––You_Can\'t':
            continue
        vec, important, title_sim = process_mail(i)
        topic_similarity = lda_process(vec)
        phrase_list, vec_phrase_index = phrase_extraction(vec)
        word_list = word_score_calculate(phrase_list)
        vec_phrase_score = important_word(vec, word_list, phrase_list, vec_phrase_index)
        cue_word_score = cue_word(vec)
        standard_summary.append(anno(i))
        tmp_name = []
        for j in range(len(standard_summary[-1])):
            tmp_name.append('PythonROUGE/' + i.name + '/' + str(j) + '_standard.txt')
        standard_name.append(tmp_name)
        newpath = 'PythonROUGE/' + i.name
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        tmp_summary = []
        predict_rouge = []
        for j in range(len(vec)):
            # Alternative: rank by r_clf * p_clf * f_clf instead of f_clf alone.
            # predict() expects a 2-D array of samples; take the scalar result.
            tmp = f_clf.predict([[important[j], title_sim[j], topic_similarity[j],
                                  vec_phrase_score[j], cue_word_score[j]]])
            predict_rouge.append(tmp[0])
        sort_index = numpy.argsort(predict_rouge)[::-1]
        sort_index2 = numpy.argsort(important)[::-1]
        # Variant (disabled): keep the top-10 sentences per mail.
        # for j in range(10):
        #     tmp_summary.append(vec[sort_index[j]].sentence)
        #     tmp_name.append('PythonROUGE/' + i.name + '/' + str(j) + '.txt')
        lex_summary.append([vec[sort_index2[0]].sentence])  # list, to match produce_summary
        tmp_summary.append(vec[sort_index[0]].sentence)
        # j is the last sentence index from the loop above; the name is still
        # unique because every mail writes into its own directory.
        produce_name.append('PythonROUGE/' + i.name + '/' + str(j) + '.txt')
        lex_name.append('PythonROUGE/' + i.name + '/' + str(j) + '_lex' + '.txt')
        produce_summary.append(tmp_summary)

    recall, precision, F_measure = summary_eval(standard_summary, standard_name,
                                                produce_summary, produce_name)
    print('recall:')
    print(recall)
    print('precision:')
    print(precision)
    print('F_measure:')
    print(F_measure)

    recall, precision, F_measure = summary_eval(standard_summary, standard_name,
                                                lex_summary, lex_name)
    print('lex_recall:')
    print(recall)
    print('lex_precision:')
    print(precision)
    print('lex_F_measure:')
    print(F_measure)
    return f_clf
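# Usage sketch (illustrative addition, not part of the original pipeline):
# bc3_eval() returns the grid-searched F-measure regressor, which can then
# rank the sentences of an unseen mail.  `rank_mail_sentences` is a
# hypothetical helper; it reuses the module's own feature extractors and,
# like pass 2 of bc3_eval(), predicts on un-normalised features.
def rank_mail_sentences(mail, f_clf, top_n=3):
    """Return the top_n sentences of `mail`, scored by the trained SVR."""
    vec, important, title_sim = process_mail(mail)
    topic_similarity = lda_process(vec)
    phrase_list, vec_phrase_index = phrase_extraction(vec)
    word_list = word_score_calculate(phrase_list)
    vec_phrase_score = important_word(vec, word_list, phrase_list, vec_phrase_index)
    cue_word_score = cue_word(vec)
    features = [[important[j], title_sim[j], topic_similarity[j],
                 vec_phrase_score[j], cue_word_score[j]] for j in range(len(vec))]
    order = numpy.argsort(f_clf.predict(features))[::-1]
    return [vec[j].sentence for j in order[:top_n]]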
def parse_testset(self, testset_file):
    return xmlparser.parse_file(main=self, parent=self, file=testset_file)
import os

import xmlparser

# Load the default schema that ships alongside this module.
schema = xmlparser.parse_file(os.path.join(os.path.split(__file__)[0], "default.xml"))