def bc3_eval():
	"""Train and evaluate SVR-based sentence extractors on the BC3 corpus.

	Pipeline:
	  1. Parse the BC3 corpus and its human annotations.
	  2. For the first `train` mail threads, compute per-sentence features
	     (importance, title similarity, LDA topic similarity, phrase score,
	     cue-word score) and per-sentence ROUGE scores against the human
	     summaries; these form the regression training set.
	  3. Fit three GridSearch-tuned SVR regressors (recall / precision /
	     F-measure targets) on the full training set.
	  4. On the remaining threads, pick the sentence with the highest
	     predicted F-measure as the one-sentence summary; also build a
	     baseline summary from the highest raw importance score.
	  5. Print ROUGE recall/precision/F-measure for both systems.

	Returns:
		The fitted GridSearchCV-wrapped SVR model trained on F-measure.

	NOTE(review): relies on module-level helpers (parse_file, parse_anno,
	process_mail, lda_process, phrase_extraction, word_score_calculate,
	important_word, cue_word, anno, summary_eval) and on the deprecated
	sklearn `cross_validation` module -- presumably imported at file top.
	"""
	corpus = 'bc3/bc3corpus.1.0/corpus.xml'
	annotation = 'bc3/bc3corpus.1.0/annotation.xml'
	mails = parse_file(corpus)
	mails = parse_anno(annotation, mails)
	sample_vector = []      # one feature row per training sentence
	target_vector = []      # rouge product targets (computed but unused)
	precision_vector = []
	recall_vector = []
	F_measure_vector = []
	imp = 0                 # running sums for feature normalization
	ti_s = 0
	to_s = 0
	train = 20              # number of threads used for training
	index = 0
	# --- Phase 1: build the training set from the first `train` threads ---
	for i in mails:
		if index >= train:
			break
		index += 1
		vec, important, title_sim = process_mail(i)
		topic_similarity = lda_process(vec)
		phrase_list, vec_phrase_index = phrase_extraction(vec)
		word_list = word_score_calculate(phrase_list)
		vec_phrase_score = important_word(vec, word_list, phrase_list, vec_phrase_index)
		standard_summary = [anno(i)]
		standard_name = [[]]
		cue_word_score = cue_word(vec)
		for j in range(len(standard_summary[0])):
			standard_name[0].append('PythonROUGE/' + i.name + '/' + str(j) + '_standard.txt')
		newpath = 'PythonROUGE/' + i.name
		if not os.path.exists(newpath):
			os.makedirs(newpath)
		# Score every sentence individually against the human summaries.
		for j in range(len(vec)):
			produce_name = ['PythonROUGE/' + i.name + '/' + str(j) + '.txt']
			produce_summary = [[vec[j].sentence]]
			sample_vector.append([important[j], title_sim[j], topic_similarity[j],
			                      vec_phrase_score[j], cue_word_score[j]])
			imp += important[j]
			ti_s += title_sim[j]
			to_s += topic_similarity[j]
			recall, precision, F_measure = summary_eval(standard_summary, standard_name,
			                                            produce_summary, produce_name)
			target_vector.append(recall[0] * precision[0] * F_measure[0])
			recall_vector.append(recall[0])
			precision_vector.append(precision[0])
			F_measure_vector.append(F_measure[0])
		shutil.rmtree(newpath)
	# Normalize the first three features by their corpus-wide sums.
	for row in sample_vector:
		row[0] = row[0] / imp
		row[1] = row[1] / ti_s
		row[2] = row[2] / to_s
	# NOTE(review): these splits are computed but never used -- the models
	# below are fit on the full sample set.  `cross_validation` is the
	# long-deprecated sklearn module (modern code uses model_selection).
	x_recall_train, x_recall_test, y_recall_train, y_recall_test = cross_validation.train_test_split(
		sample_vector, recall_vector, test_size=0.2, random_state=0)
	x_precision_train, x_precision_test, y_precision_train, y_precision_test = cross_validation.train_test_split(
		sample_vector, precision_vector, test_size=0.2, random_state=0)
	x_fmeasure_train, x_fmeasure_test, y_fmeasure_train, y_fmeasure_test = cross_validation.train_test_split(
		sample_vector, F_measure_vector, test_size=0.2, random_state=0)
	# --- Phase 2: grid-search SVR hyperparameters and fit the regressors ---
	tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
	                     'C': [1, 10, 100, 1000]},
	                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
	r_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters, cv=5)
	p_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters, cv=5)
	f_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters, cv=5)
	r_clf.fit(sample_vector, recall_vector)
	p_clf.fit(sample_vector, precision_vector)
	f_clf.fit(sample_vector, F_measure_vector)
	# --- Phase 3: evaluate on the held-out threads ---
	index = 0
	produce_summary = []
	produce_name = []
	standard_summary = []
	standard_name = []
	lex_summary = []
	lex_name = []
	for i in mails:
		# Skip the training threads.
		if index < train:
			index += 1
			continue
		# Threads whose names break the ROUGE file handling are skipped.
		if i.name == 'Re:_StarOffice' or i.name == 'Try_Unsubscribing&ndash;&ndash;You_Can\'t':
			continue
		vec, important, title_sim = process_mail(i)
		topic_similarity = lda_process(vec)
		phrase_list, vec_phrase_index = phrase_extraction(vec)
		word_list = word_score_calculate(phrase_list)
		vec_phrase_score = important_word(vec, word_list, phrase_list, vec_phrase_index)
		cue_word_score = cue_word(vec)
		standard_summary.append(anno(i))
		tmp_name = []
		for j in range(len(standard_summary[-1])):
			tmp_name.append('PythonROUGE/' + i.name + '/' + str(j) + '_standard.txt')
		standard_name.append(tmp_name)
		newpath = 'PythonROUGE/' + i.name
		if not os.path.exists(newpath):
			os.makedirs(newpath)
		tmp_summary = []
		predict_rouge = []
		# Predicted F-measure for every sentence in the thread.
		for j in range(len(vec)):
			tmp = f_clf.predict([important[j], title_sim[j], topic_similarity[j],
			                     vec_phrase_score[j], cue_word_score[j]])
			predict_rouge.append(tmp)
		sort_index = numpy.argsort(predict_rouge)[::-1]
		sort_index2 = numpy.argsort(important)[::-1]
		# Baseline: highest raw importance.  System: highest predicted rouge.
		lex_summary.append(vec[sort_index2[0]].sentence)
		tmp_summary.append(vec[sort_index[0]].sentence)
		# NOTE(review): `j` here is the stale index left over from the loop
		# above (len(vec) - 1), not the chosen sentence -- likely a bug, but
		# preserved so the generated file names match the original behavior.
		produce_name.append('PythonROUGE/' + i.name + '/' + str(j) + '.txt')
		lex_name.append('PythonROUGE/' + i.name + '/' + str(j) + '_lex' + '.txt')
		produce_summary.append(tmp_summary)
	recall, precision, F_measure = summary_eval(standard_summary, standard_name, produce_summary, produce_name)
	print('recall:')
	print(recall)
	print('precision:')
	print(precision)
	print('F_measure:')
	print(F_measure)
	recall, precision, F_measure = summary_eval(standard_summary, standard_name, lex_summary, lex_name)
	print('lex_recall:')
	print(recall)
	print('lex_precision:')
	print(precision)
	print('lex_F_measure:')
	print(F_measure)
	return f_clf
# Exemple #2
# 0
	def parse_testset(self, testset_file):
		"""Parse *testset_file* with the XML parser, using this object as both main and parent."""
		parsed = xmlparser.parse_file(main=self, parent=self, file=testset_file)
		return parsed
# Exemple #3
# 0
import xmlparser
import os

# Load the default schema that ships in the same directory as this module.
schema = xmlparser.parse_file(os.path.join(os.path.dirname(__file__), "default.xml"))