def load_data(datasets):
    """Load the BC3 corpus, attach precomputed sentence vectors, and build
    train/valid/test feature matrices with discretized relevance scores.

    ``datasets`` is currently unused; it is kept for interface
    compatibility with existing callers.

    Returns a 3-tuple:
        (process_dataset(train_data, test_data, valid_data),
         feature vector width (len of one training row),
         testSet)  # the raw held-out mails, for later evaluation
    """
    # Each *_data pair is [feature_rows, labels].
    train_data = [[], []]
    test_data = [[], []]
    valid_data = [[], []]
    vectors = readVectors('bc3_vector_with_subject')
    corpus = 'bc3/bc3corpus.1.0/corpus.xml'
    annotation = 'bc3/bc3corpus.1.0/annotation.xml'
    mails = parse_bc3(corpus, annotation)
    assignVectors(mails, vectors)
    # 60% train / 20% valid / remaining 20% test split over the mails.
    trainSet, validSet, testSet = divide_data(len(mails), 0.6, 0.2, mails)

    def assignBinaryScore(dataset, output):
        # Discretize each sentence's annotator score into 4 buckets and
        # emit sentence vector + per-thread subject/thread features.
        # NOTE(review): scores in (0, 0.32) and scores >= 0.99 both land
        # in bucket 3 -- looks intentional, but worth confirming.
        for i in dataset:
            for j in i.text:
                if j.score >= 0.32 and j.score < 0.65:
                    score = 1
                elif j.score >= 0.65 and j.score < 0.99:
                    score = 2
                elif j.score == 0:
                    score = 0
                else:
                    score = 3
                # j.index looks like "<thread>.<sentence>"; the 1-based
                # thread part selects the per-thread feature rows.
                thread_idx = int(j.index.split('.')[0]) - 1
                subject = i.subject_feature[thread_idx]
                thread = i.thread_feature[thread_idx]
                output[0].append(j.vector + subject + thread)
                output[1].append(score)

    assignBinaryScore(trainSet, train_data)
    assignBinaryScore(validSet, valid_data)
    assignBinaryScore(testSet, test_data)
    return (process_dataset(train_data, test_data, valid_data),
            len(train_data[0][0]),
            testSet)
def calculate_sentence_features(mails):
    """Compute per-sentence features for every mail in *mails* (mutating
    the sentence objects in place), then run a 5-fold experiment that
    trains an RNN plus two SVR models (original vs. "para" feature sets)
    and prints weighted-recall and ROUGE-L scores.

    NOTE(review): `repeat_words` and `paragraph_sim` are free names here,
    presumably module-level globals -- confirm they are defined before
    this function is called.
    """
    token_dict = {}
    stemmer = PorterStemmer()  # NOTE(review): appears unused in this function
    tfidf = tfidf_generize(mails, token_dict)
    # Reset per-sentence feature lists before (re)computing anything.
    for i in mails:
        for j in i.thread:
            for k in j.sentences:
                k.sentence_features = []
    # Min/max trackers for normalization; the 100 sentinels assume real
    # scores stay below 100. Most of these are only updated inside
    # commented-out code and are effectively unused now.
    max_sum = 0
    min_sum = 100
    max_avg = 0
    min_avg = 100
    max_sub_sim = 0
    min_sub_sim = 100
    max_clue_score = 0
    min_clue_score = 100
    #clue_score_calculation(mails,repeat_words)
    speech_act_score(mails)
    # Pass 1: mail-, thread-, and sentence-level scores written onto the
    # objects by the helper functions (relative_position, length, ...).
    for i in mails:
        relative_position(i)
        tmp_max, tmp_min = subject_sim(i)
        if tmp_max > max_sub_sim:
            max_sub_sim = tmp_max
        if tmp_min < min_sub_sim:
            min_sub_sim = tmp_min
        message_number(i)
        t_index = 0
        for j in i.thread:
            # TF-IDF vector of this thread's subject line, for
            # sentence-vs-subject cosine similarity below.
            tmp_subject = i.subject[t_index]
            subject_tfidf = numpy.array(tfidf.transform([tmp_subject]).todense())[0]
            t_index += 1
            m_rel_pos(j)
            fol_quote(j)
            for k in j.sentences:
                length(k)
                is_question(k)
                k.tfidf = numpy.array(tfidf.transform([k.sentence]).todense())[0]
                tfidf_sum(k)
                # Dead code (kept as a no-op string literal): old min/max
                # tracking for tfidf_sum_score normalization.
                '''
                if k.tfidf_sum_score > max_sum:
                    max_sum = k.tfidf_sum_score
                elif k.tfidf_sum_score <min_sum:
                    min_sum = k.tfidf_sum_score
                '''
                tfidf_avg(k)
                #print tmp_subject
                k.tfidf_subject_similarity_score = cosine_similarity(k.tfidf, subject_tfidf)
                # Dead code: old min/max tracking for tfidf_avg/clue scores.
                '''
                if k.tfidf_avg_score > max_avg:
                    max_avg = k.tfidf_avg_score
                elif k.tfidf_avg_score < min_avg:
                    min_avg = k.tfidf_avg_score
                if k.clue_score > max_clue_score:
                    max_clue_score = k.clue_score
                elif k.clue_score < min_clue_score:
                    min_clue_score = k.clue_score
                '''
    # Pass 2: attach embedding vectors and compute similarity of each
    # sentence to its thread's subject vector and topic (thread) vector.
    vectors = readVectors('bc3_vector_with_subject')
    assignVectors(mails, vectors)
    for i in mails:
        #print len(i.thread)
        #print len(i.thread_feature)
        subjectivity_score(i)
        t_index = 0
        for j in i.thread:
            #print len(i.thread)
            question_similarity(j, paragraph_sim)
            j.vector = i.thread_feature[t_index]
            j.subject_vector = i.subject_feature[t_index]
            t_index += 1
            pre_sim(j, paragraph_sim)
            #print t_index
            for k in j.sentences:
                # Match sentence to its entry in i.text by index to fetch
                # the precomputed sentence vector.
                for l in i.text:
                    if k.index == l.index:
                        k.vector = l.vector
                        k.subject_similarity_score = cosine_similarity(k.vector, j.subject_vector)
                        k.topic_similarity_score = cosine_similarity(k.vector, j.vector)
                        break
    #input_x = []
    #input_y = []
    clue_score_calculation(mails, repeat_words, paragraph_sim)
    sentiment_score(mails, 'bc3_sentiment_vectors.txt')
    # Assemble the two competing feature sets on each sentence:
    # unnor_sentence_features = un-normalized "original" features,
    # para_features = paragraph/discourse-oriented features.
    for i in mails:
        for j in i.thread:
            for k in j.sentences:
                #k.subject_sim_score = (k.subject_sim_score - min_sub_sim)/float(max_sub_sim-min_sub_sim)
                #k.tfidf_sum_score = (k.tfidf_sum_score - min_sum) / float(max_sum - min_sum)
                #k.tfidf_avg_score = (k.tfidf_avg_score - min_avg) / float(max_avg - min_avg)
                k.unnor_sentence_features = [k.req, k.dlv, k.cmt, k.prop, k.meet, k.ddata, k.subjectivity_score, k.tfidf_subject_similarity_score, k.clue_score, k.subject_sim_score, k.message_number_score, k.fol_quote_score, k.m_rel_pos_score, k.is_question_score, k.tfidf_sum_score, k.tfidf_avg_score, k.length_score, k.relative_position_score]
                #k.clue_score = (k.clue_score - min_clue_score) / float(max_clue_score - min_clue_score)
                k.para_features = [k.subjectivity_score, k.qa_score, k.question_similarity_score, k.seq_sim_score, k.para_clue_score, k.fol_quote_score, k.is_question_score, k.tfidf_sum_score, k.tfidf_avg_score, k.subject_similarity_score, k.topic_similarity_score]
                #print k.subject_sim_score
                #k.sentence_features = [k.clue_score,k.subject_sim_score,k.message_number_score,k.fol_quote_score,k.m_rel_pos_score,k.is_question_score,k.tfidf_sum_score,k.tfidf_avg_score,k.length_score,k.relative_position_score]
                #print k.sentence_features
                #input_x.append(k.sentence_features)
                #input_y.append(k.score)
    # Attribute-name -> type maps (presumably a Weka-style ARFF header
    # description -- see the commented-out python_weka usage below).
    # NOTE(review): unnor_label has 13 entries vs. 18 features in
    # unnor_sentence_features -- confirm this mismatch is harmless with
    # the current (GridSearchCV) path, where `label` is unused.
    unnor_label = {'subj': 'real', 'tfidf_sim': 'real', 'clue': 'real', 'sub_sim': 'real', 'm_num': 'real', 'fol': 'real', 'rel': 'real', 'is_q': 'real', 'tfidf_sum': 'real', 'tfidf_avg': 'real', 'leng': 'real', 'rel_pos': 'real', 'score': 'real'}
    para_label = {'sen': 'real', 'subj': 'real', 'qa': 'real', 'qs': 'real', 'seq': 'real', 'clue': 'real', 'fol': 'real', 'is_q': 'real', 'tfidf_sum': 'real', 'tfidf_avg': 'real', 'subj_sim': 'real', 'topic_sim': 'real', 'score': 'real'}
    # Dead code: an earlier para_features formulation.
    '''
    clue_score_calculation(mails,paragraph_sim)
    for i in mails:
        for j in i.thread:
            for j in j.sentences:
                k.para_features = [k.clue_score,k.message_number_score,k.fol_quote_score,k.m_rel_pos_score,k.is_question_score,k.tfidf_sum_score,k.tfidf_avg_score,k.length_score,k.relative_position_score,k.subject_similarity_score,k.topic_similarity_score]
    '''
    #input_x = numpy.asarray(input_x)
    #input_y = numpy.asarray(input_y)
    # basic feature extraction end
    #clue_score
    #clue_score end
    # Dead code: an earlier vector-assignment pass that also built
    # para_features by appending to sentence_features.
    '''
    vectors = readVectors('bc3_vector_with_subject')
    assignVectors(mails,vectors)
    for i in mails:
        #print len(i.thread)
        #print len(i.thread_feature)
        t_index = 0
        for j in i.thread:
            #print len(i.thread)
            j.vector = i.thread_feature[t_index]
            j.subject_vector = i.subject_feature[t_index]
            t_index += 1
            #print t_index
            for k in j.sentences:
                for l in i.text:
                    if k.index == l.index:
                        k.vector = l.vector
                        k.subject_similarity_score = cosine_similarity(k.vector,j.subject_vector)
                        k.topic_similarity_score = cosine_similarity(k.vector,j.vector)
                        k.para_features = k.sentence_features
                        k.para_features.append(k.subject_similarity_score)
                        k.para_features.append(k.topic_similarity_score)
                        break
    '''
    # Running totals across the 5 experiment repetitions:
    # ori/o_ro = original features (weightRecall / ROUGE),
    # para/p_ro = para features, rnn_score/rnn_rouge = RNN baseline.
    ori = 0.0
    para = 0.0
    o_ro = 0.0
    p_ro = 0.0
    rnn_rouge = 0.0
    rnn_score = 0.0
    for p in range(5):
        # 80/0/20 split: valid is empty here; test is the remaining 20%.
        train, valid, test = divide_data(len(mails), 0.8, 0, mails)
        # Only the first repetition dumps hidden state to "rnn_hidden".
        if p == 0:
            tmp_rnn = rnn_summ(train, test, "rnn_hidden")
        else:
            tmp_rnn = rnn_summ(train, test)
        tmp_rnn.init_rnn(0.01, 0)
        # 6000 epochs with a step learning-rate decay (x0.9 every 100).
        rate = 0.33
        for j in range(6000):
            if j % 100 == 0:
                rate = rate * 0.9
            error = tmp_rnn.rnn_train(rate)
            if j % 100 == 0:
                print error
        if p == 0:
            tmp_rnn.close_file()
        # Collect feature matrices / targets for this fold. Ordering here
        # must match the sentence iteration order used in eval() below,
        # since predictions are matched back by position (score_index).
        unnor_input_x = []
        unnor_test_x = []
        para_input_x = []
        para_test_x = []
        input_x = []
        input_y = []
        for i in train:
            for j in i.thread:
                for k in j.sentences:
                    input_x.append(k.sentence_features)
                    input_y.append(k.score)
                    unnor_input_x.append(k.unnor_sentence_features)
                    para_input_x.append(k.para_features)
        test_x = []
        test_y = []
        for i in test:
            for j in i.thread:
                for k in j.sentences:
                    test_x.append(k.sentence_features)
                    unnor_test_x.append(k.unnor_sentence_features)
                    para_test_x.append(k.para_features)
                    test_y.append(k.score)
        # Standardize train+test together, then split back by length.
        # NOTE(review): scaling with test data included leaks test
        # statistics into the scaler -- flagging, not changing.
        tmp_input = unnor_input_x + unnor_test_x
        tmp_input = preprocessing.scale(tmp_input)
        unnor_input_x = tmp_input[0:len(unnor_input_x)]
        unnor_test_x = tmp_input[len(unnor_input_x):len(tmp_input)]
        tmp_input = para_input_x + para_test_x
        tmp_input = preprocessing.scale(tmp_input)
        para_input_x = tmp_input[0:len(para_input_x)]
        para_test_x = tmp_input[len(para_input_x):len(tmp_input)]

        def eval(input_x, input_y, test_x, test, label, write_folder=None):
            # Train a grid-searched SVR on (input_x, input_y), predict a
            # relevance score per test sentence, keep the top ~30% of
            # sentences per mail as the summary, and score it with
            # weightRecall and ROUGE-L.
            # NOTE(review): shadows the builtin eval(); `label` is only
            # used by the commented-out python_weka path.
            tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
            grid_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters)
            grid_clf.fit(input_x, input_y)
            print "params : \t"
            print grid_clf.get_params()
            result = grid_clf.predict(test_x)
            #py_weka = python_weka(input_x,input_y,label)
            #py_weka.train()
            #result = py_weka.predict(test_x)
            #py_weka.close()
            #clf = SVR(C=1.0, epsilon=0.2)
            #clf.fit(input_x,input_y)
            #result = clf.predict(test_x)
            # result is positional: score_index walks it in the same
            # sentence order the test matrix was built in.
            score_index = 0
            produce_set = []
            for i in test:
                produce_set.append([])
                score_list = []
                index_list = []
                for j in i.thread:
                    for k in j.sentences:
                        k.predict_score = result[score_index]
                        score_index += 1
                        score_list.append(k.predict_score)
                        index_list.append(k.index)
                # Take sentences from highest predicted score down until
                # 30% of the mail's sentences are selected.
                sorted_index_array = sorted_index(score_list)
                sen_length = 0
                for j in range(len(index_list)):
                    if sen_length < float(len(index_list)) * 0.3:
                        produce_set[-1].append(index_list[sorted_index_array[len(index_list) - j - 1]])
                        sen_length += 1
                    else:
                        break
            score = weightRecall(test, produce_set, write_folder)
            print score
            rouge_eval = rouge(test, produce_set)
            rouge_score = rouge_eval.eval()['rouge_l_f_score']
            print rouge_score
            return score, rouge_score

        # --- RNN baseline for this fold ---
        print "rnn:"
        rnn_tmp_score, rnn_tmp_rouge = tmp_rnn.rnn_test()
        # Dead code: variant that wrote RNN output to a folder on fold 0.
        '''
        if p != 0:
            rnn_tmp_score,rnn_tmp_rouge = tmp_rnn.rnn_test()
        else:
            rnn_tmp_score,rnn_tmp_rouge = tmp_rnn.rnn_test("rnn_folder")
        '''
        rnn_score += rnn_tmp_score
        rnn_rouge += rnn_tmp_rouge
        # NOTE(review): despite the "avg" labels, these print running
        # sums, not averages (never divided by the fold count).
        print "avg rnn:"
        print rnn_score
        print "avg rnn rouge:"
        print rnn_rouge
        print "\n"
        # --- SVR on original (un-normalized) features ---
        print "ori:"
        if p == 0:
            ori_score, ori_rouge = eval(unnor_input_x, input_y, unnor_test_x, test, unnor_label, "ori_folder")
        else:
            ori_score, ori_rouge = eval(unnor_input_x, input_y, unnor_test_x, test, unnor_label)
        ori += ori_score
        o_ro += ori_rouge
        print "avg_ori:"
        print ori
        print "avg_ori_rouge:"
        print o_ro
        print "\n"
        # --- SVR on para features (writes output only on the last fold) ---
        print "para"
        if p != 4:
            para_score, para_rouge = eval(para_input_x, input_y, para_test_x, test, para_label)
        else:
            para_score, para_rouge = eval(para_input_x, input_y, para_test_x, test, para_label, "para_folder")
        para += para_score
        p_ro += para_rouge
        print "avg_para:"
        print para
        print "avg_para_rouge:"
        print p_ro
        print "\n"