# Earlier version of load_data: splits sentences into train/test/validation
# by a running sentence count and assigns a binary label (score > 0).
# NOTE: superseded by the load_data definition further below.
def load_data(dataset):
    train_data = [[], []]
    test_data = [[], []]
    valid_data = [[], []]
    train = 1000
    test = 500
    vectors = readVectors('bc3_vectors.txt')
    corpus = 'bc3/bc3corpus.1.0/corpus.xml'
    annotation = 'bc3/bc3corpus.1.0/annotation.xml'
    mails = parse_bc3(corpus, annotation)
    assignVectors(mails, vectors)
    count = 0
    for i in mails:
        for j in i.text:
            # Binarise the annotator score: any positive score means "in summary".
            score = 1 if j.score > 0 else 0
            if count < train:
                train_data[0].append(j.vector + i.vector)
                train_data[1].append(score)
            elif count < train + test:  # was `count > train and ...`, which misplaced count == train
                test_data[0].append(j.vector + i.vector)
                test_data[1].append(score)
            else:
                valid_data[0].append(j.vector + i.vector)
                valid_data[1].append(score)
            count += 1
    return (process_dataset(train_data, test_data, valid_data), len(train_data[0][0]))
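# process_dataset is called by load_data but not defined in this section. The
# sketch below is only an assumption about its behaviour (packaging each
# (features, labels) pair as numpy arrays in train/valid/test order); the real
# helper may differ.
import numpy

def process_dataset(train_data, test_data, valid_data):
    # Hypothetical sketch: convert each (features, labels) list pair to arrays.
    def to_arrays(d):
        return numpy.asarray(d[0], dtype='float32'), numpy.asarray(d[1], dtype='int32')
    return [to_arrays(train_data), to_arrays(valid_data), to_arrays(test_data)]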
def load_data(datasets):
    train_data = [[], []]
    test_data = [[], []]
    valid_data = [[], []]
    train = 1000  # used only by the commented-out count-based split below
    test = 500
    vectors = readVectors('bc3_vector_with_subject')
    corpus = 'bc3/bc3corpus.1.0/corpus.xml'
    annotation = 'bc3/bc3corpus.1.0/annotation.xml'
    mails = parse_bc3(corpus, annotation)
    assignVectors(mails, vectors)
    count = 0
    # Split threads 60/20/20 into train/validation/test sets.
    trainSet, validSet, testSet = divide_data(len(mails), 0.6, 0.2, mails)

    def assignBinaryScore(dataset, output):
        # Despite the name, this assigns a four-way class label from the
        # annotator score and concatenates each sentence vector with the
        # subject and thread features of the mail it belongs to.
        for i in dataset:
            for j in i.text:
                if j.score >= 0.32 and j.score < 0.65:
                    score = 1
                elif j.score >= 0.65 and j.score < 0.99:
                    score = 2
                elif j.score == 0:
                    score = 0
                else:
                    score = 3  # score in (0, 0.32) or >= 0.99
                mail_idx = int(j.index.split('.')[0]) - 1
                subject = i.subject_feature[mail_idx]
                thread = i.thread_feature[mail_idx]
                output[0].append(j.vector + subject + thread)
                output[1].append(score)

    assignBinaryScore(trainSet, train_data)
    assignBinaryScore(validSet, valid_data)
    assignBinaryScore(testSet, test_data)
    '''
    for i in mails:
        for j in i.text:
            if j.score > 0:
                score = 1
            else:
                score = 0
            if count < train:
                train_data[0].append(j.vector + i.vector)
                train_data[1].append(score)
            elif count > train and count < train + test:
                test_data[0].append(j.vector + i.vector)
                test_data[1].append(score)
            else:
                valid_data[0].append(j.vector + i.vector)
                valid_data[1].append(score)
            count += 1
    '''
    return (process_dataset(train_data, test_data, valid_data), len(train_data[0][0]), testSet)
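# divide_data is called above but not defined in this section. A minimal
# sketch of what it is assumed to do (partition the list of mails into
# train/validation/test slices by the given fractions) follows; the signature
# matches the call site, but the semantics are an assumption.
def divide_data(n, train_frac, valid_frac, mails):
    train_end = int(n * train_frac)
    valid_end = train_end + int(n * valid_frac)
    return mails[:train_end], mails[train_end:valid_end], mails[valid_end:]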
        index += 1

def m_rel_pos(thread):
    # Relative position of each sentence within the whole thread, in [0, 1).
    index = 0.0
    for i in thread.sentences:
        i.m_rel_pos_score = index / float(len(thread.sentences))
        index += 1

def pre_sim(thread, sim_func):
    # Similarity of each sentence to its neighbours: the first sentence is
    # compared to the next one, the last to the previous one, and every other
    # sentence takes the maximum of the two.
    for i in range(len(thread.sentences)):
        if i == 0:
            if (i + 1) < len(thread.sentences):
                thread.sentences[i].seq_sim_score = sim_func(thread.sentences[i], thread.sentences[i + 1])
            else:
                thread.sentences[i].seq_sim_score = 0.0
        elif i == (len(thread.sentences) - 1):
            thread.sentences[i].seq_sim_score = sim_func(thread.sentences[i], thread.sentences[i - 1])
        else:
            thread.sentences[i].seq_sim_score = max(
                sim_func(thread.sentences[i], thread.sentences[i + 1]),
                sim_func(thread.sentences[i], thread.sentences[i - 1]))

'''
def next_sim(thread, sim_func):

def centroid_sim(thread, sim_func):
'''

corpus = 'bc3/bc3corpus.1.0/corpus.xml'
annotation = 'bc3/bc3corpus.1.0/annotation.xml'
mails = parse_bc3(corpus, annotation)
#jvm.start()
calculate_sentence_features(mails)
#jvm.stop()
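# pre_sim expects a similarity function over two sentence objects. A minimal
# sketch of such a function, assuming each sentence carries a numeric feature
# list in .vector (as used in load_data above), is cosine similarity; the
# name cosine_sim and the .vector assumption are illustrative, not the
# original sim_func.
import math

def cosine_sim(a, b):
    # Cosine similarity between the two sentences' feature vectors.
    dot = sum(x * y for x, y in zip(a.vector, b.vector))
    norm_a = math.sqrt(sum(x * x for x in a.vector))
    norm_b = math.sqrt(sum(y * y for y in b.vector))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)

# Hypothetical usage: pre_sim(thread, cosine_sim)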