Ejemplo n.º 1
0
def load_data(dataset):
    """Build train/test/validation sample sets from the BC3 corpus.

    Parses the corpus and annotation XML files, attaches the vectors read
    from ``bc3_vectors.txt`` and walks every entry of every mail's ``text``
    in order.  The first 1000 entries form the training split, the next 500
    the test split and everything after that the validation split.

    Each sample's features are ``entry.vector + mail.vector`` and its label
    is 1 when ``entry.score > 0`` and 0 otherwise.

    Args:
        dataset: Unused; kept so existing callers keep working.

    Returns:
        A 2-tuple ``(processed, n_features)`` where ``processed`` is the
        result of ``process_dataset(train, test, valid)`` and ``n_features``
        is the length of the first training feature vector.

    Raises:
        IndexError: If no training sample was collected.
    """
    train_size = 1000
    test_size = 500

    # Each split is a pair: [feature vectors, labels].
    train_data = [[], []]
    test_data = [[], []]
    valid_data = [[], []]

    vectors = readVectors('bc3_vectors.txt')
    corpus = 'bc3/bc3corpus.1.0/corpus.xml'
    annotation = 'bc3/bc3corpus.1.0/annotation.xml'
    mails = parse_bc3(corpus, annotation)
    assignVectors(mails, vectors)

    count = 0
    for mail in mails:
        for entry in mail.text:
            label = 1 if entry.score > 0 else 0

            if count < train_size:
                target = train_data
            elif count < train_size + test_size:
                # The previous test was ``count > train``, which sent the
                # sample at index ``train`` to the validation split and left
                # the test split one item short.
                target = test_data
            else:
                target = valid_data

            target[0].append(entry.vector + mail.vector)
            target[1].append(label)
            count += 1

    return (process_dataset(train_data, test_data, valid_data),
            len(train_data[0][0]))
Ejemplo n.º 2
0
def load_data(datasets):
    """Build train/test/validation sample sets with four score classes.

    Parses the BC3 corpus and annotation XML files, attaches the vectors
    read from ``bc3_vector_with_subject`` and splits the mails with
    ``divide_data(len(mails), 0.6, 0.2, mails)``.  Every entry of every
    mail's ``text`` becomes one sample whose features are
    ``entry.vector + subject + thread`` and whose label is a class in 0..3
    derived from ``entry.score``.

    Args:
        datasets: Unused; kept so existing callers keep working.

    Returns:
        A 3-tuple ``(processed, n_features, testSet)`` where ``processed``
        is the result of ``process_dataset(train, test, valid)``,
        ``n_features`` is the length of the first training feature vector
        and ``testSet`` is the test portion returned by ``divide_data``.

    Raises:
        IndexError: If the training split produced no samples.
    """
    # Each split is a pair: [feature vectors, labels].
    train_data = [[], []]
    test_data = [[], []]
    valid_data = [[], []]

    vectors = readVectors('bc3_vector_with_subject')
    corpus = 'bc3/bc3corpus.1.0/corpus.xml'
    annotation = 'bc3/bc3corpus.1.0/annotation.xml'
    mails = parse_bc3(corpus, annotation)
    assignVectors(mails, vectors)

    trainSet, validSet, testSet = divide_data(len(mails), 0.6, 0.2, mails)

    def score_to_class(score):
        """Map a raw score onto one of the four class labels."""
        # NOTE(review): scores strictly between 0 and 0.32, negative scores
        # and scores >= 0.99 all land in class 3.  Presumably the scores only
        # take a few discrete values so the gaps never occur -- confirm.
        if 0.32 <= score < 0.65:
            return 1
        if 0.65 <= score < 0.99:
            return 2
        if score == 0:
            return 0
        return 3

    def collect_samples(mail_set, output):
        """Append one (features, label) sample per text entry to output."""
        for mail in mail_set:
            for entry in mail.text:
                label = score_to_class(entry.score)
                # The leading component of ``entry.index`` ("N.M") is
                # 1-based; it selects the subject/thread feature rows.
                position = int(entry.index.split('.')[0]) - 1
                subject = mail.subject_feature[position]
                thread = mail.thread_feature[position]
                output[0].append(entry.vector + subject + thread)
                output[1].append(label)

    collect_samples(trainSet, train_data)
    collect_samples(validSet, valid_data)
    collect_samples(testSet, test_data)

    return (process_dataset(train_data, test_data, valid_data),
            len(train_data[0][0]),
            testSet)
		index += 1
def m_rel_pos(thread):
    """Store each sentence's relative position within the thread.

    Every item of ``thread.sentences`` gets an ``m_rel_pos_score`` attribute
    equal to its zero-based position divided by the number of sentences, so
    the first sentence scores 0.0 and the last scores just under 1.0.

    Args:
        thread: Object exposing a ``sentences`` sequence of mutable objects.

    Returns:
        None; the sentences are updated in place.
    """
    for position, sentence in enumerate(thread.sentences):
        total = float(len(thread.sentences))
        sentence.m_rel_pos_score = float(position) / total


def pre_sim(thread, sim_func):
    """Score each sentence by its similarity to its adjacent sentences.

    Every item of ``thread.sentences`` gets a ``seq_sim_score`` attribute:
    the larger of ``sim_func(sentence, next)`` and
    ``sim_func(sentence, previous)``.  The first and last sentences only
    have one neighbour, and a thread with a single sentence scores 0.0.

    Args:
        thread: Object exposing a ``sentences`` sequence of mutable objects.
        sim_func: Callable taking two sentences and returning a comparable
            similarity value.

    Returns:
        None; the sentences are updated in place.
    """
    sentences = thread.sentences
    last_position = len(sentences) - 1

    for position, current in enumerate(sentences):
        # Following neighbour first, then preceding: keeps the order in
        # which sim_func is invoked and how ties are resolved by max().
        neighbours = []
        if position < last_position:
            neighbours.append(sentences[position + 1])
        if position > 0:
            neighbours.append(sentences[position - 1])

        if not neighbours:
            current.seq_sim_score = 0.0
            continue

        current.seq_sim_score = max(
            [sim_func(current, other) for other in neighbours]
        )
'''
def next_sim(thread,sim_func):

def centroid_sim(thread,sim_func):
'''
# Script section, executed at import time: parse the BC3 corpus together
# with its annotation file and pass the parsed result to
# calculate_sentence_features.
corpus = 'bc3/bc3corpus.1.0/corpus.xml'
annotation = 'bc3/bc3corpus.1.0/annotation.xml'
mails = parse_bc3(corpus,annotation)
# NOTE(review): the jvm.start()/jvm.stop() calls around the feature
# computation are disabled; presumably a JVM-backed dependency was needed at
# some point -- confirm before re-enabling.
#jvm.start()
calculate_sentence_features(mails)
#jvm.stop()