Example #1
import os
import random
import nltk

def load_instance(filepath):
	# Reconstructed header and per-file loop: the scraped snippet begins mid-function.
	# util, Instance, and NaiveBayes come from the project's own modules (imports not shown).
	ins_list = []
	for filename in os.listdir(filepath):
		f = open(os.path.join(filepath, filename))
		tokens = []
		for l in f:
			tokens += nltk.regexp_tokenize(l, pattern=r"\w+")
		data = util.del_dup(tokens)  # project helper: drop duplicate tokens
		if filepath[-4:-1] == 'neg':
			ins = Instance(filename, 'negative', data, tokens)
		elif filepath[-4:-1] == 'pos':
			ins = Instance(filename, 'positive', data, tokens)
		else:
			raise Exception("Wrong path!")
		ins_list.append(ins)
		f.close()  # close each review file inside the loop
	return ins_list
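util.del_dup is never shown in the example, so the sketch below is only a guess at an order-preserving de-duplicator of that shape; both the name and the behavior are assumptions, not the project's actual code.

def del_dup(tokens):
	# Hypothetical stand-in for util.del_dup: keep the first occurrence
	# of each token, preserving order.
	seen = set()
	unique = []
	for tok in tokens:
		if tok not in seen:
			seen.add(tok)
			unique.append(tok)
	return unique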

instance_list = load_instance('txt_sentoken/neg/')
instance_list += load_instance('txt_sentoken/pos/')
# randomly choose the train/test split proportions
p0 = random.random()
p1 = 1 - p0
prop = [p0, p1]
#prop = [0.5, 0.5]  # fixed 50/50 alternative

# nb, ID, and limits are defined earlier in the original script (not shown here)
accuracy = split_train_test(nb, instance_list, prop, ID, limits)

nb.save("movie_review_classifier.json")
nb1 = NaiveBayes.load("movie_review_classifier.json")

# re-evaluate with the reloaded classifier to check that persistence round-trips
accuracy = split_train_test(nb1, instance_list, prop, ID, limits)

f = open('results/results' + repr(ID) + '.txt', 'a')  # append this run's result
f.write(repr(p0) + ' ' + repr(accuracy) + '\n')
f.close()
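split_train_test comes from the project's evaluator and is not shown either. As a rough sketch under stated assumptions (a classifier with train() and classify() methods, instances with a label attribute; the real helper's extra ID and limits arguments are not modeled), such a helper typically shuffles the instances, trains on the first fraction, and reports accuracy on the rest:

import random

def split_train_test(classifier, instances, prop):
	# Hedged sketch only, not the project's evaluator.
	random.shuffle(instances)
	cut = int(len(instances) * prop[0])
	train, test = instances[:cut], instances[cut:]
	classifier.train(train)  # assumed classifier API
	correct = sum(1 for ins in test if classifier.classify(ins) == ins.label)
	return correct / float(len(test))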
Example #2
def load_instance(filename):
	# Reconstructed header: the scraped snippet begins mid-function.
	f = open(filename)
	ins_list = []
	label = []
	data = []
	for lines in f:
		if lines.strip():
			# non-blank line: a tag label followed by a two-field observation
			datapoint = lines.split()
			label.append(datapoint[0])
			data.append((datapoint[1], datapoint[2]))
			# register unseen observation fields in the shared feature codebook
			if not util.feature_codebook.has_label(datapoint[1]):
				util.feature_codebook.add(datapoint[1])
			if not util.feature_codebook.has_label(datapoint[2]):
				util.feature_codebook.add(datapoint[2])
		else:
			# a blank line ends a sentence: flush it as one Instance
			ins = Instance(label=label, data=data)
			ins_list.append(ins)
			label = []
			data = []
	f.close()
	return ins_list
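The exact columns are not documented in the snippet; from the indexing, each non-blank line carries the chunk label first and a two-field observation after it (plausibly word and POS tag), with a blank line terminating each sentence. A tiny illustration with a hypothetical file name and made-up tags:

# Hypothetical input matching the loader's flush-on-blank-line behavior:
# "chunk-label word pos-tag" per line, blank line between sentences.
sample = ("B-NP The DT\nI-NP dog NN\nO barks VBZ\n\n"
          "B-NP A DT\nI-NP cat NN\nO sleeps VBZ\n\n")
with open("np_chunking_sample", "w") as out:
	out.write(sample)
instances = load_instance("np_chunking_sample")
print(len(instances))  # 2 sentences -> 2 Instances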

hmm = HMM()
train_instance_list = load_instance("np_chunking_wsj_15_18_train")
test_instance_list = load_instance("np_chunking_wsj_20_test")
#hmm.train(train_instance_list)
# evaluate with a 50/50 split of the training data
accuracy = evaluator.split_train_test(hmm, train_instance_list, (0.5, 0.5))
#print(accuracy)
#hmm.train_semisupervised(test_instance_list)
# build the confusion matrix on the held-out test file
cm = evaluator.test_classifier(hmm, test_instance_list)
cm.print_out()
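evaluator.test_classifier and the matrix's print_out are project-specific and not shown. As an assumption-laden sketch, the tally such an evaluator typically builds for a sequence labeler is a per-token count of (gold, predicted) tag pairs:

from collections import Counter

def confusion_counts(model, instances):
	# Hedged sketch, not the project's evaluator: classify each instance
	# and tally per-token (gold tag, predicted tag) pairs.
	cm = Counter()
	for ins in instances:
		predicted = model.classify(ins)  # assumed: one predicted tag per token
		for gold, pred in zip(ins.label, predicted):
			cm[(gold, pred)] += 1
	return cm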