Example #1
def load_instance(filename):
	"""Read blank-line-separated blocks of 'label feature1 feature2' lines,
	building one Instance per block."""
	ins_list = []
	label = []
	data = []
	with open(filename) as f:
		for line in f:
			if line.strip():
				datapoint = line.split()
				label.append(datapoint[0])
				data.append((datapoint[1], datapoint[2]))
				# register unseen features in the codebook
				if not util.feature_codebook.has_label(datapoint[1]):
					util.feature_codebook.add(datapoint[1])
				if not util.feature_codebook.has_label(datapoint[2]):
					util.feature_codebook.add(datapoint[2])
			elif label:  # a blank line closes the current instance
				ins_list.append(Instance(label=label, data=data))
				label = []
				data = []
	if label:  # flush the final instance if the file lacks a trailing blank line
		ins_list.append(Instance(label=label, data=data))
	return ins_list
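
For reference, load_instance expects three whitespace-separated fields per line (the label first, then two feature values) and a blank line between instances. A hypothetical input fragment, with tags invented purely for illustration:

	B-NP	The	DT
	I-NP	dog	NN
	O	barked	VBD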

hmm = HMM()
train_instance_list = load_instance("np_chunking_wsj_15_18_train")
test_instance_list = load_instance("np_chunking_wsj_20_test")
#hmm.train(train_instance_list)
# evaluate accuracy with a 50/50 train/test split of the training data
accuracy = evaluator.split_train_test(hmm, train_instance_list, (0.5, 0.5))
#print accuracy
#hmm.train_semisupervised(test_instance_list)
# get the confusion matrix on the held-out test set
cm = evaluator.test_classifier(hmm, test_instance_list)
cm.print_out()


Example #2
# classifier = nltk.NaiveBayesClassifier.train(train_set) 
# print nltk.classify.accuracy(classifier, test_set)
# #print classifier.labels()
# CM = test_classifier(classifier,test_set)
# CM.print_out()

#----------------------------------  
#MaxEnt training
#ME = MaxEnt()
#ME.train(instance_list)
#ME.save("dependency_parsing_classifier.json")
#finish training
#----------------------------------
#testing: load the trained MaxEnt classifier, decode each test sentence with it, and write the parses to parser.conll
ME = MaxEnt.load("dependency_parsing_classifier.json")
CM = test_classifier(ME,test_instance_list)
CM.print_out()
tranSys = TranSys(transition_codebook)
wfile = open('parser.conll','w')
for test_sentence in test_sentence_instances:
	new_sentence = tranSys.decode_parser(ME, test_sentence)
	for element in new_sentence:
		if element[0] != 0:  # skip the artificial root token (id 0)
			# ten tab-separated CoNLL-X columns: id, form, lemma, cpos, pos, feats, head, deprel, phead, pdeprel
			fields = [element[0], element[1], '_', element[2], element[2], '_', element[3], element[4], '_', '_']
			wfile.write('\t'.join(str(field) for field in fields))
			wfile.write("\n")
	wfile.write("\n")  # blank line between sentences

wfile.close()
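
Given the fields written above, each row of parser.conll carries ten tab-separated columns in CoNLL-X order (id, form, lemma, coarse POS, fine POS, feats, head, deprel, phead, pdeprel), with '_' for the unused ones. A hypothetical row:

	1	John	_	NNP	NNP	_	2	SBJ	_	_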

def main():
	parser = AP.ArgumentParser(description = "A command-line interface for " \
		"the maximum entropy classifier.")
	parser.add_argument("-d", "--datafile", action = "store", default = "blog-gender-dataset.txt", \
		help = "specify the input data file (default: ")
	parser.add_argument("-g", "--gaussian_prior", dest = "gpv", action = "store", \
		help = "specify the Gaussian prior variance")
	parser.add_argument("-m", "--mode", dest = "mode", action = "store", default = "train", \
		help = "run as train, train/ test, exp(eriment)1, exp(eriment)2, exp(eriment)3")
	parser.add_argument("-s", "--save", dest = "outfile", action = "store", default = None, \
		help = "specify output file to serialize trained classifier")
	parser.add_argument("-l", "--load", dest = "infile", action = "store", default = None, \
		help = "specify input file to load trained classifier")
	parser.add_argument("-i", "--instances", dest = "instances", action = "store", default = None, \
		help = "load preprocessed instances instead of data")
	parser.add_argument("-f", "--featurefile", dest = "featfile", action = "store", default = None, \
		help = "serialize preprocessed instances")	
	
	args = parser.parse_args() #parse argument structure
	
	#begin running classifier
	try:
		print "Importing data ... "
		if args.instances: #get serialized features
			instance_list = cPickle.load(open(args.instances, 'rb'))
			print "Done."
		else: #create features from data
			data_list = import_data(args.datafile)
			print "Done.\nExtracting features ... "
			instance_list = []
			l = len(data_list)
			for i, (label, post) in enumerate(data_list):
				print "Featurizing string %d of %d ... " % (i, l)
				instance_list.append(Instance(label = label, data = featurize(post)))
			print "Done."
		if args.featfile: #serialize instance_list
			with open(args.featfile, 'wb') as outf:
				cPickle.dump(instance_list, outf)
		piv1 = int(.7 * len(instance_list)) #first 70% is training
		piv2 = int(.9 * len(instance_list)) #next 20% is test, final 10% is dev
		training, test, dev = instance_list[:piv1], instance_list[piv1:piv2], \
			instance_list[piv2:]
			
		if args.infile: #load a previously trained classifier
			with open(args.infile, 'rb') as inf:
				me_classifier = MaxEnt.from_dict(cPickle.load(inf))
		else: #create a new classifier with the requested prior variance
			gpv = float(args.gpv) if args.gpv else None
			me_classifier = MaxEnt(gpv)

		#experiment one
		if re.search(r'exp.*1', args.mode):
		
			if not args.infile:
				print "Training classifier ... "
				me_classifier.train(training)
				print "Done.\nTesting classification ... "
			if args.outfile:
				with open(args.outfile, 'wb') as outf:
					cPickle.dump(me_classifier.to_dict(), outf)
		
			for data in [training, test]:
				test_classifier(me_classifier, data).print_out()
				
		#experiment two; run in batch as for i in {.05,...,numpy.Infinity} ...
		#run with -s $i.classifier
		elif re.search(r'exp.*2', args.mode):
			#for value in [.05, 0.1, .5, 1, 3, 5, 10, numpy.Infinity]:
			#for value in [10, numpy.Infinity]:
			#me_classifier = MaxEnt(value)
			print "Training classifier with Gaussian prior variance %s ..." \
				% str(me_classifier.gaussian_prior_variance)
			me_classifier.train(training)
			print "Done. Testing classifier over dev set ..."
			test_classifier(me_classifier, dev).print_out()
			print "Done. Testing classifier over test set ..."
			test_classifier(me_classifier, test).print_out()
			print "Done.\n\n\n"
			
		#experiment three; run with -l 1.classifier
		elif re.search(r'exp.*3', args.mode):
			if not args.infile:
				print "Training Maximum Entropy classifier ... "
				me_classifier.train(training)
				print "Done."
			nb_classifier = NaiveBayes()
			print "Training Naive Bayes classifier ... "
			nb_classifier.train(training)
			print "Done.\nTesting Maximum Entropy over test set ... "
			test_classifier(me_classifier, test).print_out()
			print "Done.\nTesting Naive Bayes over test set ... "
			test_classifier(nb_classifier, test).print_out()
			
		if args.outfile: #serialize trained classifier
			with open(args.outfile, 'wb') as outf:
				cPickle.dump(me_classifier.to_dict(), outf)

	except Exception: #on any error, show usage before re-raising
		parser.print_help()
		raise
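
Assuming this module is meant to run as a script, the standard entry-point guard would invoke main():

if __name__ == "__main__":
	main()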