def load_instance(filename):
    # (Reconstructed function head.) Read whitespace-separated lines where
    # column 0 is the label and columns 1-2 are observation features;
    # blank lines delimit sentences.
    ins_list = []
    f = open(filename)
    label = []
    data = []
    for line in f:
        if line.strip():
            datapoint = line.split()
            label.append(datapoint[0])
            data.append((datapoint[1], datapoint[2]))
            # Register unseen observation features in the shared codebook.
            if not util.feature_codebook.has_label(datapoint[1]):
                util.feature_codebook.add(datapoint[1])
            if not util.feature_codebook.has_label(datapoint[2]):
                util.feature_codebook.add(datapoint[2])
        else:
            # A blank line ends the current sentence.
            ins = Instance(label=label, data=data)
            ins_list.append(ins)
            label = []
            data = []
    if label:
        # Flush the last sentence if the file lacks a trailing blank line.
        ins_list.append(Instance(label=label, data=data))
    f.close()
    return ins_list

hmm = HMM()
train_instance_list = load_instance("np_chunking_wsj_15_18_train")
test_instance_list = load_instance("np_chunking_wsj_20_test")
#hmm.train(train_instance_list)
# Get the confusion matrix.
accuracy = evaluator.split_train_test(hmm, train_instance_list, (0.5, 0.5))
#print accuracy
#hmm.train_semisupervised(test_instance_list)
cm = evaluator.test_classifier(hmm, test_instance_list)
cm.print_out()
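# For reference, a minimal sketch of the codebook interface the loader
# assumes (util.feature_codebook). This is an assumption inferred from the
# has_label/add calls above, not the project's actual implementation: a
# codebook maps each observed feature string to a stable integer index.
class _CodebookSketch(object):
    def __init__(self):
        self._index = {}

    def has_label(self, label):
        return label in self._index

    def add(self, label):
        # Assign the next free index to a newly observed feature string.
        if label not in self._index:
            self._index[label] = len(self._index)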
# classifier = nltk.NaiveBayesClassifier.train(train_set)
# print nltk.classify.accuracy(classifier, test_set)
# #print classifier.labels()
# CM = test_classifier(classifier, test_set)
# CM.print_out()

#----------------------------------
# MaxEnt training
#ME = MaxEnt()
#ME.train(instance_list)
#ME.save("dependency_parsing_classifier.json")
#finished training
#----------------------------------
# Testing: load the trained MaxEnt classifier, decode each test sentence
# with it, and write the resulting parses to parser.conll.
ME = MaxEnt.load("dependency_parsing_classifier.json")
CM = test_classifier(ME, test_instance_list)
CM.print_out()

tranSys = TranSys(transition_codebook)
wfile = open('parser.conll', 'w')
for test_sentence in test_sentence_instances:
    new_sentence = tranSys.decode_parser(ME, test_sentence)
    for element in new_sentence:
        if element[0] != 0:  # skip the artificial root token (id 0)
            #wfile.write('{0:<10}{1:<15}{2:<10}{3:<10}{4:<10}{5:<10}{6:<10}{7:<10}{8:<10}{9:<10}'.format(element[0],element[1],'_',element[2],element[2],'_',element[3],'_','_','_'))
            wfile.write(str(element[0]) + '\t' + str(element[1]) + '\t' + '_' + '\t' +
                        str(element[2]) + '\t' + str(element[2]) + '\t' + '_' + '\t' +
                        str(element[3]) + '\t' + str(element[4]) + '\t' + '_' + '\t' + '_')
            wfile.write("\n")
    wfile.write("\r\n")  # blank line between sentences
wfile.close()
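# For reference: each row written above appears to follow the 10-column
# CoNLL-X layout (ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL,
# PHEAD, PDEPREL), with '_' for fields this parser does not fill in.
# A purely hypothetical example row, for illustration only:
#
#   1	Pierre	_	NNP	NNP	_	2	SBJ	_	_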
def main():
    parser = AP.ArgumentParser(description="A command-line interface for "
                                           "the maximum entropy classifier.")
    parser.add_argument("-d", "--datafile", action="store",
                        default="blog-gender-dataset.txt",
                        help="specify the input data file (default: blog-gender-dataset.txt)")
    parser.add_argument("-g", "--gaussian_prior", dest="gpv", action="store",
                        help="specify the Gaussian prior variance")
    parser.add_argument("-m", "--mode", dest="mode", action="store", default="train",
                        help="run as train, train/test, exp(eriment)1, exp(eriment)2, exp(eriment)3")
    parser.add_argument("-s", "--save", dest="outfile", action="store", default=None,
                        help="specify output file to serialize trained classifier")
    parser.add_argument("-l", "--load", dest="infile", action="store", default=None,
                        help="specify input file to load trained classifier")
    parser.add_argument("-i", "--instances", dest="instances", action="store", default=None,
                        help="load preprocessed instances instead of data")
    parser.add_argument("-f", "--featurefile", dest="featfile", action="store", default=None,
                        help="serialize preprocessed instances")
    args = parser.parse_args()  # parse argument structure

    # Begin running the classifier.
    try:
        print "Importing data ... "
        if args.instances:  # load serialized features
            instance_list = cPickle.load(open(args.instances, 'rb'))
            print "Done."
        else:  # create features from the raw data
            data_list = import_data(args.datafile)
            print "Done.\nExtracting features ... "
            instance_list = []
            l = len(data_list)
            for i, (label, post) in enumerate(data_list):
                print "Featurizing string %d of %d ... " % (i, l)
                instance_list.append(Instance(label=label, data=featurize(post)))
            print "Done."
            if args.featfile:  # serialize instance_list
                with open(args.featfile, 'wb') as outf:
                    cPickle.dump(instance_list, outf)

        piv1 = int(.7 * len(instance_list))  # split training from test
        piv2 = int(.9 * len(instance_list))  # split test from dev
        training, test, dev = (instance_list[:piv1], instance_list[piv1:piv2],
                               instance_list[piv2:])

        if args.infile:  # load a previously trained classifier
            with open(args.infile, 'rb') as inf:
                me_classifier = MaxEnt.from_dict(cPickle.load(inf))
        else:
            # Create a new classifier; eval converts the flag string
            # (e.g. "0.5" or "numpy.Infinity") into a number.
            me_classifier = MaxEnt(eval(args.gpv) if args.gpv else None)

        # Experiment one.
        if re.search(r'exp.*1', args.mode):
            if not args.infile:
                print "Training classifier ... "
                me_classifier.train(training)
                print "Done.\nTesting classification ... "
            if args.outfile:
                with open(args.outfile, 'wb') as outf:
                    cPickle.dump(me_classifier.to_dict(), outf)
            for data in [training, test]:
                test_classifier(me_classifier, data).print_out()

        # Experiment two; run in batch as for i in {.05, ..., numpy.Infinity},
        # invoking with -s $i.classifier.
        elif re.search(r'exp.*2', args.mode):
            #for value in [.05, 0.1, .5, 1, 3, 5, 10, numpy.Infinity]:
            #for value in [10, numpy.Infinity]:
            #me_classifier = MaxEnt(value)
            print "Training classifier with Gaussian prior variance %s ..." \
                % str(me_classifier.gaussian_prior_variance)
            me_classifier.train(training)
            print "Done. Testing classifier over dev set ..."
            test_classifier(me_classifier, dev).print_out()
            print "Done. Testing classifier over test set ..."
            test_classifier(me_classifier, test).print_out()
            print "Done.\n\n\n"

        # Experiment three; run with -l 1.classifier.
        elif re.search(r'exp.*3', args.mode):
            if not args.infile:
                print "Training Maximum Entropy classifier ... "
                me_classifier.train(training)
                print "Done."
            nb_classifier = NaiveBayes()
            print "Training Naive Bayes classifier ... "
" nb_classifier.train(training) print "Done.\nTesting Maximum Entropy over test set ... " test_classifier(me_classifier, test).print_out() print "Done.\nTesting Naive Bayes over test set ... " test_classifier(nb_classifier, test).print_out() if args.outfile: #serialize trained classifier with open(args.outfile, 'wb') as outf: cPickle.dump(me_classifier.to_dict(), outf) except: #something is WROOOONG parser.print_help() raise