tokens += nltk.regexp_tokenize(l,pattern="\w+") data = util.del_dup(tokens) if filepath[-4:-1] == 'neg': ins = Instance(filename,'negative',data,tokens) elif filepath[-4:-1] == 'pos': ins = Instance(filename,'positive',data,tokens) else: raise Exception, "Wrong path!" ins_list.append(ins) f.close() return ins_list instance_list = load_instance('txt_sentoken/neg/') instance_list += load_instance('txt_sentoken/pos/') #random split train-test p0 = random.random() p1 = 1-p0 prop = [p0,p1] #prop = [0.5,0.5] accuracy = split_train_test(nb,instance_list,prop,ID,limits) nb.save("movie_review_classifier.json") nb1 = NaiveBayes.load("movie_review_classifier.json") accuracy = split_train_test(nb,instance_list,prop,ID,limits) f = open('results/results'+repr(ID)+'.txt','a+w') f.write(repr(p0)+' '+repr(accuracy) + '\n') f.close()
label = [] data = [] for lines in f: if lines.strip(): datapoint = lines.split() label.append(datapoint[0]) data.append((datapoint[1],datapoint[2])) if not util.feature_codebook.has_label(datapoint[1]): util.feature_codebook.add(datapoint[1]) if not util.feature_codebook.has_label(datapoint[2]): util.feature_codebook.add(datapoint[2]) else: ins = Instance(label=label,data=data) ins_list.append(ins) label = [] data = [] return ins_list hmm = HMM() train_instance_list = load_instance("np_chunking_wsj_15_18_train") test_instance_list = load_instance("np_chunking_wsj_20_test") #hmm.train(train_instance_list) #get the confusion matrix accuracy=evaluator.split_train_test(hmm,train_instance_list,(0.5,0.5)) #print accuracy #hmm.train_semisupervised(test_instance_list) cm = evaluator.test_classifier(hmm,test_instance_list) cm.print_out()