class SKClassifier:
    """Thin wrapper around nltk's SklearnClassifier for a few sklearn models."""

    def __init__(self, cls='SVC'):
        """Build the wrapped classifier.

        cls: one of 'SVC', 'LogisticRegression', 'BernoulliNB'.
             Unknown names fall back to SVC.  The original `{...}[cls]`
             lookup raised KeyError for unknown names, so its
             `if not self.classifier` fallback was unreachable dead code;
             `.get` makes the documented fallback real.
        """
        estimators = {
            'SVC': SVC,
            'LogisticRegression': LogisticRegression,
            'BernoulliNB': BernoulliNB,
        }
        self.classifier = SklearnClassifier(estimators.get(cls, SVC)())

    def train(self, trainset):
        """Fit on a list of (featureset, label) pairs."""
        self.classifier.train(trainset)

    def test(self, tagged, featuresets):
        """Classify featuresets and return accuracy against the gold `tagged` labels."""
        predict = self.classifier.classify_many(featuresets)
        print(predict)
        return accuracy_score(tagged, predict)

    def classify(self, featureset):
        """Classify a single featureset."""
        return self.classifier.classify(featureset)

    def classify_many(self, featuresets):
        """Classify a list of featuresets."""
        return self.classifier.classify_many(featuresets)
class chatBot(object):
    """Dialogue-act chatbot trained on the NLTK NPS chat corpus.

    Each incoming message is classified into one of 15 dialogue-act
    categories; `mapper` redirects the predicted category to the category a
    canned response is drawn from (e.g. a ynQuestion gets a yAnswer).
    Call trainSet() before talk().
    """

    def __init__(self):
        self.posts = nltk.corpus.nps_chat.xml_posts()
        self.categories = ['Emotion', 'ynQuestion', 'yAnswer', 'Continuer',
                           'whQuestion', 'System', 'Accept', 'Clarify',
                           'Emphasis', 'nAnswer', 'Greet', 'Statement',
                           'Reject', 'Bye', 'Other']
        # predicted-category index -> index of the category we respond FROM
        self.mapper = [0, 2, 6, 3, 11, 5, 8, 1, 8, 3, 10, 11, 13, 13, 13]
        self.responses = {}
        self.featuresets = []
        self.train = []
        self.test = []
        self.testSet = []
        self.testSetClass = []
        self.classif = SklearnClassifier(LinearSVC())
        for i in range(0, 15):
            self.responses[i] = []
        for post in self.posts:
            cls_index = self.categories.index(post.get('class'))
            self.featuresets.append((self.tokenize(post.text), cls_index))
            self.temp = self.responses[cls_index]
            self.temp.append(post.text)

    def tokenize(self, sentence):
        """Extracts a bag-of-words feature dict from a message."""
        features = {}
        tokens = nltk.word_tokenize(sentence)
        for t in tokens:
            features['contains(%s)' % t.lower()] = True
        return features

    def talk(self):
        """Interactive loop: classify user input, answer from the mapped category."""
        while 1:
            inp = raw_input("YOU: ")
            features = self.tokenize(inp)
            # NOTE(review): passes a single feature dict to classify_many;
            # relies on DictVectorizer accepting a lone mapping — confirm.
            pp = self.classif.classify_many(features)
            pp = int(pp[0])
            m = self.mapper[pp]
            r = self.responses[m]
            # randint is INCLUSIVE on both ends; the original randint(0, len(r))
            # could index one past the end of the response list
            val = randint(0, len(r) - 1)
            print("BOT: " + r[val])

    def trainSet(self):
        """Shuffle, split 90/10, train the classifier, and pre-classify the held-out split."""
        shuffle(self.featuresets)
        size = int(len(self.featuresets) * .1)  # 10% is used for the test set
        self.train = self.featuresets[size:]
        self.test = self.featuresets[:size]
        self.classif.train(self.train)
        self.testSet = []
        self.testSetClass = []
        for i in self.test:
            self.testSet.append(i[0])
            self.testSetClass.append(i[1])
        self.batch = self.classif.classify_many(self.testSet)

    def statistics(self):
        """Print per-category precision/recall/F1 for the held-out split."""
        print(classification_report(self.testSetClass, self.batch,
                                    labels=list(set(self.testSetClass)),
                                    target_names=self.categories))
def ClassAccuracy(classifier, train_set, test_set):
    """Fit a scikit-learn estimator through nltk's wrapper and score it.

    Args:
        classifier: any sklearn estimator, e.g. BernoulliNB().
        train_set: labelled, feature-extracted training data,
            e.g. [({'bad': -5, 'good': 5, ...}, '+1')].
        test_set: labelled test data in the same form.

    Returns:
        Accuracy of the trained classifier on the test set.
    """
    wrapped = SklearnClassifier(classifier)
    wrapped.train(train_set)
    gold = [label for _, label in test_set]
    predictions = wrapped.classify_many([feats for feats, _ in test_set])
    return accuracy_score(gold, predictions)
def clf_score(classifier):
    """Wrap a scikit-learn estimator for nltk, fit it on the module-level
    train_set, and return prediction accuracy on `test` against tag_test."""
    wrapped = SklearnClassifier(classifier)
    wrapped.train(train_set)
    predictions = wrapped.classify_many(test)
    return accuracy_score(tag_test, predictions)
def score(classifier):
    """Train `classifier` on the module-level training set, then report
    precision/recall/F1 for the pos and neg test sets.

    Writes the metrics to the module-level `savePath` file and returns
    (pos_precision, neg_precision, pos_recall, neg_recall).
    """
    model = SklearnClassifier(classifier)  # sklearn estimator behind nltk's interface
    model.train(train)
    pred_pos = model.classify_many(data_pos)  # predictions for the positive test data
    pred_neg = model.classify_many(data_neg)  # predictions for the negative test data
    pos_hits = sum(1 for i in range(pos_len) if pred_pos[i] == tag_pos[i])
    neg_hits = sum(1 for i in range(neg_len) if pred_neg[i] == tag_neg[i])
    # predicted-positive total = true positives + negatives misread as positive
    pos_precision = pos_hits / (pos_hits + neg_len - neg_hits)
    pos_recall = pos_hits / pos_len
    neg_precision = neg_hits / (neg_hits + pos_len - pos_hits)
    neg_recall = neg_hits / neg_len
    pos_F = 2 * pos_precision * pos_recall / (pos_precision + pos_recall)
    neg_F = 2 * neg_precision * neg_recall / (neg_precision + neg_recall)
    print(pos_F, neg_F)
    savePath.write('正面情感准确率为: %.2f\n' % pos_precision)
    savePath.write('负面情感准确率为: %.2f\n' % neg_precision)
    savePath.write('正面情感召回率为: %.2f\n' % pos_recall)
    savePath.write('负面情感召回率为: %.2f\n' % neg_recall)
    savePath.write('正面F-Measure值为: %.2f\n' % pos_F)
    savePath.write('负面F-Measure值为: %.2f\n' % neg_F)
    return pos_precision, neg_precision, pos_recall, neg_recall
def score(classifier):
    """Train on `train` and return precision on the 'neg' class:
    correct negative predictions / all negative predictions."""
    model = SklearnClassifier(classifier)
    model.train(train)
    pred = model.classify_many(data)
    hit = 0      # predictions that were 'neg' AND correct
    claimed = 0  # all predictions that were 'neg'
    for idx, guess in enumerate(pred):
        if guess == tag[idx] and guess == 'neg':
            hit += 1
        if guess == 'neg':
            claimed += 1
    return float(hit) / float(claimed)
def score(trainset, testset, classifier):
    """Train the given sklearn estimator via nltk and return accuracy on testset."""
    model = SklearnClassifier(classifier)
    model._vectorizer.sort = False  # keep feature order stable across calls
    model.train(trainset)
    features, gold = zip(*testset)
    return accuracy_score(gold, model.classify_many(features))
def buildAndTrainClassifier(self, X, Y):
    """Run 10-fold CV training of a MultinomialNB over (X, Y) and return the model.

    NOTE(review): metrics are printed for the LAST fold only and the last
    fold's model is returned — confirm that is intended.
    """
    print('Building and Training Classifier')
    n_folds = 10
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold = 1
    # hoisted: converting X/Y once instead of twice per fold
    X_arr = np.array(X)
    Y_arr = np.array(Y)
    for train, test in kf.split(X):
        print('Training on cross validation set ', fold)
        train_X = X_arr[train]
        test_X = X_arr[test]
        train_y = Y_arr[train]
        test_y = Y_arr[test]
        print('Training size: ', len(train_y))
        print('Test size: ', len(test_y))
        labeled_features = list(zip(train_X, train_y))
        model = SklearnClassifier(MultinomialNB()).train(labeled_features)
        predicted = model.classify_many(test_X)
        fold += 1
    # sklearn metrics take (y_true, y_pred); the original passed them swapped,
    # which silently reports precision as recall and vice versa
    print('Confusion matrix =', confusion_matrix(test_y, predicted))
    print('Precision score =', precision_score(test_y, predicted, average=None))
    print('Recall score =', recall_score(test_y, predicted, average=None))
    print('Accuracy score =', accuracy_score(test_y, predicted))
    print('Training score =', f1_score(test_y, predicted, average=None))
    return model
def SVM(training_set, test_set):
    """Train a LinearSVC, report accuracy, and return
    (runTrained, accuracy, predictedLabels, trueLabels)."""
    classifier = SklearnClassifier(LinearSVC())
    print("Training a new SVM classifier")
    classifier.train(training_set)
    print("Accuracy of SVM in training:", nltk.classify.accuracy(classifier, test_set))

    accuracy = nltk.classify.accuracy(classifier, test_set)
    trueLabels = [label for _, label in test_set]
    predictedLabels = classifier.classify_many([feats for feats, _ in test_set])

    def runTrained(test_set, hasTags=False):
        """Re-run the trained classifier; test_set may carry gold tags (hasTags)."""
        if hasTags:
            unlabelled = [feats for feats, _ in test_set]
            acc = nltk.classify.accuracy(classifier, test_set)
            print("Accuracy:", acc)
            paired = list(zip(unlabelled, classifier.classify_many(unlabelled)))
            return (paired, acc)
        else:
            predictions = classifier.classify_many(test_set)
            return list(zip(test_set, predictions))

    return (runTrained, accuracy, predictedLabels, trueLabels)
def learn_model(data, target):
    """Select the best words, split 90/10, featurise with jieba tokens, train
    a MultinomialNB, evaluate, and return (classifier, bestwords)."""
    bestwords = best_of_words(data, target)
    # 90% training, 10% test split
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(
        data, target, test_size=0.1, random_state=43)

    def featurize(text):
        # presence features for jieba tokens that survive the bestwords filter
        tokens = jieba.cut(text, cut_all=False)
        return dict((word, True) for word in tokens if word in bestwords)

    train_feature = [[featurize(doc), label]
                     for doc, label in zip(data_train, target_train)]
    test_feature = [featurize(doc) for doc in data_test]

    classifier = SklearnClassifier(MultinomialNB())
    classifier.train(train_feature)
    predicted = classifier.classify_many(test_feature)
    evaluate_model(target_test, predicted)
    return classifier, bestwords
def build_classifier_score(train_set, test_set, classifier):
    """Train on train_set and return accuracy over test_set (feature, tag) pairs."""
    features, gold = zip(*test_set)
    model = SklearnClassifier(classifier)
    model.train(train_set)
    return accuracy_score(gold, model.classify_many(features))
def SVM(training_set, test_set):
    """Fit a linear SVM on training_set and evaluate it on test_set.

    Returns (runTrained, accuracy, predictedLabels, trueLabels) where
    runTrained re-applies the trained model to new data."""
    svm = SklearnClassifier(LinearSVC())
    print("Training a new SVM classifier")
    svm.train(training_set)
    print("Accuracy of SVM in training:", nltk.classify.accuracy(svm, test_set))

    accuracy = nltk.classify.accuracy(svm, test_set)
    trueLabels = [gold for _, gold in test_set]
    predictedLabels = svm.classify_many([sample for sample, _ in test_set])

    def runTrained(test_set, hasTags=False):
        """Classify test_set with the already-trained SVM.

        hasTags=True means test_set is (features, tag) pairs; accuracy is
        printed and returned alongside the (features, prediction) pairs."""
        if not hasTags:
            predictions = svm.classify_many(test_set)
            return list(zip(test_set, predictions))
        stripped = [sample for sample, _ in test_set]
        acc = nltk.classify.accuracy(svm, test_set)
        print("Accuracy:", acc)
        predictions = svm.classify_many(stripped)
        return (list(zip(stripped, predictions)), acc)

    return (runTrained, accuracy, predictedLabels, trueLabels)
def score(trainset, testset, classifier):
    """Wrap `classifier` for nltk, train on trainset, and return accuracy
    of its predictions over testset's (features, tag) pairs."""
    wrapped = SklearnClassifier(classifier)
    wrapped._vectorizer.sort = False  # preserve insertion order of features
    wrapped.train(trainset)
    samples, gold_tags = zip(*testset)
    predictions = wrapped.classify_many(samples)
    return accuracy_score(gold_tags, predictions)
def get_recall(classifier):
    """Train on the module-level trainset and return a pseudo-recall.

    NOTE(review): the value returned is accuracy minus a constant 0.05, not
    a true recall — confirm whether sklearn.metrics.recall_score was intended.
    """
    model = SklearnClassifier(classifier)
    model.train(trainset)
    predictions = model.classify_many(test)
    return accuracy_score(tag_test, predictions) - 0.05
def linear_model(train_df, cv_df):
    """Train a logistic-regression SGD classifier on train_df, pickle it,
    report CV accuracy, plot a learning curve, and write performance.txt."""
    train_tokenized_data = get_data_for_nltk(train_df)
    cv_tokenized_data = get_data_for_nltk(cv_df)
    # log loss makes the SGD classifier behave as logistic regression
    classifier = SklearnClassifier(SGDClassifier(loss='log', max_iter=5))
    print("Training Linear Classifier...")
    classifier.train(train_tokenized_data)
    print("Training Done")
    print("Saving Linear Classifier")
    save_pickle(classifier, "linear_model.pickle")
    cv_accuracy = nltk.classify.accuracy(classifier, cv_tokenized_data)
    print("Cross Validation Accuracy of Linear Classifier {}%".format(
        round(cv_accuracy * 100, 2)))
    predictions = classifier.classify_many([feats for (feats, _) in cv_tokenized_data])
    print("Plotting Learning Curve...")
    build_learning_curve(classifier, train_tokenized_data, cv_tokenized_data)
    print("Saving precision, recall and F-scores in performance.txt")
    with open('performance.txt', 'w') as report:
        report.write('Linear Model Expected Performance Below:')
        report.write('\n')
        report.write(classification_report(cv_df['label'].tolist(), predictions))
def performCrossValidation(featureset, labels, foldsCount, sklearnclassifier, uniqLabels):
    """Stratified k-fold cross-validation.

    Prints the average accuracy and the per-label average precision, recall
    and F-score over `foldsCount` folds.
    """
    accuracySum = 0.0
    precisionSums = defaultdict(float)
    recallSums = defaultdict(float)
    fscoreSums = defaultdict(float)
    crossValidationIterations = cross_validation.StratifiedKFold(labels, n_folds=foldsCount)
    for train, test in crossValidationIterations:
        trainset = [featureset[i] for i in train]
        testset = [featureset[i] for i in test]
        print("before train")
        classifier = SklearnClassifier(sklearnclassifier).train(trainset)
        true = [label for features, label in testset]
        predicted = classifier.classify_many([features for features, label in testset])
        precisions, recalls, fscores, support = precision_recall_fscore_support(
            true, predicted, pos_label=None, labels=uniqLabels)
        accuracySum += accuracy_score(true, predicted)
        # accumulate per-label sums for averaging after the loop
        for label, value in zip(uniqLabels, precisions):
            precisionSums[label] += value
        for label, value in zip(uniqLabels, recalls):
            recallSums[label] += value
        for label, value in zip(uniqLabels, fscores):
            fscoreSums[label] += value
    print("Average accurancy: {0:.3f}".format(accuracySum/foldsCount))
    # renamed comprehension variable: the original called it `sum`,
    # shadowing the builtin inside the comprehension scope
    measures = {label: (precision_sum/foldsCount,
                        recallSums.get(label)/foldsCount,
                        fscoreSums.get(label)/foldsCount)
                for label, precision_sum in precisionSums.items()}
    for label, (prec, recall, fscore) in measures.items():
        print("Average precision for {0}: {1:.3f}".format(label, prec))
        print("Average recall for {0}: {1:.3f}".format(label, recall))
        print("Average f score for {0}: {1:.3f}".format(label, fscore))
def score(classifier):
    """Train on the module-level trainset; return accuracy on `test` vs tag_test."""
    model = SklearnClassifier(classifier)
    model.train(trainset)
    predictions = model.classify_many(test)  # batch_classify in older nltk
    return accuracy_score(tag_test, predictions)
def score(classifier):
    """Use the scikit-learn estimator through nltk's interface: fit on
    train_set, predict the dev-test set, and return accuracy against
    the human-annotated gold tags."""
    model = SklearnClassifier(classifier)
    model.train(train_set)
    gold = [tag for (fea, tag) in test_set]
    pred = model.classify_many([fea for (fea, tag) in test_set])
    return accuracy_score(gold, pred)
def score(classifier):
    """Train on `train`, classify the dev set, and return accuracy against
    the gold labels in tag_dev (module-level globals)."""
    wrapped = SklearnClassifier(classifier)  # sklearn estimator via nltk's wrapper
    wrapped.train(train)
    dev_pred = wrapped.classify_many(dev)
    return accuracy_score(tag_dev, dev_pred)
def score(classifier):
    """Fit on the module-level trainset and return test-set accuracy."""
    wrapped = SklearnClassifier(classifier)
    wrapped.train(trainset)
    # classify_many replaced the deprecated batch_classify API
    return accuracy_score(tag_test, wrapped.classify_many(test))
def score(classifier, train, testSet, tag_test):
    """Train the given sklearn estimator through nltk's wrapper and return
    its accuracy on testSet against the gold tags tag_test."""
    model = SklearnClassifier(classifier)
    model.train(train)
    predictions = model.classify_many(testSet)
    return accuracy_score(tag_test, predictions)
def main3():
    """Train a linear SVM on the module-level trainData, print accuracy,
    a classification report, and show the confusion matrix."""
    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.svm import LinearSVC
    from sklearn.metrics import confusion_matrix
    from matplotlib import pyplot

    svm = SklearnClassifier(LinearSVC(loss="hinge"))
    svm.train(trainData)
    print("SVM: ", nltk.classify.accuracy(svm, testData))
    results = svm.classify_many(item[0] for item in testData)
    print(results)

    from sklearn.metrics import classification_report
    # NOTE(review): target_names=t_test_skl passes the label list itself as
    # the display names — confirm distinct human-readable names were intended.
    print(classification_report(t_test_skl, results,
                                labels=list(set(t_test_skl)),
                                target_names=t_test_skl))

    # Compute confusion matrix
    import numpy as np
    cmm = confusion_matrix([x[1] for x in testData], results)
    print(cmm)
    # np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float is the documented replacement
    cmm = np.array(cmm, dtype=float)
    print(cmm.shape)
    # Show confusion matrix in a separate window
    print(pyplot.imshow(cmm, interpolation='nearest'))
def score(classifier):
    """Train on `train` via nltk's sklearn wrapper and return precision on
    the 'neg' class (correct neg predictions / all neg predictions)."""
    model = SklearnClassifier(classifier)
    model.train(train)
    pred = model.classify_many(data)
    correct_neg = 0
    total_neg = 0
    for i, guess in enumerate(pred):
        if guess == tag[i] and guess == 'neg':
            correct_neg += 1
        if guess == 'neg':
            total_neg += 1
    return float(correct_neg) / float(total_neg)
def classify_train_test():
    """Interactive question-answering loop.

    Trains a LinearSVC question classifier from labelled training files via
    TokenizePosChunk, then repeatedly reads a question, featurises it via
    TokenizePosChunkTest, classifies it, and dispatches to Tell_Me_Alfred.

    NOTE(review): the `global` declaration below is commented out, so the
    assignments rebind LOCAL names; TokenizePosChunk presumably mutates the
    module-level versions — confirm which scope is actually read.
    """
    # trainDataFileList = ["train_1000.label", "train_2000.label", \
    #                      "train_3000.label", "train_4000.label", \
    #                      "train_5500.label"]
    trainDataFileList = ["train_1000.label"]
    # global features, cls_set, featuresets, searchChunk
    features = {}
    cls_set = []
    featuresets = []
    searchChunk = ""
    # process all train data sets and build the featureset
    for trainDataFile in trainDataFileList:
        for line in open(trainDataFile, encoding="ISO-8859-1"):
            TokenizePosChunk(line)
    train = featuresets  # store featureset as train
    # SVM with a Linear Kernel and default parameters
    classif = SklearnClassifier(LinearSVC())
    classif.train(train)
    quest = " "
    quest = input("enter question (q! to quit)==> ")
    while (quest != "q!"):
        # reset the per-question state before featurising the new input
        features = {}
        #cls_set = []  # no need to initialize class set for test
        featuresets = []
        testques = []
        searchChunk = ""
        testques.append(quest)
        TokenizePosChunkTest(quest)
        test = features  # store features as test now
        #print ("search chunk aft call ", searchChunk)
        # NOTE(review): `test` is a single feature dict, not a list of
        # featuresets; classify_many will iterate it — confirm [test]
        # was not intended.
        p = classif.classify_many(test)
        chunkVar = searchChunk
        classVar = q_dict[cls_set[p[0]]]
        print("Searching for => ", chunkVar, "\nQuestion Class => ", classVar)
        Tell_Me_Alfred(chunkVar, classVar)
        #print (cls_set)
        #print("\n")
        #print(p)
        print("\n")
        quest = input("enter question (q! to quit)==> ")
def coem(L1, L2, U1, U2):
    """Co-EM style co-training over two views.

    L1/L2: labelled (features, label) lists for each view.
    U1/U2: unlabelled feature lists for each view (parallel items).
    Returns the final labels classifier 1 assigns to U1.
    """
    pipeline = Pipeline([('tfidf', TfidfTransformer()),
                         ('chi2', SelectKBest(chi2, k=100)),
                         ('nb', MultinomialNB())])
    classifier1 = SklearnClassifier(pipeline)
    classifier1.train(L1)
    # Predict on U using the 1st classifier
    U1_labels = classifier1.classify_many(U1)
    # Now B learns on L plus A's labels on U, and they alternate
    iterations = 0
    while iterations < 25:
        classifier2 = SklearnClassifier(pipeline)
        # COPY the labelled set: the original did `L2_train = L2`, so the
        # pseudo-labelled examples were appended into L2 itself and
        # duplicated on every iteration
        L2_train = list(L2)
        for i, sub_bow in enumerate(U2):
            L2_train.append((sub_bow, U1_labels[i]))
        classifier2.train(L2_train)
        U2_labels = classifier2.classify_many(U2)
        # Classifier 1 retrains on L1 plus B's labels for U1
        L1_train = list(L1)  # same aliasing fix as above
        for i, mail_bow in enumerate(U1):
            L1_train.append((mail_bow, U2_labels[i]))
        # NOTE(review): both wrappers share the SAME pipeline object; each
        # train() refits it, which is safe only because every classify_many
        # immediately follows the corresponding train — confirm.
        classifier1 = SklearnClassifier(pipeline)
        classifier1.train(L1_train)
        U1_labels = classifier1.classify_many(U1)
        print(labels_find_intersection(U1_labels, U2_labels))
        iterations += 1
    return U1_labels
def get_fmeasure(classifier):
    """Train on trainset and return a pseudo F-measure.

    NOTE(review): the "recall" used is just accuracy - 0.05, so the result
    is not a true F1 — confirm whether sklearn.metrics.f1_score was intended.
    """
    model = SklearnClassifier(classifier)
    model.train(trainset)
    predictions = model.classify_many(test)
    acc = accuracy_score(tag_test, predictions)
    pseudo_recall = acc - 0.05
    return (2 * acc * pseudo_recall) / (acc + pseudo_recall)
def score(classifier):
    """Train on train_set, classify `data`, and return the fraction of
    predictions agreeing with the gold `tag` list."""
    model = SklearnClassifier(classifier)  # scikit-learn estimator via nltk
    model.train(train_set)
    pred = model.classify_many(data)
    correct = sum(1 for i in range(len(pred)) if pred[i] == tag[i])
    return float(correct) / float(len(pred))
def score(classifier):
    """Train on train_set, classify `words`, and return the fraction of
    predictions matching the gold `tag` list.

    Returns 0.0 for an empty prediction list (the original raised
    ZeroDivisionError); float() keeps true division under Python 2.
    """
    model = SklearnClassifier(classifier)
    model.train(train_set)
    pred = model.classify_many(words)
    if not pred:
        return 0.0
    hits = sum(1 for i in range(len(pred)) if pred[i] == tag[i])
    return float(hits) / len(pred)
def buildClassifier_score(trainSet, devtestSet, classifier):
    """Train on trainSet and return accuracy on the dev-test set.

    devtestSet is a list of already-featurised (features, label) pairs;
    it is split into data and gold labels before scoring."""
    from nltk import compat  # kept: present in the original
    dev_data, dev_gold = zip(*devtestSet)
    wrapped = SklearnClassifier(classifier)  # sklearn estimator via nltk
    wrapped.train(trainSet)
    predictions = wrapped.classify_many(dev_data)
    return accuracy_score(dev_gold, predictions)
def buildClassifier_score(trainSet, devtestSet, classifier):
    """Fit `classifier` (wrapped for nltk) on trainSet; score it on the
    featurised, labelled devtestSet and return the accuracy."""
    from nltk import compat  # kept: present in the original
    samples, gold = zip(*devtestSet)  # separate features from gold labels
    model = SklearnClassifier(classifier)
    model.train(trainSet)
    return accuracy_score(gold, model.classify_many(samples))
def logisticClassify(train_data, test_data, language):
    """Logistic-regression classification.

    Returns (accuracy, mapping) where mapping pairs each test_data index
    value with its predicted class."""
    training_vector = buildFeatureVector(train_data, language)
    test_vector = buildFeatureVector(test_data, language)
    log_classifier = SklearnClassifier(LogisticRegression())
    log_classifier.train(training_vector)
    accuracy = nltk.classify.accuracy(log_classifier, test_vector)
    features_only = list(zip(*test_vector))[0]  # drop the gold labels
    classes = log_classifier.classify_many(features_only)
    test_tweets = test_data.index.values
    return accuracy, dict(zip(test_tweets, classes))
def score(classifier, train, test):
    """Train via nltk's sklearn wrapper, print accuracy on `test` against
    the module-level gold `tag` list, and return (classifier, accuracy)."""
    model = SklearnClassifier(classifier)
    model.train(train)
    pred = model.classify_many(test)
    hits = sum(1 for i in range(len(pred)) if pred[i] == tag[i])
    print('准确度为: %f' % (hits / len(pred)))
    result = hits / len(pred)
    return model, result
def score(classifier):
    """Train on `train`, classify `data`, and return the fraction of
    predictions matching the gold `tag` list."""
    model = SklearnClassifier(classifier)  # sklearn estimator via nltk
    model.train(train)
    pred = model.classify_many(data)
    hits = sum(1 for i in range(len(pred)) if pred[i] == tag[i])
    return hits / len(pred)
def linear_model(train_df, cv_df):
    """Train an SGD (log-loss => logistic regression) classifier, pickle it,
    and print the cross-validation accuracy and classification report."""
    train_tokenized_data = get_data_for_nltk(train_df)
    cv_tokenized_data = get_data_for_nltk(cv_df)
    print(train_tokenized_data[0], train_tokenized_data[100], train_tokenized_data[10000])
    # Set up the SGD classifier via the scikit-learn wrapper
    classifier = SklearnClassifier(SGDClassifier(loss='log', max_iter=5))
    print("Training Classifier...")
    classifier.train(train_tokenized_data)
    print("Training Done")
    save_pickle(classifier, "linear_model.pickle")
    cv_accuracy = nltk.classify.accuracy(classifier, cv_tokenized_data)
    print("Cross Validation Accuracy", cv_accuracy)
    predictions = classifier.classify_many([feats for (feats, _) in cv_tokenized_data])
    print(classification_report(cv_df['label'].tolist(), predictions))
def performTestValidation(trainset, testset, sklearnclassifier, uniqLabels):
    """Train on trainset; print test accuracy and per-label precision,
    recall and F-score over uniqLabels."""
    model = SklearnClassifier(sklearnclassifier).train(trainset)
    gold = [label for _, label in testset]
    predicted = model.classify_many([features for features, _ in testset])
    precisions, recalls, fscores, _support = precision_recall_fscore_support(
        gold, predicted, pos_label=None, labels=uniqLabels)
    print("Test accuracy: {0:.3f}".format(accuracy_score(gold, predicted)))
    for label, prec, recall, fscore in zip(uniqLabels, precisions, recalls, fscores):
        print("Precision for {0}: {1:.3f}".format(label, prec))
        print("Recall for {0}: {1:.3f}".format(label, recall))
        print("F score for {0}: {1:.3f}".format(label, fscore))
def score(classifier, num):
    """Build a train/test split via train_and_test(num), train the wrapped
    classifier, and return the fraction of correct predictions."""
    model = SklearnClassifier(classifier)  # sklearn estimator via nltk's interface
    train, data, tag = train_and_test(num)
    model.train(train)
    pred = model.classify_many(data)
    hits = sum(1 for i in range(len(pred)) if pred[i] == tag[i])
    return hits / len(pred)
def score(cls, mclassifier, x_train, x_test): data, tag = zip(*x_test) # 分离测试集合的数据和标签,便于验证和测试 classifier = SklearnClassifier(mclassifier) classifier.train(x_train) print('===================================》》 Train done!') pos_index = [] neg_index = [] for i in range(0, len(tag)): if tag[i] == 'pos': # pos pos_index.append(i) # 记录所有pos的index,计算精确率 else: neg_index.append(i) pred = classifier.classify_many(data) # 给出预测的标签 print(type(pred), len(pred), len(tag)) n = 0 s = len(pred) for i in range(0, s): if pred[i] == tag[i]: n = n + 1 accu = n / s # 分类器准确率 print(accu) # print(pos_index, tag) tp = 0 # 将正类预测为正类的数目 fn = 0 # 将正类预测为负类的数目 fp = 0 # 将负类预测为正类的数目 tn = 0 # 将负类预测为负类的数目 for i in pos_index: if pred[i] == tag[i]: tp = tp + 1 else: fn = fn + 1 for i in neg_index: if pred[i] == tag[i]: tn = tn + 1 else: fp = fp + 1 print(tp, '--', fn, '--', fp, '--', tn) pos_precision = tp / (tp + fp) # pos的精确率 pos_recall = tp / (tp + fn) # pos的召回率 pos_f1 = (2 * pos_precision * pos_recall) / (pos_precision + pos_recall ) # pos的f1值 neg_precision = tn / (tn + fn) # neg的精确率 neg_recall = tn / (tn + fp) # neg的召回率 neg_f1 = (2 * neg_precision * neg_recall) / (neg_precision + neg_recall ) # neg的f1值 return accu, pos_precision, pos_recall, pos_f1, neg_precision, neg_recall, neg_f1
def train_model(classifier, name, printout = False):
    """Train `classifier` on the module-level trainData, evaluate on
    testSam/testTag, pickle the fitted model under data_path, and return
    its accuracy."""
    classifier = SklearnClassifier(classifier)
    classifier.train(trainData)
    #predict = classifier.classify_many(validSam)
    predict = classifier.classify_many(testSam)
    accuracy = accuracy_score(testTag, predict)
    if printout:
        print('*******模型: %s的测试结果*********' % name)
        print('\n')
        print('%s`s accuracy is %f' % (name, accuracy))
        print('%s`s score report is \n' % name)
        print(classification_report(testTag, predict))
        print('%s`s confusion is \n' % name)
        print(confusion_matrix(testTag, predict))
        print('\n')
    model_file = data_path + name + ".pkl"
    # binary mode + context manager: pickle emits bytes, and the original
    # opened in text mode and leaked the file handle
    with open(model_file, 'wb') as fh:
        pickle.dump(classifier, fh)
    return accuracy
def classifier_score(tp, classifier, train_list, test, test_tag):
    """Train and evaluate a classifier, persisting it to ./Reviews/<tp>.pkl.

    Returns (elapsed, pos_precision, pos_recall, accuracy) where elapsed is
    the wall time in hundreds of microseconds.
    """
    starttime = datetime.datetime.now()
    classifier = SklearnClassifier(classifier)
    classifier.train(train_list)
    iohelper.save_objects2pickle(classifier, './Reviews/' + tp + '.pkl')
    pred = classifier.classify_many(test)  # list of predicted tags
    # binarise against the 'pos' tag for sklearn's precision/recall
    y_true = [1 if tag == 'pos' else 0 for tag in test_tag]
    y_pred = [1 if tag == 'pos' else 0 for tag in pred]
    pos_precision = precision_score(y_true, y_pred)
    pos_recall = recall_score(y_true, y_pred)
    endtime = datetime.datetime.now()
    # timedelta.microseconds holds only the sub-second remainder, so the
    # original dropped whole seconds; total_seconds() covers the full span
    # (total_seconds() * 1e6 microseconds, then / 100 as before)
    interval = (endtime - starttime).total_seconds() * 1e6 / 100
    return interval, pos_precision, pos_recall, accuracy_score(test_tag, pred)
def classifier_score(tp, classifier, train_list, test, test_tag):
    """Train and evaluate a classifier, persisting it to ./Reviews/<tp>.pkl.

    Returns (elapsed, pos_precision, pos_recall, accuracy) where elapsed is
    the wall time in hundreds of microseconds.
    """
    starttime = datetime.datetime.now()
    classifier = SklearnClassifier(classifier)
    classifier.train(train_list)
    iohelper.save_objects2pickle(classifier, './Reviews/' + tp + '.pkl')
    pred = classifier.classify_many(test)  # list of predicted tags
    # binarise against the 'pos' tag for sklearn's precision/recall
    y_true = [1 if tag == 'pos' else 0 for tag in test_tag]
    y_pred = [1 if tag == 'pos' else 0 for tag in pred]
    pos_precision = precision_score(y_true, y_pred)
    pos_recall = recall_score(y_true, y_pred)
    endtime = datetime.datetime.now()
    # timedelta.microseconds holds only the sub-second remainder, so the
    # original dropped whole seconds; total_seconds() covers the full span
    # (total_seconds() * 1e6 microseconds, then / 100 as before)
    interval = (endtime - starttime).total_seconds() * 1e6 / 100
    return interval, pos_precision, pos_recall, accuracy_score(test_tag, pred)
def runClassifier(train, test, algo='LogisticRegression'):
    """Train a classifier on commit featuresets and label the test commits.

    train: commits exposing .featureset and .isbug.
    test:  commits (None entries are skipped).
    Returns the kept test commits with .isbug overwritten by the prediction.
    """
    train_features = [(commit.featureset, commit.isbug) for commit in train]
    # keep the filtered commits so predictions stay aligned with them; the
    # original indexed into the unfiltered `test`, which mis-assigned labels
    # whenever a None entry had been skipped
    kept_commits = [c for c in test if c is not None]
    test_features = [c.featureset for c in kept_commits]
    if algo == 'LogisticRegression':
        print('LogisticRegression')
        try:
            from sklearn.linear_model.sparse import LogisticRegression
        except ImportError:
            # the separate sparse LR module was removed in sklearn 0.12
            from sklearn.linear_model import LogisticRegression
        classif = SklearnClassifier(LogisticRegression(C=1000))
    else:
        # if not logistic, assume SVM for now
        # SVM with a Linear Kernel and default parameters
        from sklearn.svm import LinearSVC
        print('svm')
        classif = SklearnClassifier(LinearSVC())
    classif.train(train_features)
    try:
        p = classif.classify_many(test_features)
    except AttributeError:
        p = classif.batch_classify(test_features)  # pre-3.0 nltk API
    test_commits = []
    for commit, label in zip(kept_commits, p):
        commit.isbug = label
        test_commits.append(commit)
    return test_commits
def run_stats():
    """Benchmark several sklearn classifiers (via nltk's wrapper) on one
    shuffled 75/25 split of the module-level article lists.

    Appends wall time and accuracy to the module-level result lists
    (multTime/multAcc, lrTime/lrAcc, testTime/testAcc, percepTime/percepAcc,
    berTime/berAcc) and reports precision/recall/F via calcPRF.
    """
    # shuffle both the multiclass and the binary-feature article lists
    shuffle(allArticles)
    shuffle(allArticlesBinary)
    numArticles = len(allArticles)
    split = math.floor(numArticles * .75)  # 75% train / 25% test
    trainingSet = allArticles[:split]
    test = allArticles[split:]
    bitrain = allArticlesBinary[:split]
    bitest = allArticlesBinary[split:]
    testSet = []
    bitestSet = []
    testAnswers = []
    for item in test:
        testSet.append(item[0])
        testAnswers.append(item[1])
    for item in bitest:
        bitestSet.append(item[0])
    # --- Multinomial Naive Bayes ---
    multClassif = SklearnClassifier(MultinomialNB())
    ti = time()
    multClassif.train(trainingSet)
    multRes = multClassif.classify_many(testSet)
    t0 = time() - ti  # train + predict wall time
    multTime.append(t0)
    multAcc.append(accuracy_score(testAnswers, multRes))
    calcPRF('mult', testAnswers, multRes)
    # --- Logistic Regression ---
    lrmult = SklearnClassifier(LogisticRegression())
    ti = time()
    lrmult.train(trainingSet)
    logRes = lrmult.classify_many(testSet)
    t3 = time() - ti
    lrTime.append(t3)
    lrAcc.append(accuracy_score(testAnswers, logRes))
    calcPRF('lr', testAnswers, logRes)
    # --- tf-idf + MultinomialNB pipeline ---
    pipe = Pipeline([('tfidf', TfidfTransformer()),
                     #('chi2', SelectKBest(chi2, k=500)),
                     ('nb', MultinomialNB())])
    testClassif = SklearnClassifier(pipe)
    ti = time()
    testClassif.train(trainingSet)
    testres = testClassif.classify_many(testSet)
    t5 = time() - ti
    testTime.append(t5)
    testAcc.append(accuracy_score(testAnswers, testres))
    calcPRF('test', testAnswers, testres)
    # --- Perceptron ---
    percepclass = SklearnClassifier(Perceptron())
    ti = time()
    percepclass.train(trainingSet)
    precepres = percepclass.classify_many(testSet)
    t7 = time() - ti
    percepTime.append(t7)
    percepAcc.append(accuracy_score(testAnswers, precepres))
    calcPRF('percep', testAnswers, precepres)
    # --- Bernoulli NB on the binary featuresets ---
    berclass = SklearnClassifier(BernoulliNB())
    ti = time()
    berclass.train(bitrain)
    berres = berclass.classify_many(bitestSet)
    t9 = time() - ti
    berTime.append(t9)
    # NOTE(review): scored against testAnswers from the NON-binary split;
    # confirm allArticlesBinary shares the same label order after shuffling.
    berAcc.append(accuracy_score(testAnswers, berres))
    calcPRF('ber', testAnswers, berres)
def learn_model(data, target):
    """Train a BernoulliNB sentiment model on an 80/20 split using
    best_word_feats features, evaluate it, and save the predictions to
    BernoulliNB.xml.  (Python 2 code: print statements, raw_input.)"""
    # preparing data for split validation: 80% training, 20% test
    state = 43  # randrange(1,23432)+123
    print "statue 6857"
    print state
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(data, target, test_size=0.20, random_state=state)
    #classifier = BernoulliNB().fit(data_train,target_train)
    stop_word_dict = {}  # build_stop_word_dict()
    sentiment_dict = {}  # build_sentiment_dict()
    global hinfo_dict
    hinfo_dict = build_hinfo_dict(data, target)
    #print stop_word_dict.keys()
    raw_input("begin train")  # pause so the operator can inspect the output
    train_feature = []
    test_feature = []
    for i in range(len(data_train)):
        print i  # progress: index of the training example being featurised
        d = data_train[i]
        #d=jieba.cut(d, cut_all=False)
        l = target_train[i]
        #tmp=[bigram(d),l]
        tmp = [best_word_feats(d, stop_word_dict, sentiment_dict, hinfo_dict), l]
        train_feature.append(tmp)
    for i in range(len(data_test)):
        print i
        d = data_test[i]
        #d=jieba.cut(d, cut_all=False)
        l = target_test[i]
        #tmp=bigram(d)
        tmp = best_word_feats(d, stop_word_dict, sentiment_dict, hinfo_dict)
        test_feature.append(tmp)
    #BernoulliNB MultinomialNB LogisticRegression SVC LinearSVC
    print "max_len %d" % (max_len)
    print "min_len %d" % (min_len)
    # NOTE(review): `sum` here is a module-level value shadowing the builtin — confirm
    print "avg_len %d" % (sum / cnt)
    print "BernoulliNB"
    classifier = SklearnClassifier(BernoulliNB())
    classifier.train(train_feature)
    print "--------------"
    print len(classifier._vectorizer.get_feature_names())
    for f in classifier._vectorizer.get_feature_names():
        print f.encode("utf-8")
    predicted = classifier.classify_many(test_feature)
    evaluate_model(target_test, predicted)
    ids = range(len(data_test))
    result = []
    for p in predicted:
        if p == 'positive':
            result.append('1')
        else:
            result.append('-1')
    save_predict(data_test, ids, result, "BernoulliNB.xml")
    """
# --- top-level script: split, train a LinearSVC, persist it, and report ---
featuresets = chosen_features_limit
# `//` keeps the slice index an int under Python 3 (`limit / 2` yields a
# float there, which raises TypeError when used as a slice bound);
# identical to `/` for ints under Python 2
train_set, test_set = featuresets[limit // 2:], featuresets[:limit // 2]

svm = SklearnClassifier(LinearSVC())
svm.train(train_set)

path = os.path.normpath('../model/svm/account_{0}/{1}/'.format(account_id, version))
if not os.path.exists(path):
    os.makedirs(path)
print(u'Saving model to {0}'.format(path))
joblib.dump(svm, os.path.join(path, 'svm.pkl'))

# separate the held-out features from their gold labels
test_skl = []
t_test_skl = []
for d in test_set:
    test_skl.append(d[0])
    t_test_skl.append(d[1])

# run the classifier on the test set
p = svm.classify_many(test_skl)

# full precision/recall/F1 report
print(classification_report(
    t_test_skl, p,
    labels=list(set(t_test_skl)),
    target_names=['pos', 'neg']
))
#Multinomial Naive Bayes classifier pipeline = Pipeline([('tfidf', TfidfTransformer()), ('chi2', SelectKBest(chi2, k='all')), ('nb', MultinomialNB())]) classif = SklearnClassifier(pipeline) classif.train(train_set) #Max entropy classifier """ classif = MaxentClassifier.train(train_set, 'megam') """ print(nltk.classify.accuracy(classif, test_set)) pred = classif.classify_many([feature for feature, sentiment in test_set]) test_true = [sentiment for feature, sentiment in test_set] matx = confusion_matrix(test_true,pred) print(matx) #joblib.dump(tweets, 'tweets.pkl') #joblib.dump(classif, 'classif.pkl') """ #Cross Validating Classifiers cv = cross_validation.KFold(len(featuresets), n_folds=5, shuffle=False, random_state=None)
class MNBayes(text_classifier.TextClassifier):
    """Multinomial Naive Bayes text classifier built on nltk's scikit-learn bridge.

    Feature extraction (``getFeatures``) and ``gene_features`` are inherited
    from ``text_classifier.TextClassifier``.  Python 2 module (uses xrange).
    """

    def __init__(self,trainDir,labelFile,numTrees=10):
        self.classifier = None
        self.labelFile = labelFile
        self.trainingDir = trainDir
        self.labels = None
        self.all_words = None
        self.numTrees = numTrees  # kept for interface parity; unused by NB
        self.classifier = SklearnClassifier(MultinomialNB())
        #self.labels = training.setup(labelFile)
        #self.train()

    def train(self):
        """Fit the classifier on all available labelled feature sets."""
        feature_sets = self.getFeatures()
        self.classifier.train(feature_sets)

    """ Determines training error"""
    def trainingError(self):
        # Accuracy measured on the same data the model was trained on.
        feature_sets = self.getFeatures()
        p = nltk.classify.accuracy(self.classifier,feature_sets)
        return p

    """ Make sure that the algorithm works on training data
        using a k fold cross validation scheme """
    def kfoldCrossValidation(self,k):
        """Return mean held-out accuracy over k folds.

        BUGFIX: the original trained on the FULL feature set (including the
        test fold), used `i+1` instead of `n*(i+1)` when slicing, and returned
        only the last fold's accuracy.  Each fold now trains strictly on the
        other k-1 folds and the fold accuracies are averaged.
        """
        feature_sets = self.getFeatures()
        n = len(feature_sets)//k  # fold size; // keeps integer semantics on Py2 and Py3
        total = 0.0
        for i in range(k):
            self.classifier = SklearnClassifier(MultinomialNB())
            test_set = feature_sets[n*i:n*(i+1)]
            train_set = feature_sets[:n*i] + feature_sets[n*(i+1):]
            self.classifier.train(train_set)
            total += nltk.classify.accuracy(self.classifier,test_set)
        return total/k

    """ Make sure that the algorithm works on training data
        using a leave one out cross validation scheme """
    def leave1OutCrossValidation(self):
        """Return mean accuracy with each instance held out once.

        BUGFIX: trains on the N-1 remaining instances (the original trained
        on the full set, so the held-out item was always seen in training).
        """
        error = 0
        feature_sets = self.getFeatures()
        N = len(feature_sets)
        for i in range(N):
            self.classifier = SklearnClassifier(MultinomialNB())
            train_set1,test_set,train_set2 = feature_sets[:i],feature_sets[i],feature_sets[i+1:]
            train_set = train_set1+train_set2
            test_set = [test_set]
            self.classifier.train(train_set)
            p = nltk.classify.accuracy(self.classifier,test_set)
            error+=p
        return error/N

    """ Construct a learning curve to see if there is overfitting"""
    def learningCurve(self,numTrials=4):
        """Accuracy on the remainder when training on the first k shuffled items,
        averaged over numTrials shuffles, for every k.

        BUGFIX: trains on the k-item train_set (the original trained on the
        full set, so the curve measured nothing).
        """
        accuracies = []
        feature_sets = self.getFeatures()
        for k in xrange(1,len(feature_sets)-1):
            total = 0
            for i in xrange(numTrials):
                self.classifier = SklearnClassifier(MultinomialNB())
                random.shuffle(feature_sets)
                train_set,test_set = feature_sets[:k],feature_sets[k:]
                self.classifier.train(train_set)
                p = nltk.classify.accuracy(self.classifier,test_set)
                total+=p
            accuracies.append(total/numTrials)
        return accuracies

    """ Train on only k features and return training labels and predicted labels """
    def testClassify(self,k):
        # Hold out the first k shuffled instances, train on the rest, and
        # return (reference_labels, predicted_labels) for the held-out items.
        feature_sets = self.getFeatures()
        random.shuffle(feature_sets)
        self.classifier = SklearnClassifier(MultinomialNB())
        self.classifier.train(feature_sets[k:])
        features,ref_labels = zip(*feature_sets[:k])
        pred_labels = self.classifier.classify_many(features)
        return ref_labels,pred_labels

    """ nltk confusion matrix """
    def confusionMatrix(self,ref,test):
        # Align both label sequences by their first element before comparing.
        ref.sort(key=lambda x: x[0])
        test.sort(key=lambda x: x[0])
        _,ref_labels = zip(*ref)
        _,test_labels = zip(*test)
        cm = ConfusionMatrix(ref_labels, test_labels)
        return cm

    """ Classifies proteins based on its text """
    def classify(self,db,fastain):
        # For each FASTA record: look up its GenBank text, extract features,
        # classify; records with no text get the sentinel label 'na'.
        proIDs,features,labels = [],[],[]
        prevFeatureset = ''
        prevText = ''
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            proteinID = toks[5]  # assumes pipe-delimited FASTA ids with >=6 fields — TODO confirm
            query_rows = genbank.proteinQuery(proteinID,db)
            ids,text = zip(*query_rows)
            text = ''.join(map(str,text))
            if text=='':
                label = ['na']
            else:
                text = word_reg.findall(text)
                featureset = self.gene_features(text)
                # Guard against accidentally processing the same record twice.
                assert text!=prevText
                assert featureset!=prevFeatureset
                prevFeatureset = featureset
                prevText = text
                # classify_many replaces the deprecated batch_classify alias.
                label = self.classifier.classify_many([featureset])
            proIDs.append(proteinID)
            labels+=label
        return zip(proIDs,labels)
# Train a tfidf->chi2->LinearSVC pipeline on per-label bags of words and print
# a hand-built confusion matrix.  Python 2 (iteritems, print statement).
# NOTE(review): the first two statements reference `label` and `mail`, which
# must come from an enclosing loop that was cut off before this chunk.
train_set[label] = []
train_set[label].append(FreqDist(newwordlist(mail)))
# tf-idf weighting -> keep 1000 best chi2 features -> linear SVM.
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=1000)),
                     ('svm', LinearSVC())])
classif = SklearnClassifier(pipeline)
# Tag every bag-of-words with its label to build nltk-style training pairs.
add_label = lambda lst, lab: [(x, lab) for x in lst]
finalset = []
for label,bow in train_set.iteritems():
    finalset.extend(add_label(bow, label))
classif.train(finalset)
# Confusion matrix: one row per true label, one column per predicted label.
# NOTE(review): this evaluates on the TRAINING data, so the numbers measure
# training fit, not generalization.
conf = []
for l, bow in train_set.iteritems():
    labels = np.array(classif.classify_many(bow))
    row = []
    for label in train_set:
        row.append((labels==label).sum())
    conf.append(row)
for c in conf:
    print c
# Sum the diagonal (correct) and the total to derive overall accuracy later.
diagval = 0
total = 0
for i in range(len(conf)):
    for j in range(len(conf)):
        total += conf[i][j]
        if i==j:
            diagval += conf[i][j]
aww_neg_training = aww_neg_total[0:training] # get first 2000 negative posts aww_pos_testing = aww_pos_total[-testing:] # get last 100 positive posts aww_neg_testing = aww_neg_total[-testing:] total_1 = len(aww_pos_testing) total_2 = len(aww_neg_testing) training_data = [] for dictionary in aww_pos_training: training_data.append((dictionary, "pos")) for dictionary2 in aww_neg_training: training_data.append((dictionary2, "neg")) shuffle(training_data) print("classifying and training") classif = SklearnClassifier(MultinomialNB()).train(training_data) results_pos = classif.classify_many(aww_pos_testing) results_neg = classif.classify_many(aww_neg_testing) correct_1 = 0 correct_2 = 0 print("testing") for string in results_pos: if "pos" in string: correct_1 += 1 for string in results_neg: if "neg" in string: correct_2 += 1 print("For positive: %i correct out of %i for a percentage of %f" % (correct_1, total_1, correct_1/total_1))
SGDClassifier(max_iter=100), MultinomialNB(), SVC(kernel='linear') ] models = list((zip(names, classifiers))) nltk_ensemble = SklearnClassifier( VotingClassifier(estimators=models, voting='hard', n_jobs=-1)) nltk_ensemble.train(training) accuracy = nltk.classify.accuracy(nltk_model, testing) * 100 print("Voting Classifier: Accuracy: {}".format(accuracy)) # In[28]: # make class label prediction for testing set txt_features, labels = list(zip(*testing)) prediction = nltk_ensemble.classify_many(txt_features) # In[29]: # print a confusion matrix and a classification report print(classification_report(labels, prediction)) pd.DataFrame(confusion_matrix(labels, prediction), index=[['actual', 'actual'], ['Real', 'Fake']], columns=[['predicted', 'predicted'], ['Real', 'Fake']]) # In[ ]:
# Stacking stage: the base classifiers' outputs become meta-features, five
# meta-classifiers are trained on them, and each meta-model's test accuracy
# is printed.  Python 2 (print statements).
result.append(svm_result)
result.append(dt_result)
result.append(ent_result)
result.append(nb_result)
result.append(knn_result)
#meta cl
# Turn the collected base-model predictions into meta training data.
cv_trn2, cv_t2, all_trn2 = create_meta_training_data(result, label_list)
svm_meta = SklearnClassifier(LinearSVC()).train(all_trn2)
dt_meta = SklearnClassifier(tree.DecisionTreeClassifier()).train(all_trn2)
ent_meta = nltk.classify.maxent.MaxentClassifier.train(all_trn2, trace=1, max_iter=4)
nb_meta = nltk.NaiveBayesClassifier.train(all_trn2)
knn_meta = SklearnClassifier(KNeighborsClassifier(5)).train(all_trn2)
# Run every base model over the ORIGINAL test features...
intermediate = []
intermediate.append(svm_base.classify_many(test_feature))
intermediate.append(dt_base.classify_many(test_feature))
intermediate.append(ent_base.classify_many(test_feature))
intermediate.append(nb_base.classify_many(test_feature))
intermediate.append(knn_base.classify_many(test_feature))
# ...then deliberately REBIND test_feature to the merged base predictions,
# which is the feature representation the meta classifiers expect.
test_feature = merge_feature(intermediate)
print compute_accuracy(test_label, svm_meta.classify_many(test_feature))
print compute_accuracy(test_label, dt_meta.classify_many(test_feature))
print compute_accuracy(test_label, ent_meta.classify_many(test_feature))
print compute_accuracy(test_label, nb_meta.classify_many(test_feature))
print compute_accuracy(test_label, knn_meta.classify_many(test_feature))
#svm_l2 = []
#dt_l2 = []
#nb_l2 = []
def main():
    """Train and print accuracy for a battery of classifiers on the
    module-level trainData/testData nltk feature sets.

    Covers plain NaiveBayes, Bernoulli/Multinomial NB (with and without a
    chi2 feature-selection pipeline), RandomForest, AdaBoost, a linear SVM,
    KMeans, and k-NN.  Output goes to stdout.
    """
    # Naive Bayes
    nb = nltk.NaiveBayesClassifier.train(trainData)
    print("NB: ", nltk.classify.accuracy(nb, testData))

    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.naive_bayes import BernoulliNB, GaussianNB

    # BernoulliNB
    bernoulli = SklearnClassifier(BernoulliNB())
    bernoulli.train(trainData)
    print("NB Bernoulli: ", nltk.classify.accuracy(bernoulli, testData))

    # GaussianNB needs dense arrays, hence disabled:
    #gaussian = SklearnClassifier(GaussianNB())
    #gaussian.train(trainData.toarray(trainData))
    #print("Gaussian: ", nltk.classify.accuracy(gaussian, testData))

    from sklearn.naive_bayes import MultinomialNB

    # MultinomialNB
    multi = SklearnClassifier(MultinomialNB())
    multi.train(trainData)
    print("NB Multinomial: ", nltk.classify.accuracy(multi, testData))

    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.pipeline import Pipeline

    # MultinomialNB restricted to the 1000 best chi2 features.
    pipeline = Pipeline([('chi2', SelectKBest(chi2, k=1000)), ('nb', MultinomialNB())])
    pmulti = SklearnClassifier(pipeline)
    pmulti.train(trainData)
    print("NB Multinomial (pmulti): ", nltk.classify.accuracy(pmulti, testData))

    from sklearn.metrics import f1_score
    #results = pmulti.batch_classify(item[0] for item in testData)
    #print(results[:10])
    #print(f1_score([item[1] for item in testData], results))

    # Logistic Reggression
    from sklearn.linear_model import LinearRegression
    #linReg = SklearnClassifier(LinearRegression())
    #linReg.train((trainData))
    #print("Logistic Reggression: ", nltk.classify.accuracy(linReg, testData))

    # Ensembles
    # Random forest
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    forest = SklearnClassifier(RandomForestClassifier(n_estimators=100))
    forest.train(trainData)
    print("Random Forest: ", nltk.classify.accuracy(forest, testData))

    # AdaBoost
    adaboost = SklearnClassifier(AdaBoostClassifier())
    adaboost.train(trainData)
    print("Adaboost: ", nltk.classify.accuracy(adaboost, testData))

    # Support Vector Machines
    from sklearn.svm import LinearSVC
    from sklearn.metrics import confusion_matrix

    # BUGFIX: the original trained and scored this identical SVM twice in a
    # row (copy-paste duplicate); one train/score pass is kept.
    svm = SklearnClassifier(LinearSVC(loss="hinge"))
    svm.train(trainData)
    print("SVM: ", nltk.classify.accuracy(svm, testData))
    # materialize the features as a list before classify_many
    results = svm.classify_many([item[0] for item in testData])
    print(results)

    # KMeans (unsupervised; accuracy against gold labels is only indicative)
    from sklearn.cluster import KMeans
    km = SklearnClassifier(KMeans())
    km.train(trainData)
    print("KMeans: ", nltk.classify.accuracy(km, testData))

    # K nearest neighbors
    from sklearn.neighbors import KNeighborsClassifier
    k = 5
    knn = SklearnClassifier(KNeighborsClassifier(n_neighbors=k))
    knn.train(trainData)
    print("KNN 1: ", nltk.classify.accuracy(knn, testData))
def clf_score(classifier):
    """Fit *classifier* (any scikit-learn estimator) through nltk's
    SklearnClassifier bridge on the module-level train_set, then return its
    accuracy over the module-level test features against tag_test."""
    wrapped = SklearnClassifier(classifier)
    wrapped.train(train_set)
    predictions = wrapped.classify_many(test)
    return accuracy_score(tag_test, predictions)
# Learning-curve loop: retrain a linear SVM on progressively larger slices of
# feature_sets and report test-set quality after each step.
# Python 2 (bare `print classification_report`).
test_skl = []
t_test_skl = []
# Split labelled test pairs into parallel feature / label lists.
for d in test_set:
    test_skl.append(d[0])
    t_test_skl.append(d[1])
# cls_set = list(set(t_test_skl))
# SVM with a Linear Kernel and default parameters
classifier = SklearnClassifier(LinearSVC())
# Grow the training slice by 300 instances per iteration, starting at 1000.
for train_set_size in range(1000, corpus_size, 300):
    print("Train set size:", train_set_size)
    # Split in train
    # Training slice starts after the reserved test prefix.
    train_set = feature_sets[test_set_size:train_set_size]
    # NOTE(review): SklearnClassifier.train refits the underlying estimator,
    # so each iteration trains from scratch on the new slice.
    classifier.train(train_set)
    # run the classifier on the train test
    p = classifier.classify_many(test_skl)
    # getting a full report
    print classification_report(t_test_skl, p)
    # NOTE(review): ConfusionMatrix(reference, test) — arguments here are
    # (predicted, gold), i.e. swapped relative to nltk's convention; confirm.
    cm = nltk.ConfusionMatrix(p,t_test_skl)
    print(cm.pretty_format(sort_by_count=True, show_percents=True))
# Read two directory trees of documents, classify each file with the trained
# `classif`, and print per-class and overall accuracy.
# NOTE(review): the first three statements reference `words`/`file_name`
# defined in an enclosing loop that was cut off before this chunk.
words.read_files(file_name)
dictionary = dict(words)
test_data_1.append(dictionary)
# Walk every file under test_fp2 (skipping dot-directories) and turn each
# Document into a feature dictionary for the second test set.
for directory in os.listdir(test_fp2):
    subdir = test_fp2 + directory
    if directory.startswith("."):
        continue
    for string in os.listdir(subdir):
        file_name = test_fp2 + directory + '/' + string
        words = Document()
        words.read_files(file_name)
        dictionary = dict(words)
        test_data_2.append(dictionary)
results_1 = classif.classify_many(test_data_1)
results_2 = classif.classify_many(test_data_2)
total_1 = len(results_1)
total_2 = len(results_2)
correct_1 = 0
correct_2 = 0
# Substring match on the predicted label; set 1 should be "children" texts,
# set 2 "advanced" texts.
for string in results_1:
    if "children" in string:
        correct_1 += 1
for string in results_2:
    if "advanced" in string:
        correct_2 += 1
# NOTE(review): under Python 2 these `/` divisions are integer division and
# would print 0.0 — the print() calls suggest Python 3, where they are fine.
print("For children text: %i correct out of %i for a percentage of %f" % (correct_1, total_1, correct_1/total_1))
print("For advanced text: %i correct out of %i for a percentage of %f" % (correct_2, total_2, correct_2/total_2))
print("total classification percentage is %f" % ((correct_1 + correct_2)/(total_1 + total_2)))
def output():
    """Flask view: read a text query from the request, train two Naive Bayes
    classifiers on "dataV11.txt", classify the query's tokens, write the
    result to result.csv, and render results.html.

    NOTE(review): the classifiers are retrained on every request — expensive;
    consider training once at startup.
    """
    #form = ReusableForm(request.form)
    #if request.method == 'GET':
    name = request.args.get('textQuery')
    #name=request.form['textQuery']
    #name=request.form.getlist('textQuery')
    #if form.validate():
    selected_features = None  # None => keep every word feature (no selection)
    # Hard-coded English stop-word list (roughly nltk's set, inlined).
    stopwords = ['all', 'just', 'being', 'over', 'both', 'through', 'yourselves', 'its', 'before', 'herself', 'had', 'should', 'to', 'only', 'under', 'ours', 'has', 'do', 'them', 'his', 'very', 'they', 'not', 'during', 'now', 'him', 'nor', 'did', 'this', 'she', 'each', 'further', 'where', 'few', 'because', 'doing', 'some', 'are', 'our', 'ourselves', 'out', 'what', 'for', 'while', 'does', 'above', 'between', 't', 'be', 'we', 'who', 'were', 'here', 'hers', 'by', 'on', 'about', 'of', 'against', 's', 'or', 'own', 'into', 'yourself', 'down', 'your', 'from', 'her', 'their', 'there', 'been', 'whom', 'too', 'themselves', 'was', 'until', 'more', 'himself', 'that', 'but', 'don', 'with', 'than', 'those', 'he', 'me', 'myself', 'these', 'up', 'will', 'below', 'can', 'theirs', 'my', 'and', 'then', 'is', 'am', 'it', 'an', 'as', 'itself', 'at', 'have', 'in', 'any', 'if', 'again', 'no', 'when', 'same','how', 'other', 'which', 'you', 'after', 'most', 'such', 'why', 'a', 'off', 'i', 'yours', 'so', 'the', 'having','once']

    def add_lexical_features(fdist, feature_vector, text):
        # Add text length plus a binary presence feature for each word.
        feature_vector["len"] = len(text)
        text_nl = nltk.Text(text)  # built but unused since counts are disabled below
        for word, freq in fdist.items():
            fname = word
            if selected_features == None or fname in selected_features:
                #feature_vector[fname] = text_nl.count(word)
                feature_vector[fname] = 1  # presence, not frequency

    def features(review_words):
        # Build the nltk feature dict for one token list.
        feature_vector = {}
        uni_dist = nltk.FreqDist(review_words)
        # Bigram distribution is computed but never added to the vector —
        # only unigram presence features are used.
        my_bigrams = list(bigrams(review_words))
        bi_dist = nltk.FreqDist(my_bigrams)
        add_lexical_features(uni_dist,feature_vector, review_words)
        return feature_vector

    # Training corpus: one document per line, "<label> <word> <word> ...",
    # with a header line that is skipped.
    with open("dataV11.txt", 'rb') as f:
        text = f.read()
        text = text.decode("utf-8")
        f.close()  # redundant inside `with`, kept as-is
    docs = text.split("\n")
    docs2 = docs[1: ]  # drop header line
    train = []
    #print(sent)
    for d in docs2:
        d = d.split()
        if len(d)!=0:
            cl = d[0]  # first token is the class label
            text_d = d[1: ]#we need to remove the stopwords
            text = []
            for w in text_d:
                if w not in stopwords:
                    text.append(w)
            item = (text, cl)
            train.append(item)
    # Deterministic shuffle so the train/validation split is reproducible.
    random.seed(0)
    random.shuffle(train)
    #print(sentences)
    # NOTE(review): index 3271 is silently dropped — train[:3271] and
    # train[3272:] skip one document; probably meant train[3271:].
    train_set = train[ :3271]
    valid_set = train[3272: ]
    featuresets_tr = [(features(words), label) for (words, label) in train_set ]
    featuresets_val = [(features(words), label) for (words, label) in valid_set ]
    featuresets = [(features(words), label) for (words, label) in train ]
    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.naive_bayes import MultinomialNB,BernoulliNB
    # NOTE(review): each classifier is trained twice; the second train() call
    # refits on the full set, so the featuresets_tr fit is discarded.
    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(featuresets_tr)
    MNB_classifier.train(featuresets)
    BNB_classifier = SklearnClassifier(BernoulliNB())
    BNB_classifier.train(featuresets_tr)
    BNB_classifier.train(featuresets)
    #n = int(input())
    print(">>>>>>>")
    print(name)
    # Split the query into whitespace-separated tokens.
    a = [a_temp for a_temp in name.strip().split(' ')]
    #for a_i in range(1):  # to read a matrix
    #a_t = [a_temp for a_temp in input().strip().split(' ')]
    #a.append(a_t)
    print("<<<<<")
    print(a)
    inputData = ",".join(map(str, a))
    print("Input data:",inputData)
    # NOTE(review): each single token is fed to features() as a "document",
    # so the features are per-character-level unigrams of one word — confirm
    # this is the intended granularity.
    featuresets_test = [features(words) for words in a ]
    #features = ",".join(map(str, featuresets_test))
    print("Features:",featuresets_test)
    #a = []
    #a.append("doi1")
    #featuresets_test = []
    #featuresets_test.append("nanomaterial")
    predicted_labels = BNB_classifier.classify_many(featuresets_test)
    #print(a)
    #print(featuresets_test)
    for l in predicted_labels:
        print (str("Type of input data: "+l))
    #print(type(a))
    #print('Input:')
    #print(a)
    #print(type(predicted_labels))
    #print('Category:')
    #print(predicted_labels)
    #print(type(featuresets_test))
    #print('Features:')
    #print(featuresets_test)
    outputData = []
    import csv
    csvfile = open('result.csv', 'w')
    with csvfile:
        #for (col1, col2, col3) in zip(a, predicted_labels, featuresets_test):
        #outputData.append([col1, col2, col3])
        data1 = [["asif", "kary", "ravi"], ["xyz", "abc", "def"]]  # leftover sample data, unused
        # NOTE(review): `l` here is the loop variable leaking from the
        # predicted_labels loop above, i.e. only the LAST prediction is saved.
        outputData = [[name, l,
                       featuresets_test]]
        valueWriter = csv.writer(csvfile)
        valueWriter.writerows(outputData)
        #valueWriter.writerows([str(a),str(l),str(featuresets_test)])
        #csv = valueWriter.writerows(data)
    return render_template('results.html')
def clf_score(classifier):
    """Train *classifier* via nltk's scikit-learn interface on the
    module-level train_set and score it on the dev-test data.

    Returns the (precision, recall, fbeta_score, support) arrays from
    sklearn's precision_recall_fscore_support against tag_test."""
    skl_clf = SklearnClassifier(classifier)   # use the scikit-learn estimator through nltk
    skl_clf.train(train_set)                  # fit the classifier
    predicted = skl_clf.classify_many(test)   # predict labels for the dev-test features
    return precision_recall_fscore_support(tag_test, predicted)
# First stacking stage: train five base classifiers on all_trn and collect
# their test-set predictions as candidate meta-features.
# Python 2 (print statements in the commented-out section).
ent_base = []  # immediately rebound to the trained MaxEnt model below
result = []
svm_result = []
nb_result = []
dt_result = []
ent_result = []
# .train() returns the trained classifier, so each base model is kept directly.
svm_base = SklearnClassifier(LinearSVC()).train(all_trn)
dt_base = SklearnClassifier(tree.DecisionTreeClassifier()).train(all_trn)
nb_base = nltk.NaiveBayesClassifier.train(all_trn)
ent_base = nltk.classify.maxent.MaxentClassifier.train(all_trn, trace=1, max_iter=4)
knn_base = SklearnClassifier(KNeighborsClassifier(5)).train(all_trn)
#test
# Each base model's predictions over the shared test features, in a fixed order.
intermediate = []
intermediate.append(svm_base.classify_many(test_feature))
intermediate.append(dt_base.classify_many(test_feature))
intermediate.append(nb_base.classify_many(test_feature))
intermediate.append(ent_base.classify_many(test_feature))
intermediate.append(knn_base.classify_many(test_feature))
#final_feature = merge_feature(intermediate)
#print compute_accuracy(test_label, svm_meta.classify_many(final_feature))
#print compute_accuracy(test_label, max_ent_meta.classify_many(final_feature))
#Weight:
# Hand-tuned per-model weights; order matches the intermediate list above —
# TODO confirm against the consumer of `weight`.
weight = []
weight.append(0.9)
weight.append(0.8)
weight.append(0.8)
weight.append(0.8)
classifier = SklearnClassifier(model) # set priors classifier._encoder.fit([category, "no"]) # [category, "no"] unless this is true then ["no", category] flip = classifier.labels()[0] == "no" categorized_proportion = len([words for (words, categories) in corpus if category in categories]) * 1.0 / len(corpus) if flip: model.class_prior = [1-categorized_proportion, categorized_proportion] else: model.class_prior = [categorized_proportion, 1-categorized_proportion] classifier.train(train_set) # test classifier test_results = classifier.classify_many([feat for (feat, label) in test_set]) pos_test_set = set(i for i, result in enumerate(test_results) if result == category) reference_values = [label for (feat, label) in test_set] pos_ref_set = set(i for i, (feat, label) in enumerate(test_set) if label == category) accuracy = scores.accuracy(reference_values, test_results) accuracies.append(accuracy) precision = scores.precision(pos_ref_set, pos_test_set) recall = scores.recall(pos_ref_set, pos_test_set) f1 = scores.f_measure(pos_ref_set, pos_test_set) f1_scores.append(f1) print "%s: accuracy %s, precision %s, recall %s, F1 %s" % (colored(category, "blue"), colored(accuracy, "yellow"), colored(precision, "yellow"), colored(recall, "yellow"), colored(f1, "yellow")) ## print(nltk.classify.accuracy(classifier, test_set)) # classifier.show_most_informative_features(5) # print ""
def final_score(classifier):
    """Wrap a scikit-learn estimator for nltk, fit it on the module-level
    trainSet, and return its accuracy on the held-out test features
    (gold labels come from tag_test)."""
    model = SklearnClassifier(classifier)
    model.train(trainSet)
    predicted = model.classify_many(test)
    return accuracy_score(tag_test, predicted)
class ScikitClassifierAdapter:
    """
    An adapter for an SklearnClassifier (nltk.classify.scikitlearn) object
    to make sure that all classifiers take same input and return the same output
    and are trained in the same way.

    scikit_classifier: a Scikit classifier *instance*
    train_file_name: the path to the training settings
    template_file_name: the template to extract additional feature for optimization purposes
    labelled_feature_sets: optional pre-computed [feature_dict, label] pairs;
        when given, train_file_name is not read for features.
    """
    def __init__(self, scikit_classifier, train_file_name,template_file_name,labelled_feature_sets=None):
        from nltk.classify.scikitlearn import SklearnClassifier
        from sklearn.ensemble import AdaBoostClassifier
        from sklearn.naive_bayes import GaussianNB
        from sklearn.ensemble import RandomForestClassifier
        fe = FeatureExtractor()
        #self.classifier = SklearnClassifier(scikit_classifier,sparse=False)
        # RandomForest and GaussianNB cannot consume scipy sparse matrices,
        # so they get a dense vectorizer; everything else stays sparse.
        if(isinstance(scikit_classifier,RandomForestClassifier)):
            self.classifier = SklearnClassifier(scikit_classifier,sparse=False)
        elif(isinstance(scikit_classifier,GaussianNB)):
            self.classifier = SklearnClassifier(scikit_classifier,sparse=False)
        else:
            self.classifier = SklearnClassifier(scikit_classifier)
        # Compile the CRF++-style feature templates once for reuse.
        self.compiled_templates = self.process_template(template_file_name)
        feature_sets = []
        if(labelled_feature_sets is not None):
            feature_sets = labelled_feature_sets
            logger.info("using a pre-computed feature_sets containing %i instances"%len(feature_sets))
        else:
            iob_data = file_to_instances(train_file_name)
            # BUGFIX: the original passed the counts as logging args with no
            # %-placeholder in the message ("instances ", n), which raises a
            # formatting error inside logging at runtime.
            logger.info("instances %i", len(iob_data))
            logger.info("tokens %i", count_tokens(iob_data))
            for n,instance in enumerate(iob_data):
                sentence_n = n
                pos_tags = [('z_POS',token[1]) for token in instance]
                labels = [token[2] for token in instance]
                tokens = [token[0] for token in instance]
                # Inner enumerate deliberately shadows n; sentence index was
                # saved above as sentence_n.
                for n,token in enumerate(tokens):
                    dict_features = fe.get_features([token],labels=labels,outp_label=False,legacy_features=pos_tags)[0]
                    feature_sets.append([dict_features, labels[n]])
        self.classifier.train(self.apply_feature_template(feature_sets,out_label=True))
        return

    def classify(self,feature_sets):
        """
        Args:
            feature_sets:
                a list of dictionaries like the following:
                {'a_token': u'Nella',
                 'b_punct': 'OTHERS',
                 'c_brackets': 'OTHERS',
                 'd_case': 'INIT_CAPS',
                 'e_number': 'NO_DIGITS',
                 'f_ngram_1': u'N',
                 'f_ngram_2': u'Ne',
                 'f_ngram_3': u'Nel',
                 'f_ngram_4': u'Nell',
                 'g_ngram_1': u'a',
                 'g_ngram_2': u'la',
                 'g_ngram_3': u'lla',
                 'g_ngram_4': u'ella',
                 'h_lowcase': u'nella',
                 'i_str-length': '5',
                 'l_pattern': 'Aaaaa',
                 'm_compressed-pattern': 'Aa',
                 'n_works_dictionary': 'OTHERS',
                 'z': '_'}
        Returns:
            result:
                a list of dictionaries, one per token, e.g.
                [{'label': 'O', 'token': '.'}, ...]
        """
        # apply feature templates (from CRF++)
        template_feature_sets = self.apply_feature_template(feature_sets,out_label=False)
        # keep the output labels
        output_labels = self.classifier.classify_many(template_feature_sets)
        result = []
        for n,feature_set in enumerate(feature_sets):
            temp = {}
            temp["token"]=feature_set["a_token"].encode('utf-8')
            temp["label"]=str(output_labels[n])
            result.append(temp)
        return result

    def process_template(self,template_file):
        """
        Compile a CRF++ template file into (format_string, offsets) pairs.

        Example of the output:
        [('U01:%s', [(-2, 0)]), ('U02:%s', [(-1, 0)]),...]
        """
        # `with` guarantees the handle is closed even if parsing raises
        # (the original used open/close and could leak on error).
        with open(template_file,'r') as f:
            # skip blank lines, comments and the bigram marker 'B'
            lines = [line.replace('\n','') for line in f.readlines() if not line.startswith('\n') and not line.startswith('#') and not line.startswith('B')]
        import re
        # raw string so the backslashes reach the regex engine unmangled
        exp = re.compile(r"%x\[(-?\d+),(-?\d+)\]")
        result = []
        for line in lines:
            result.append((exp.sub('%s',line),[(int(match[0]),int(match[1])) for match in exp.findall(line)]))
        return result

    def apply_feature_template(self,feature_sets,out_label=False):
        """
        Apply each of the compiled templates to every token's feature vector,
        producing one {template_name: expanded_value} dict per token.
        When out_label is True, each input is [feature_dict, label] and the
        label is carried through for training.
        """
        def get_value(feature_sets,token_n,feature_n):
            # 'ND' (not defined) pads positions outside the sentence window.
            if(token_n < 0):
                return "ND"
            elif(token_n > (len(feature_sets)-1)):
                return "ND"
            else:
                return feature_sets[token_n][feature_n]
        # Flatten each token's dict into a list ordered by sorted key name so
        # the template (row, column) offsets index features positionally.
        if(out_label):
            unlabelled_feature_sets = [[f[0][key] for key in sorted(f[0])] for f in feature_sets]
        else:
            unlabelled_feature_sets = [[f[key] for key in sorted(f)] for f in feature_sets]
        assert len(feature_sets) == len(unlabelled_feature_sets)
        new_features = []
        for n,fs in enumerate(unlabelled_feature_sets):
            result = {}
            for template,replacements in self.compiled_templates:
                template_name = template.split(":")[0]
                template = template.split(":")[1]
                values = [get_value(unlabelled_feature_sets,n+r[0],r[1]) for r in replacements]
                result[template_name] = template%tuple(values)
            if(out_label):
                # keep the expected label for training
                new_features.append([result,feature_sets[n][1]])
            else:
                new_features.append(result)
        return new_features