class SKClassifier:
    """Thin wrapper around NLTK's SklearnClassifier."""

    classifier = None

    def __init__(self, cls='SVC'):
        # Use dict.get with a fallback: indexing the dict directly would
        # raise KeyError for an unknown name before the old
        # "if not self.classifier" fallback could ever run.
        self.classifier = SklearnClassifier({
            'SVC': SVC(),
            'LogisticRegression': LogisticRegression(),
            'BernoulliNB': BernoulliNB(),
        }.get(cls, SVC()))

    def train(self, trainset):
        self.classifier.train(trainset)

    def test(self, tagged, featuresets):
        predict = self.classifier.classify_many(featuresets)
        print(predict)
        return accuracy_score(tagged, predict)

    def classify(self, featureset):
        return self.classifier.classify(featureset)

    def classify_many(self, featuresets):
        return self.classifier.classify_many(featuresets)
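# A minimal usage sketch for SKClassifier, assuming the imports the class
# relies on are in scope (SklearnClassifier from nltk.classify.scikitlearn,
# SVC/LogisticRegression/BernoulliNB from sklearn, accuracy_score from
# sklearn.metrics). The feature dicts and labels are purely illustrative.
clf = SKClassifier('LogisticRegression')
clf.train([({'good': True}, 'pos'), ({'bad': True}, 'neg')])
print(clf.classify({'good': True}))  # expected: 'pos'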
def classifier_for_lemma(lemma, filenames):
    # XXX: always doing non-null and Random Forest for initial version
    classifier = SklearnClassifier(RandomForestClassifier(), sparse=False)
    print("loading training data for", lemma)
    load_training_for_word(lemma, filenames.bitextfn, filenames.alignfn,
                           filenames.annotatedfn)

    training = trainingdata.trainingdata_for(lemma, nonnull=True)
    print("got {0} instances for {1}".format(len(training), lemma))

    # delete the sentences themselves; we have the instances
    trainingdata.set_examples([], [])
    trainingdata.set_sl_annotated([])
    gc.collect()

    if len(training) > (20 * 1000):
        print("capping to 20k instances to fit in memory")
        training = training[:20 * 1000]

    labels = set(label for (feat, label) in training)
    print("loaded training data for", lemma)
    if (not training) or len(labels) < 2:
        return None
    classifier.train(training)
    return classifier
def trainClassifiers(tweets):
    # Generate the training set
    training_set = nltk.classify.util.apply_features(extract_features, tweets)
    print("Training set created!")

    # Train and save the Naive Bayes classifier to a file
    NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
    f = open('data/trained_classifiers/NBClassifier.pickle', 'wb')
    pickle.dump(NBClassifier, f, 1)
    f.close()
    print("NBClassifier Classifier Trained")

    # Train linear SVC
    linear_SVC_classifier = SklearnClassifier(LinearSVC())
    linear_SVC_classifier.train(training_set)

    # Train Max Entropy Classifier
    # MaxEntClassifier = nltk.classify.maxent.MaxentClassifier.train(
    #     training_set, 'IIS', trace=2, encoding=None, labels=None,
    #     sparse=True, gaussian_prior_sigma=0, max_iter=5)
    # f = open('data/trained_classifiers/MaxEntClassifier.pickle', 'wb')
    # pickle.dump(MaxEntClassifier, f, 1)
    # f.close()
    # print("MaxEntClassifier Classifier Trained")
    # return (training_set, NBClassifier, MaxEntClassifier)

    return (training_set, NBClassifier, linear_SVC_classifier)
def learn_model(data, target):
    bestwords = best_of_words(data, target)
    # split validation: 90% training, 10% test (test_size=0.1)
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(
        data, target, test_size=0.1, random_state=43)
    # classifier = BernoulliNB().fit(data_train, target_train)

    train_feature = []
    test_feature = []
    for i in range(len(data_train)):
        d = data_train[i]
        d = jieba.cut(d, cut_all=False)
        l = target_train[i]
        # tmp = [bigram(d), l]
        tmp = [dict([(word, True) for word in d if word in bestwords]), l]
        train_feature.append(tmp)

    for i in range(len(data_test)):
        d = data_test[i]
        d = jieba.cut(d, cut_all=False)
        l = target_test[i]
        # tmp = bigram(d)
        tmp = dict([(word, True) for word in d if word in bestwords])
        test_feature.append(tmp)

    classifier = SklearnClassifier(MultinomialNB())
    classifier.train(train_feature)
    predicted = classifier.classify_many(test_feature)
    evaluate_model(target_test, predicted)
    return classifier, bestwords
def clf_score(classifier):
    classifier = SklearnClassifier(classifier)  # nltk.classify.scikitlearn wrapper
    classifier.train(train_set)
    predict = classifier.classify_many(test)
    # classifier.prob_classify_many() would give label probabilities instead
    return accuracy_score(tag_test, predict)
def sentiment_classifier(debug):
    # trainingfp = open('training.csv', 'rb')
    train = pd.read_csv('training.csv', delimiter=',', quotechar='"',
                        escapechar='\\', header=0)
    num_tweets = train['TweetText'].size
    cleantweets = []
    for i in range(0, num_tweets):
        if debug and ((i + 1) % 1000 == 0):
            print("Tweet %d of %d\n" % (i + 1, num_tweets))
        cleantweets.append((tweet_to_words(train['TweetText'][i]),
                            train['Sentiment'][i]))

    # vectorizer = CountVectorizer(analyzer="word", tokenizer=None,
    #                              preprocessor=None, stop_words=None,
    #                              max_features=5000)
    # train_data_features = vectorizer.fit_transform([t for (t, _) in cleantweets])
    # feature_labels = [(m, l) for ((f, l), m) in zip(cleantweets, train_data_features)]
    # forest = RandomForestClassifier(n_estimators=sensitivity)
    # forest = forest.fit(train_data_features, train['Sentiment'])

    classif = SklearnClassifier(LinearSVC())
    classif.train(cleantweets)
    return classif
def score(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(trainset)
    # pred = classifier.batch_classify(test)  # deprecated spelling of classify_many
    pred = classifier.classify_many(test)
    return accuracy_score(tag_test, pred)
def buildclassifiers(featureslist, SAMPLE_PROPORTION, n):
    classnames = ['Naive Bayes', 'Logistic Regression', 'Linear SVC']
    allclassifiers = []
    for name in classnames:
        for i in range(n):
            random.shuffle(featureslist)
            train_set, test_set = buildsets(featureslist, SAMPLE_PROPORTION)
            if name == 'Naive Bayes':
                spamclassifier = NaiveBayesClassifier.train(train_set)
            if name == 'Logistic Regression':
                spamclassifier = SklearnClassifier(LogisticRegression())
                spamclassifier.train(train_set)
            if name == 'Linear SVC':
                spamclassifier = SklearnClassifier(LinearSVC(C=0.01))
                spamclassifier.train(train_set)
            perfmeasures_i = evaluate(train_set, test_set, spamclassifier, name)
            if i == 0:
                perfmeasures_n = perfmeasures_i
            else:
                # list() so the element-wise sum works under Python 3 as well
                perfmeasures_n = list(map(add, perfmeasures_n, perfmeasures_i))
        # Store last classifier built per model
        allclassifiers.append(spamclassifier)
        # Print performance measures per classifier
        printperformance(name, perfmeasures_n, n)
    return allclassifiers
def evaluate(train_qs, test_qs, params, d):
    data = [train_qs, test_qs]
    (W, b, W2, b2, W3, b3, L) = params
    train_feats = []
    test_feats = []

    for tt, split in enumerate(data):
        for qs, ans in split:
            prev_qs = zeros((d, 1))
            prev_sum = zeros((d, 1))
            count = 0.
            history = []
            for dist in qs:
                sent = qs[dist]
                # input is average of all nouns in sentence
                # av = average(L[:, sent], axis=1).reshape((d, 1))
                history += sent
                prev_sum += sum(L[:, sent], axis=1).reshape((d, 1))
                if len(history) == 0:
                    av = zeros((d, 1))
                else:
                    av = prev_sum / len(history)

                # apply non-linearity
                p = relu(W.dot(av) + b)
                p2 = relu(W2.dot(p) + b2)
                p3 = relu(W3.dot(p2) + b3)

                curr_feats = {}
                for dim, val in ndenumerate(p3):
                    curr_feats['__' + str(dim)] = val
                if tt == 0:
                    train_feats.append((curr_feats, ans[0]))
                else:
                    test_feats.append((curr_feats, ans[0]))

    print('total training instances:', len(train_feats))
    print('total testing instances:', len(test_feats))
    random.shuffle(train_feats)

    # can modify this classifier / do grid search on regularization parameter using sklearn
    classifier = SklearnClassifier(LogisticRegression(C=10))
    classifier.train(train_feats)

    print('accuracy train:', nltk.classify.util.accuracy(classifier, train_feats))
    print('accuracy test:', nltk.classify.util.accuracy(classifier, test_feats))
    print('')

    print('dumping classifier')
    # cPickle in the Python 2 original; pickle is the Python 3 equivalent
    pickle.dump(classifier, open('data/deep/classifier', 'wb'),
                protocol=pickle.HIGHEST_PROTOCOL)
def evaluate(classifier_alo):
    classifier = SklearnClassifier(classifier_alo)  # use scikit-learn through the NLTK wrapper
    classifier.train(trainFeatures)  # train the classifier

    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, item in enumerate(testFeatures):
        referenceSets[item[1]].add(i)
        predicted = classifier.classify(item[0])
        testSets[predicted].add(i)

    pos_pre = nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
    pos_recall = nltk.metrics.recall(referenceSets['pos'], testSets['pos'])
    neg_pre = nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
    neg_recall = nltk.metrics.recall(referenceSets['neg'], testSets['neg'])
    pos_f1 = 2 * (pos_pre * pos_recall) / (pos_pre + pos_recall)
    neg_f1 = 2 * (neg_pre * neg_recall) / (neg_pre + neg_recall)

    print('{0:.3f} {1:.3f} {2:.3f} {3:.3f} {4:.3f} {5:.3f}'.format(
        pos_pre, pos_recall, neg_pre, neg_recall, pos_f1, neg_f1))
def validate(data, params, d):
    stop = stopwords.words("english")
    (rel_dict, Wv, b, L) = params

    print("validating, adding lookup")
    for split in data:
        for tree in split:
            for node in tree.get_nodes():
                node.vec = L[:, node.ind].reshape((d, 1))

    train_feats = []
    val_feats = []

    for tt, split in enumerate(data):
        if tt == 0:
            print("processing train")
        else:
            print("processing val")
        for num_finished, tree in enumerate(split):
            # process validation trees
            forward_prop(None, params, tree, d, labels=False)
            ave = zeros((d, 1))
            words = zeros((d, 1))
            count = 0
            wcount = 0
            word_list = []
            for ex, node in enumerate(tree.get_nodes()):
                if ex != 0 and node.word not in stop:
                    ave += node.p_norm
                    count += 1
            ave = ave / count
            featvec = ave.flatten()

            curr_feats = {}
            for dim, val in ndenumerate(featvec):
                curr_feats["_" + str(dim)] = val
            if tt == 0:
                train_feats.append((curr_feats, tree.ans))
            else:
                val_feats.append((curr_feats, tree.ans))

    print("training")
    classifier = SklearnClassifier(LogisticRegression(C=10))
    classifier.train(train_feats)

    print("predicting...")
    train_acc = nltk.classify.util.accuracy(classifier, train_feats)
    val_acc = nltk.classify.util.accuracy(classifier, val_feats)
    return train_acc, val_acc
def svm(train_data, preprocessing=True):
    training_data = []
    for data in train_data:
        training_data.append(preprocess(data[0], label=data[1]))
    cl = SklearnClassifier(LinearSVC())
    cl.train(training_data)
    return cl
class chatBot(object):

    def __init__(self):
        self.posts = nltk.corpus.nps_chat.xml_posts()
        self.categories = ['Emotion', 'ynQuestion', 'yAnswer', 'Continuer',
                           'whQuestion', 'System', 'Accept', 'Clarify',
                           'Emphasis', 'nAnswer', 'Greet', 'Statement',
                           'Reject', 'Bye', 'Other']
        self.mapper = [0, 2, 6, 3, 11, 5, 8, 1, 8, 3, 10, 11, 13, 13, 13]
        self.responses = {}
        self.featuresets = []
        self.train = []
        self.test = []
        self.testSet = []
        self.testSetClass = []
        self.classif = SklearnClassifier(LinearSVC())
        for i in range(0, 15):
            self.responses[i] = []
        for post in self.posts:
            self.featuresets.append((self.tokenize(post.text),
                                     self.categories.index(post.get('class'))))
            self.temp = self.responses[self.categories.index(post.get('class'))]
            self.temp.append(post.text)

    def tokenize(self, sentence):
        """
        Extracts a set of features from a message.
        """
        features = {}
        tokens = nltk.word_tokenize(sentence)
        for t in tokens:
            features['contains(%s)' % t.lower()] = True
        return features

    def talk(self):
        while 1:
            inp = input("YOU: ")  # raw_input in the original Python 2 code
            features = self.tokenize(inp)
            # classify_many expects a list of featuresets, so wrap the dict
            pp = int(self.classif.classify_many([features])[0])
            m = self.mapper[pp]
            r = self.responses[m]
            # len(r) - 1: randint is inclusive at both ends, so len(r) could
            # index past the end of the response list
            val = randint(0, len(r) - 1)
            print("BOT: " + r[val])

    def trainSet(self):
        shuffle(self.featuresets)
        size = int(len(self.featuresets) * .1)  # 10% is used for the test set
        self.train = self.featuresets[size:]
        self.test = self.featuresets[:size]
        self.classif.train(self.train)
        self.testSet = []
        self.testSetClass = []
        for i in self.test:
            self.testSet.append(i[0])
            self.testSetClass.append(i[1])
        self.batch = self.classif.classify_many(self.testSet)

    def statistics(self):
        print(classification_report(self.testSetClass, self.batch,
                                    labels=list(set(self.testSetClass)),
                                    target_names=self.categories))
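# A hypothetical driver for the chatBot class above, assuming NLTK's
# nps_chat corpus is installed. trainSet() must run before talk() so the
# wrapped LinearSVC is actually fitted; talk() then loops reading input
# until interrupted.
bot = chatBot()
bot.trainSet()
bot.statistics()
bot.talk()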
def main3():
    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.svm import LinearSVC
    from sklearn.metrics import confusion_matrix
    from matplotlib import pyplot

    svm = SklearnClassifier(LinearSVC(loss="hinge"))
    svm.train(trainData)
    print("SVM: ", nltk.classify.accuracy(svm, testData))

    results = svm.classify_many(item[0] for item in testData)
    print(results)

    # getting a full report; target_names must supply one display name per
    # label, so passing the whole t_test_skl list there was incorrect
    from sklearn.metrics import classification_report
    print(classification_report(t_test_skl, results,
                                labels=list(set(t_test_skl))))

    # Compute confusion matrix
    import numpy as np
    cmm = confusion_matrix([x[1] for x in testData], results)
    print(cmm)
    cmm = np.array(cmm, dtype=float)  # np.float was removed from NumPy
    print(cmm.shape)

    # Show confusion matrix in a separate window
    print(pyplot.imshow(cmm, interpolation='nearest'))
def SVM(training_set, test_set):
    classifier = SklearnClassifier(LinearSVC())
    print("Training a new SVM classifier")
    classifier.train(training_set)
    print("Accuracy of SVM in training:",
          nltk.classify.accuracy(classifier, test_set))
    # classifier.show_most_informative_features(5)

    accuracy = nltk.classify.accuracy(classifier, test_set)
    trueLabels = [l for d, l in test_set]
    predictedLabels = classifier.classify_many([d for d, t in test_set])

    def runTrained(test_set, hasTags=False):
        # run the already-trained classifier on a new test set
        if hasTags:
            tagglessTest_set = [data for data, tag in test_set]
            acc = nltk.classify.accuracy(classifier, test_set)
            print("Accuracy:", acc)
            predictions = classifier.classify_many(tagglessTest_set)
            return ([e for e in zip(tagglessTest_set, predictions)], acc)
        else:
            tagglessTest_set = test_set
            predictions = classifier.classify_many(tagglessTest_set)
            return [e for e in zip(tagglessTest_set, predictions)]

    return (runTrained, accuracy, predictedLabels, trueLabels)
def train(cleanedDataCollection, tagPool):
    posSamples = []
    negSamples = []
    featuresets = [(extractFeatures(d, tagPool), c)
                   for (d, c) in cleanedDataCollection]
    for sample in featuresets:
        if sample[1] == "trash":
            negSamples.append(sample)
        else:
            posSamples.append(sample)

    train_set = negSamples[10:] + posSamples[10:]
    test_set = negSamples[:10] + posSamples[:10]

    # classifier = nltk.NaiveBayesClassifier.train(train_set)
    # print(nltk.classify.accuracy(classifier, test_set))
    # classifier.show_most_informative_features(5)
    # return classifier

    sk_classifier = SklearnClassifier(MultinomialNB())
    sk_classifier.train(train_set)
    print("accuracy is: %s" % accuracy(sk_classifier, test_set))

    precision, recall, fMeasure = precision_recall_fmeasure(
        sk_classifier, test_set, "useful")
    print("precision is: %s" % precision)
    print("recall is: %s" % recall)
    print("F-measure is: %s" % fMeasure)
    return sk_classifier
def score(trainset, testset, classifier):
    classifier = SklearnClassifier(classifier)
    # keep the DictVectorizer's feature ordering stable instead of sorted
    classifier._vectorizer.sort = False
    classifier.train(trainset)
    (test, tag_test) = zip(*testset)
    pred = classifier.classify_many(test)
    return accuracy_score(tag_test, pred)
def svm(trainfeats, testfeats):
    y = []
    accuracy = []
    classif = SklearnClassifier(LinearSVC(C=0.032))
    classif.train(trainfeats)
    print("SVM output")
    print('train on %d instances, test on %d instances'
          % (len(trainfeats), len(testfeats)))
    y.append(nltk.classify.util.accuracy(classif, testfeats))
    print(y)
def svm(total_train_feats, total_test_feats):
    y = []
    accuracy = []
    classifier = SklearnClassifier(LinearSVC(C=0.032))
    classifier.train(total_train_feats)
    print('train on %d instances, test on %d instances'
          % (len(total_train_feats), len(total_test_feats)))
    y.append(nltk.classify.util.accuracy(classifier, total_test_feats))
    print(y)
    del classifier
    all_results.append(y)
def buildClassifier_score(trainSet, devtestSet, classifier):
    # split the featurized, labelled dev-test set into data and labels
    dev, tag_dev = zip(*devtestSet)
    classifier = SklearnClassifier(classifier)  # use scikit-learn through the NLTK wrapper
    classifier.train(trainSet)  # train the classifier
    # classify the dev-test data to get predicted labels
    pred = classifier.classify_many(dev)
    # compare the predictions against the gold labels for accuracy
    return accuracy_score(tag_dev, pred)
def store_classifier(clf, trainset, filepath):
    classifier = SklearnClassifier(clf)
    classifier.train(trainset)
    pred = classifier.prob_classify_many(extract_features(sentiment))
    # write the pos/neg probabilities plus the sentence to the output file
    p_file = open(filepath, 'w+')
    for (i, j) in zip(pred, sen_cur):
        p_file.write(str(i.prob('pos')) + '\t' + str(i.prob('neg')) + '\t' + j + '\n')
    p_file.close()
def train():
    pipeline = Pipeline([('tfidf', TfidfTransformer()),
                         ('chi2', SelectKBest(chi2, k=40)),
                         ('nb', MultinomialNB())])
    classif = SklearnClassifier(pipeline)
    pos = [FreqDist(i) for i in
           open('/home/mel/workspace/datascience/assignment5_kaggle/data/useful.txt', 'r').readlines()]
    neg = [FreqDist(i) for i in
           open('/home/mel/workspace/datascience/assignment5_kaggle/data/not.txt', 'r').readlines()]
    add_label = lambda lst, lab: [(x, lab) for x in lst]
    classif.train(add_label(pos, 'pos') + add_label(neg, 'neg'))
    return classif
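# A minimal usage sketch for the pipeline classifier above, assuming the same
# imports (FreqDist from nltk, the sklearn Pipeline pieces) are in scope; the
# input string is purely illustrative. SklearnClassifier vectorizes the
# featureset and pushes it through the whole Pipeline before predicting.
classif = train()
print(classif.classify(FreqDist("this release was genuinely useful")))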
def learn_model(self, featuresets):
    """
    Trains and tests the logistic regression classifier on the data.
    """
    random.shuffle(featuresets)
    limit = int(0.75 * len(featuresets))  # partitioning 3:1 for train:test
    train_set = featuresets[:limit]
    test_set = featuresets[limit:]
    lr_classifier = SklearnClassifier(LogisticRegression())
    lr_classifier.train(train_set)
    print('Logistic classifier Accuracy : ',
          nltk.classify.accuracy(lr_classifier, test_set) * 100)
def handle(self, *args, **options):
    trains = get_train_tweets()
    if not trains:
        raise CommandError('No train data, please add some from the admin page!')

    train_count = trains.count()
    train_set = generate_trainset(trains)
    nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
    sci_classifier = SklearnClassifier(LinearSVC())
    sci_classifier.train(train_set)

    while True:
        unclassified_tweets = Tweet.objects.filter(train=False, klass=None)
        total_count = unclassified_tweets.count()
        if total_count > 0:
            print('Classifying %d tweets...' % total_count)
            counts_nb = defaultdict(int)
            counts_svm = defaultdict(int)
            start_time = time.time()
            for tweet in unclassified_tweets:
                feature_vect = get_feature_vector(process_tweet(tweet.body))
                features = extract_features(feature_vect)
                sentiment_nb = nb_classifier.classify(features)
                sentiment_svm = sci_classifier.classify(features)
                counts_nb[sentiment_nb] += 1
                counts_svm[sentiment_svm] += 1
                tweet.klass = sentiment_nb
                tweet.klass_svm = sentiment_svm
                msg_nb = ['%d %s' % (counts_nb[k], v) for k, v in Tweet.CLASSES]
                msg_svm = ['%d %s' % (counts_svm[k], v) for k, v in Tweet.CLASSES]
                print('\rNB: ' + ', '.join(msg_nb) + ';\tSVM: ' + ', '.join(msg_svm), end='')
                tweet.save()
                if settings.DEBUG:
                    db.reset_queries()
            elapsed = int(time.time() - start_time)
            print('\nClassifying finished in %d seconds.' % elapsed)

            new_trains = get_train_tweets()
            if new_trains.count() != train_count:
                print('Train set has been changed, retraining...')
                trains = new_trains
                train_count = new_trains.count()
                train_set = generate_trainset(trains)
                nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
                sci_classifier = SklearnClassifier(LinearSVC())
                sci_classifier.train(train_set)
        else:
            print('Waiting...')
            time.sleep(3)
def get_performance(clf_sel, train_features, test_features):
    ref_set = collections.defaultdict(set)
    test_set = collections.defaultdict(set)
    classification_error = False

    clf = SklearnClassifier(clf_sel)
    try:
        classifier = clf.train(train_features)
    except Exception:
        classification_error = True
        # print(str(clf_sel.__class__), 'NA')

    if not classification_error:
        # only pickle a successfully trained MultinomialNB model; doing this
        # outside the success branch could hit an unbound `classifier`
        if str(clf_sel.__class__) == "<class 'sklearn.naive_bayes.MultinomialNB'>":
            pickle_cls(classifier, 'MultinomialNB')

        clf_acc = nltk.classify.accuracy(classifier, test_features)
        for i, (features, label) in enumerate(test_features):
            ref_set[label].add(i)
            predicted = classifier.classify(features)
            test_set[predicted].add(i)

        pos_precision = precision(ref_set['pos'], test_set['pos'])
        pos_recall = recall(ref_set['pos'], test_set['pos'])
        neg_precision = precision(ref_set['neg'], test_set['neg'])
        neg_recall = recall(ref_set['neg'], test_set['neg'])

        print("{0},{1},{2},{3},{4},{5}".format(
            clf_sel.__class__, clf_acc, pos_precision, pos_recall,
            neg_precision, neg_recall))
def classifier_for_lemma(lemma):
    # always doing non-null and Random Forest for initial version
    classifier = SklearnClassifier(RandomForestClassifier(), sparse=False)
    training = trainingdata.trainingdata_for(lemma, nonnull=True)
    print("got {0} instances for {1}".format(len(training), lemma))

    if len(training) > (20 * 1000):
        print("capping to 20k instances to fit in memory")
        training = training[:20 * 1000]

    labels = set(label for (feat, label) in training)
    print("loaded training data for", lemma)
    if (not training) or len(labels) < 2:
        return None
    classifier.train(training)
    return classifier
def trainPosNeg(self):
    positive = "./positive"
    negative = "./negative"
    pos_files = ptr(positive, '.*')
    neg_files = ptr(negative, '.*')

    pos_all_words = [pos_files.raw(fileid).split(" ")
                     for fileid in pos_files.fileids()]
    neg_all_words = [neg_files.raw(fileid).split(" ")
                     for fileid in neg_files.fileids()]

    pos_splited_words = [(self.getBigrams(words), 'positive')
                         for words in pos_all_words]
    neg_splited_words = [(self.getBigrams(words), 'negative')
                         for words in neg_all_words]

    pos_neg_trainfeats = pos_splited_words[:] + neg_splited_words[:]
    classifier = SklearnClassifier(LinearSVC())
    classifier.train(pos_neg_trainfeats)
    return classifier
def cross_validation(data_set, n_folds=8):
    kf = KFold(len(data_set), n_folds=n_folds)
    best_accuracy = -1
    training_accuracy = 0
    for train, cv in kf:
        classifier = SklearnClassifier(
            Pipeline([('tfidf', TfidfTransformer()),
                      ('nb', LinearSVC(C=1, tol=0.000001))]))
        training_data = data_set[0:cv[0]] + data_set[cv[-1]:]
        cv_data = data_set[cv[0]:cv[-1] + 1]
        classifier.train(training_data)
        accuracy = classify.accuracy(classifier, cv_data)
        if accuracy > best_accuracy:
            best_classifier = classifier
            best_accuracy = accuracy
            training_accuracy = classify.accuracy(classifier, training_data)
    return best_classifier, training_accuracy, best_accuracy
def cross_validate(data, model=None):
    training_set = nltk.classify.apply_features(preprocess, data)
    cv = cross_validation.KFold(len(training_set), n_folds=10,
                                shuffle=False, random_state=None)
    if model == "svm" or model == "SVM":
        svm = SklearnClassifier(LinearSVC())
        for traincv, testcv in cv:
            # +1 so the last index of each fold is included in the slice
            classifier = svm.train(training_set[traincv[0]:traincv[-1] + 1])
            print('accuracy:', nltk.classify.util.accuracy(
                classifier, training_set[testcv[0]:testcv[-1] + 1]))
def TrainClassifiers():
    training_set, testing_set = TestTrainData()
    classifiers = list()
    classifier_name = list()

    NaiveBayesClassifier_classifier = NaiveBayesClassifier.train(training_set)
    classifiers.append(NaiveBayesClassifier_classifier)
    classifier_name.append("NaiveBayesClassifier")

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    classifiers.append(MNB_classifier)
    classifier_name.append("MultinomialNBClassifier")

    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    classifiers.append(BernoulliNB_classifier)
    classifier_name.append("BernoulliNBClassifier")

    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    classifiers.append(LogisticRegression_classifier)
    classifier_name.append("LogisticRegressionClassifier")

    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    # was appending the LogisticRegression classifier a second time here
    classifiers.append(LinearSVC_classifier)
    classifier_name.append("LinearSVCClassifier")

    SGDC_classifier = SklearnClassifier(SGDClassifier())
    SGDC_classifier.train(training_set)
    classifiers.append(SGDC_classifier)
    classifier_name.append("SGDClassifier")

    print("Naive_Bayes Algo accuracy percent:",
          (classify.accuracy(NaiveBayesClassifier_classifier, testing_set)) * 100)
    print("MNB_classifier accuracy percent:",
          (classify.accuracy(MNB_classifier, testing_set)) * 100)
    print("BernoulliNB_classifier accuracy percent:",
          (classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)
    print("LogisticRegression_classifier accuracy percent:",
          (classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)
    print("LinearSVC_classifier accuracy percent:",
          (classify.accuracy(LinearSVC_classifier, testing_set)) * 100)
    print("SGDClassifier accuracy percent:",
          (classify.accuracy(SGDC_classifier, testing_set)) * 100)

    SaveClassifiers(classifiers, classifier_name)
    return classifiers
def train(trainfeats, testfeats, nlt=True, skl=True, most=10):
    # print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))
    nltk_output = dict()
    sklearn_output = dict()

    if nlt:
        my_classifier = NaiveBayesClassifier.train(trainfeats)
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = my_classifier.classify(feats)
            testsets[observed].add(i)

        # precision and recall
        accuracy = 0
        pos_prec = 0
        pos_rec = 0
        neg_prec = 0
        neg_rec = 0
        try:
            accuracy = nltk.classify.util.accuracy(my_classifier, testfeats) * 100
            pos_prec = precision(refsets[4], testsets[4]) * 100
            pos_rec = recall(refsets[4], testsets[4]) * 100
            neg_prec = precision(refsets[0], testsets[0]) * 100
            neg_rec = recall(refsets[0], testsets[0]) * 100
        except Exception as e:
            print(e)

        # print('pos F-measure:', f_measure(refsets['pos'], testsets['pos']))
        # print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))
        # my_classifier.show_most_informative_features(most)
        nltk_output['accuracy'] = round(accuracy, 1)
        nltk_output['pos_prec'] = round(pos_prec, 1)
        nltk_output['neg_prec'] = round(neg_prec, 1)
        nltk_output['pos_rec'] = round(pos_rec, 1)
        nltk_output['neg_rec'] = round(neg_rec, 1)
        nltk_output['most1'] = my_classifier.most_informative_features()[0][0]
        nltk_output['most2'] = my_classifier.most_informative_features()[1][0]
        nltk_output['most3'] = my_classifier.most_informative_features()[2][0]

    if skl:
        MNB_classifier = SklearnClassifier(MultinomialNB())
        MNB_classifier._vectorizer.sort = False
        MNB_classifier.train(trainfeats)
        mnb = nltk.classify.accuracy(MNB_classifier, testfeats) * 100

        BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
        BernoulliNB_classifier._vectorizer.sort = False
        BernoulliNB_classifier.train(trainfeats)
        bnb = nltk.classify.accuracy(BernoulliNB_classifier, testfeats) * 100

        LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
        LogisticRegression_classifier._vectorizer.sort = False
        LogisticRegression_classifier.train(trainfeats)
        lr = nltk.classify.accuracy(LogisticRegression_classifier, testfeats) * 100

        LinearSVC_classifier = SklearnClassifier(LinearSVC())
        LinearSVC_classifier._vectorizer.sort = False
        LinearSVC_classifier.train(trainfeats)
        lsvc = nltk.classify.accuracy(LinearSVC_classifier, testfeats) * 100

        NuSVC_classifier = SklearnClassifier(NuSVC())
        NuSVC_classifier._vectorizer.sort = False
        NuSVC_classifier.train(trainfeats)
        nsvc = nltk.classify.accuracy(NuSVC_classifier, testfeats) * 100

        voted_classifier = VoteClassifier(NuSVC_classifier,
                                          LinearSVC_classifier,
                                          MNB_classifier,
                                          BernoulliNB_classifier,
                                          LogisticRegression_classifier)
        voted = nltk.classify.accuracy(voted_classifier, testfeats) * 100

        sklearn_output['mnb'] = round(mnb, 1)
        sklearn_output['bnb'] = round(bnb, 1)
        sklearn_output['lr'] = round(lr, 1)
        sklearn_output['lsvc'] = round(lsvc, 1)
        sklearn_output['nsvc'] = round(nsvc, 1)
        sklearn_output['voted'] = round(voted, 1)

    return (nltk_output, sklearn_output)
class Classifier:

    def __init__(self, trainNew, trainDirectory):
        self.data = {}
        self.labels = ["bearish", "bullish", "neutral"]
        self.trainSet = []
        self.testSet = []
        self.featureList = []
        self.featureSets = []
        self.trainDir = trainDirectory
        if not trainNew and exists("./data/models/svm.pickle"):
            self.model = pickle.load(open('./data/models/svm.pickle', 'rb'))
        else:
            print("Training model")
            self.model = SklearnClassifier(LinearSVC(random_state=0, tol=1e-5))
            self.load_data()
            self.extract_features()
            self.train_model()

    def load_data(self):
        self.data = {}
        for label in self.labels:
            file = open(self.trainDir + label + ".txt", 'r')
            for line in file:
                self.data[line] = label
        return self.data

    def extract_features(self):
        if len(self.data) == 0:
            self.load_data()
        f = FeatureExtractor()
        counter = 0
        for tweet, label in self.data.items():
            featureVector = f.get_feature_vector(tweet)
            if len(featureVector) > 0:
                # dictionary of [bigrams in a tweet] : sentiment of that tweet
                self.featureSets.append((featureVector, label))
                # list of all bigrams; repeats are removed below
                self.featureList = self.featureList + featureVector
                self.trainSet.append(
                    (dict([(tuple(word), True) for word in featureVector]), label))
                counter += 1
        print(len(self.featureSets), "tweets total")
        self.featureList = list(set(tuple(i) for i in self.featureList))
        print(len(self.featureList), "unique features")
        print(len(self.trainSet), "training tweets")
        return self.trainSet

    def train_model(self):
        if len(self.trainSet) == 0:
            self.extract_features()
        self.model.train(self.trainSet)
        pickle.dump(self.model, open("data/models/svm.pickle", "wb"))
        return self.model

    def classify(self, tweetText):
        f = FeatureExtractor()
        tweetText = str(tweetText)
        featureVector = f.get_feature_vector(tweetText)
        features = dict([(tuple(word), True) for word in featureVector])
        prediction = self.model.classify(features)
        return prediction
### Evaluate the final classifier on the held-out test set
test, tag_test = zip(*testSet)

def final_score(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(trainSet)  # `train` was undefined here; trainSet is the full training data
    pred_1 = classifier.classify_many(test)
    return accuracy_score(tag_test, pred_1)

trainSet = posFeatures + negFeatures

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(trainSet)
pickle.dump(BernoulliNB_classifier, open('classifier1.pkl', 'wb+'))

def getgood():
    # compare the classifiers that did best on the dev set
    print('BernoulliNB`s accuracy is %f' % score(BernoulliNB()))
    print(final_score(BernoulliNB()))
    print('MultinomialNB`s accuracy is %f' % score(MultinomialNB()))
    print(final_score(MultinomialNB()))
    print('LogisticRegression`s accuracy is %f' % score(LogisticRegression()))
    print(final_score(LogisticRegression()))
    print('SVC`s accuracy is %f' % score(SVC()))
    print(final_score(SVC()))
    print('LinearSVC`s accuracy is %f' % score(LinearSVC()))
    print(final_score(LinearSVC()))
    print('NuSVC`s accuracy is %f' % score(NuSVC()))
def store_classifier(clf, trainset, filepath):
    classifier = SklearnClassifier(clf)
    classifier.train(trainset)
    # use pickle to store the classifier; pickles must be written in binary mode
    pickle.dump(classifier, open(filepath, 'wb'))
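# A matching load sketch, assuming pickle is imported and store_classifier()
# was called earlier; the path and featureset here are purely illustrative.
with open('classifier.pickle', 'rb') as f:
    restored = pickle.load(f)
print(restored.classify({'example_feature': True}))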
def clf_score(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(train_set)
    # batch_classify was renamed classify_many in newer NLTK releases
    predict = classifier.classify_many(test)
    return accuracy_score(tag_test, predict)
def extract_features(document):
    # (reconstructed header for the truncated feature-extraction helper)
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

featuresets = [(extract_features(d), c) for (d, c) in tweets]
print('featuresets: ', len(featuresets))
train_set, test_set = featuresets[:1900], featuresets[1900:]

# Multinomial Naive Bayes classifier
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k='all')),
                     ('nb', MultinomialNB())])
classif = SklearnClassifier(pipeline)
classif.train(train_set)

# Max entropy classifier
# classif = MaxentClassifier.train(train_set, 'megam')

print(nltk.classify.accuracy(classif, test_set))
pred = classif.classify_many([feature for feature, sentiment in test_set])
test_true = [sentiment for feature, sentiment in test_set]
matx = confusion_matrix(test_true, pred)
print(matx)

# joblib.dump(tweets, 'tweets.pkl')
# joblib.dump(classif, 'classif.pkl')
all_bigrams = [
    b for t in tweets
    for b in zip(t.split(" ")[:-1], t.split(" ")[1:])
]
b_features = Counter(all_bigrams).most_common(500)
bigram_features = [bigram for (bigram, freq) in b_features]

def find_features(single_tweet):
    words = set(single_tweet)
    features = {}
    for w in bigram_features:
        features[w] = (w in words)
    return features

featuresets = [(find_features(tweet), stance)
               for (tweet, stance) in tweets_with_labels]
random.shuffle(featuresets)
training_set = featuresets[:500]
testing_set = featuresets[500:]  # [501:] silently dropped example 500

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percentage:",
      (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percentage:",
      (nltk.classify.accuracy(SVC_classifier, testing_set)) * 100)
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

# In[34]:

svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

# In[35]:

classifier_sklearn.train(training_data)

# In[36]:

nltk.classify.accuracy(classifier_sklearn, testing_data)

# In[37]:

from sklearn.ensemble import RandomForestClassifier

# In[38]:
## Load classifier
fh_in = open("naivebayes.pickle", "rb")  ## open pickle file to read, rb = read in bytes
classifier = pickle.load(fh_in)          ## load classifier
fh_in.close()                            ## close pickle file

################################## sk-learn classifiers ##################################
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

MNB_clf = SklearnClassifier(MultinomialNB())
MNB_clf.train(training_set)
print("MNB_classifier accuracy", nltk.classify.accuracy(MNB_clf, test_set))

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy:",
      (nltk.classify.accuracy(BernoulliNB_classifier, test_set)) * 100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:",
      (nltk.classify.accuracy(SVC_classifier, test_set)) * 100)

#### Naive Bayes classification
## Prepare data - could have done this above, but the analyses below required a new loop
x = []
def buildFeatures(tokenized):
    for p in short_pos.split('\n'):
        documents.append((p, "pos"))
        words = word_tokenize(p)
        pos = nltk.pos_tag(words)
        for w in pos:
            if w[1][0] in allowed_word_types:
                all_words.append(w[0].lower())

    for p in short_neg.split('\n'):
        documents.append((p, "neg"))
        words = word_tokenize(p)
        pos = nltk.pos_tag(words)
        for w in pos:
            if w[1][0] in allowed_word_types:
                all_words.append(w[0].lower())

    save_documents = open("documents.pickle", "wb")
    pickle.dump(documents, save_documents)
    save_documents.close()

    all_words = nltk.FreqDist(all_words)
    word_features = list(all_words.keys())[:5000]

    save_word_features = open("word_features5k.pickle", "wb")
    pickle.dump(word_features, save_word_features)
    save_word_features.close()

    def find_features(document):
        words = word_tokenize(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return features

    featuresets = [(find_features(rev), category) for (rev, category) in documents]
    random.shuffle(featuresets)
    print(len(featuresets))

    testing_set = featuresets[10000:]
    training_set = featuresets[:10000]

    classifier = nltk.NaiveBayesClassifier.train(training_set)
    print("Original Naive Bayes Algo accuracy percent:",
          (nltk.classify.accuracy(classifier, testing_set)) * 100)
    classifier.show_most_informative_features(15)

    ###############
    save_classifier = open("originalnaivebayes5k.pickle", "wb")
    pickle.dump(classifier, save_classifier)
    save_classifier.close()

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    print("MNB_classifier accuracy percent:",
          (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)
    save_classifier = open("MNB_classifier5k.pickle", "wb")
    pickle.dump(MNB_classifier, save_classifier)
    save_classifier.close()

    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    print("BernoulliNB_classifier accuracy percent:",
          (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)
    save_classifier = open("BernoulliNB_classifier5k.pickle", "wb")
    pickle.dump(BernoulliNB_classifier, save_classifier)
    save_classifier.close()

    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    print("LogisticRegression_classifier accuracy percent:",
          (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)
    save_classifier = open("LogisticRegression_classifier5k.pickle", "wb")
    pickle.dump(LogisticRegression_classifier, save_classifier)
    save_classifier.close()

    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    print("LinearSVC_classifier accuracy percent:",
          (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)
    save_classifier = open("LinearSVC_classifier5k.pickle", "wb")
    pickle.dump(LinearSVC_classifier, save_classifier)
    save_classifier.close()

    SGDC_classifier = SklearnClassifier(SGDClassifier())
    SGDC_classifier.train(training_set)
    print("SGDClassifier accuracy percent:",
          nltk.classify.accuracy(SGDC_classifier, testing_set) * 100)
    save_classifier = open("SGDC_classifier5k.pickle", "wb")
    pickle.dump(SGDC_classifier, save_classifier)
    save_classifier.close()
# classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

print("Original Naive Bayes Accuracy Percentage: ",
      (nltk.classify.accuracy(classifier, test_set)) * 100)
classifier.show_most_informative_features(30)

# save_classifier = open("naivebayes.pickle", "wb")
# pickle.dump(classifier, save_classifier)
# save_classifier.close()

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_set)
print("MNB_classifier Accuracy Percentage: ",
      (nltk.classify.accuracy(MNB_classifier, test_set)) * 100)

# use distinct variable names so the imported class names (BernoulliNB,
# LogisticRegression, SGDClassifier) are not shadowed by the instances
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(train_set)
print("BernoulliNB Accuracy Percentage: ",
      (nltk.classify.accuracy(BernoulliNB_classifier, test_set)) * 100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_set)
print("LogisticRegression Accuracy Percentage: ",
      (nltk.classify.accuracy(LogisticRegression_classifier, test_set)) * 100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(train_set)
print("here") featuresets = [(find_features(rev), category) for (rev, category) in documents] '''DecisionTreeClassifier_classifier = SklearnClassifier(tree.DecisionTreeClassifier()) DecisionTreeClassifier_classifier.train(training_set) print(nltk.classify.accuracy(DecisionTreeClassifier_classifier, testing_set))''' print("here") accuracy_sum = 0 for j in range(0, 10): random.shuffle(featuresets) testing_set = featuresets[1900:] training_set = featuresets[:1900] # classifier = OpinionLexiconClassifier() # accuracy = nltk.classify.accuracy(classifier) NuSVC_classifier = SklearnClassifier(NuSVC(nu=0.8)) NuSVC_classifier.train(training_set) accuracy = nltk.classify.accuracy(NuSVC_classifier, testing_set) accuracy_sum += accuracy print("NuSVC_classifier accuracy percent:", str(accuracy * 100)) print("Average of the ten accuracies with top 4000 features:", str(accuracy_sum / 10))
featureSets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featureSets)

# training_set = featureSets[:750]
# testing_set = featureSets[750:]
training_set = featureSets[:360]
testing_set = featureSets[360:]

NB_classifier = nltk.NaiveBayesClassifier.train(training_set)
# classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

LR_classifier = SklearnClassifier(LogisticRegression())
LR_classifier.train(training_set)

# stochastic gradient descent
SGD_classifier = SklearnClassifier(SGDClassifier())
SGD_classifier.train(training_set)

SV_classifier = SklearnClassifier(SVC())
SV_classifier.train(training_set)

LSV_classifier = SklearnClassifier(LinearSVC())
LSV_classifier.train(training_set)

RF_classifier = SklearnClassifier(RandomForestClassifier())
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)
print(len(featuresets))

testing_set = featuresets[10000:]
# training_set = featuresets[:10000]  # dead assignment, immediately overwritten below
training_set = featuresets[9000:10000]

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:",
      (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

save_classifier = open("pickled_algos/LinearSVC_classifier5k.pickle", "wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

print("Execution time: ")
# note: time.clock() was removed in Python 3.8; time.perf_counter() replaces it
print(time.clock() - start_time)
# uncomment below line - Initial training
classifier = nltk.NaiveBayesClassifier.train(training_set)

# save classifier to a file
dump.dump(classifier, "naivebayes")
print("naive Bayes accuracy: ",
      nltk.classify.accuracy(classifier, testing_set) * 100)
# classifier.show_most_informative_features(10)

# MultinomialNB, BernoulliNB
# train & save
MultinomialNB_classifier = SklearnClassifier(MultinomialNB())
MultinomialNB_classifier.train(training_set)
dump.dump(MultinomialNB_classifier, "MNB")
print("MultinomialNB_classifier accuracy: ",
      nltk.classify.accuracy(MultinomialNB_classifier, testing_set) * 100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
dump.dump(BernoulliNB_classifier, "BNB")
print("BernoulliNB_classifier accuracy: ",
      nltk.classify.accuracy(BernoulliNB_classifier, testing_set) * 100)

# LogisticRegression, SGDClassifier
# SVC, LinearSVC, NuSVC
# LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)
training_set = featuresets[:17500]
test_set = featuresets[17500:]

# Fitting Naive Bayes
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy percent:",
      (nltk.classify.accuracy(classifier, test_set)) * 100)
classifier.show_most_informative_features(15)

# Fitting Multinomial Naive Bayes
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",
      (nltk.classify.accuracy(MNB_classifier, test_set)) * 100)

# Fitting Bernoulli Naive Bayes
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",
      (nltk.classify.accuracy(BNB_classifier, test_set)) * 100)

# Fitting Logistic Regression
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:",
      (nltk.classify.accuracy(LogisticRegression_classifier, test_set)) * 100)
def find_features(sent):
    # (reconstructed header for the truncated feature-extraction helper)
    words = word_tokenize(sent)
    features = {}
    for w in BOW:
        features[w] = (w in words)
    return features

train += test

NBclassifier = nltk.NaiveBayesClassifier.train(train)
# print("original NB accuracy", (nltk.classify.accuracy(NBclassifier, test)) * 100)

MNBclassifier = SklearnClassifier(MultinomialNB())
MNBclassifier.train(train)
# print("classifier accuracy", (nltk.classify.accuracy(MNBclassifier, test)) * 100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(train)
# print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test)) * 100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(train)
# print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, test)) * 100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(train)
# print("BernoulliNB_classifier accuracy", (nltk.classify.accuracy(BernoulliNB_classifier, test)) * 100)

print("Trained")
testing_set = featuresets[10000:]
training_set = featuresets[:10000]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

###############
save_classifier = open("pickled_algos/originalnaivebayes5k.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:",
      (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

save_classifier = open("pickled_algos/MNB_classifier5k.pickle", "wb")
pickle.dump(MNB_classifier, save_classifier)
save_classifier.close()

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:",
      (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)

save_classifier = open("pickled_algos/BernoulliNB_classifier5k.pickle", "wb")
pickle.dump(BernoulliNB_classifier, save_classifier)
save_classifier.close()

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
# * The labels are categories (the event numbers, in our case)
featureSet = [({'Dit': 1.1, 'zijn': 0.9, 'features': 0.4, "testFeature": False}, 1),
              ({'Dit': 1.1, 'ook': 1.0, "testFeature": True}, 2)]

# train the NLTK Naive Bayes classifier
NLTK_NB = NaiveBayesClassifier.train(featureSet)

# train the scikit-learn MultinomialNB classifier
SCI_NB.train(featureSet)

# train the scikit-learn SVM classifier
SVM.train(featureSet)

classifier = SVM

# try out the classifier
print(classifier.classify({"Dit": 1.1, "zijn": 0.9}))  # 1
print(classifier.classify({"zijn": 0.5, "Dit": 1.1, "testFeature": True}))  # 2

# * OPTIONAL
# TEST THIS WITH MORE DATA: find the best parameters for the SVM
# automatically with GridSearch. Not yet sure whether this will work;
# first let's see whether everything runs at all
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("original_accuracy by naive_bayes:",
      nltk.classify.accuracy(classifier, test_set) * 100)
classifier.show_most_informative_features(15)

save_classifier = open("originalnaivebayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

# MNB classifier
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("mnb_accuracy:", nltk.classify.accuracy(MNB_classifier, test_set) * 100)

save_classifier = open("MNB_classifier.pickle", "wb")
pickle.dump(MNB_classifier, save_classifier)
save_classifier.close()

# Bernoulli classifier
BN_classifier = SklearnClassifier(BernoulliNB())
BN_classifier.train(training_set)
print("bn_accuracy:", nltk.classify.accuracy(BN_classifier, test_set) * 100)

save_classifier = open("BernoulliNB_classifier.pickle", "wb")
# was dumping an undefined BernoulliNB_classifier; BN_classifier is the trained model
pickle.dump(BN_classifier, save_classifier)
save_classifier.close()
classifier_l2 = pickle.load(f)

"""# Using Scikit-Learn API"""

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_set)
print("Test Accuracy of MNB Classifier: %0.2f "
      % (nltk.classify.accuracy(MNB_classifier, test_set) * 100))

# GaussianNB requires dense input, so it is skipped here
# GNB_classifier = SklearnClassifier(GaussianNB())
# GNB_classifier.train(train_set)
# print("Test Accuracy of GNB Classifier: %0.2f " % (nltk.classify.accuracy(GNB_classifier, test_set) * 100))

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_set)
print("Test Accuracy of Logistic Regression Classifier: %0.2f "
      % (nltk.classify.accuracy(LogisticRegression_classifier, test_set) * 100))

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(train_set)
print("Test Accuracy of SVC Classifier: %0.2f "
      % (nltk.classify.accuracy(SVC_classifier, test_set) * 100))

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
def final_score(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(train)
    pred_1 = classifier.classify_many(test)
    return accuracy_score(tag_test, pred_1)
def score(classifier, train_set, test, tag_test):
    classifier = SklearnClassifier(classifier)
    classifier.train(train_set)
    # batch_classify was renamed classify_many in newer NLTK releases
    pred = classifier.classify_many(test)
    return accuracy_score(tag_test, pred)
testing_set = []
training_set = []
dates = []

# split the sets into training and testing sets
for n in featuresets:
    # add the feature dict and its +/- label to the training data
    training_set.append([dict(n[1]), n[2]])
for line in test_featuresets:
    testing_set.append(ast.literal_eval(line[1]))

# train data
classifier = nltk.NaiveBayesClassifier.train(training_set)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

# MaxentClassifier is an NLTK classifier, not a scikit-learn estimator,
# so it cannot be wrapped in SklearnClassifier
# Maxent_classifier = SklearnClassifier(MaxentClassifier())
# Maxent_classifier.train(training_set)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)

RandomForest_classifier = SklearnClassifier(
    RandomForestClassifier(n_estimators=100))
RandomForest_classifier.train(training_set)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
feature_sets = [(find_features(rev), category)
                for (rev, category) in documents]
training_set = feature_sets[:1900]
testing_set = feature_sets[1900:]

########################################################################
# Naive-Bayes (posterior = (prior occurrences * likelihood) / evidence)
classifier = nltk.NaiveBayesClassifier.train(training_set)
accuracy = nltk.classify.accuracy(classifier, testing_set) * 100
print("Original NB Accuracy: ", accuracy)
classifier.show_most_informative_features()

# Multinomial Naive-Bayes
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
multi_accuracy = nltk.classify.accuracy(MNB_classifier, testing_set) * 100
print("\nMultinomial NB Accuracy: ", multi_accuracy)

# Bernoulli Naive-Bayes
Bern_classifier = SklearnClassifier(BernoulliNB())
Bern_classifier.train(training_set)
bern_accuracy = nltk.classify.accuracy(Bern_classifier, testing_set) * 100
print("\nBernoulli NB Accuracy: ", bern_accuracy)

# Logistic Regression
logistic_regression_classifier = SklearnClassifier(LogisticRegression())
logistic_regression_classifier.train(training_set)
log_accuracy = nltk.classify.accuracy(logistic_regression_classifier,
                                      testing_set) * 100
def test():
    short_pos = open('resources/positive.txt', 'r', errors='ignore').read()
    short_neg = open('resources/negative.txt', 'r', errors='ignore').read()

    documents = []
    for r in short_pos.split("\n"):
        documents.append((r, 'pos'))
    for r in short_neg.split("\n"):
        documents.append((r, 'neg'))

    all_words = []
    short_pos_words = word_tokenize(short_pos)
    short_neg_words = word_tokenize(short_neg)
    for w in short_pos_words:
        all_words.append(w.lower())
    for w in short_neg_words:
        all_words.append(w.lower())

    all_words = nltk.FreqDist(all_words)
    word_features = list(all_words.keys())[:5000]

    def find_features(document):
        words = word_tokenize(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return features

    features_set = [(find_features(rev), category) for (rev, category) in documents]
    random.shuffle(features_set)
    training_set = features_set[:10000]
    testing_set = features_set[10000:]

    # clf = nltk.NaiveBayesClassifier.train(training_set)
    clf_file = open("resources/nb_basic_classifier1.pickle", "rb")
    clf = pickle.load(clf_file)
    clf_file.close()
    print("Naive Bayes Algo accuracy score:", nltk.classify.accuracy(clf, testing_set))
    clf.show_most_informative_features(15)

    MNB_clf = SklearnClassifier(MultinomialNB())
    MNB_clf.train(training_set)
    print("MNB_clf Algo accuracy score:", nltk.classify.accuracy(MNB_clf, testing_set))

    BernoulliNB_clf = SklearnClassifier(BernoulliNB())
    BernoulliNB_clf.train(training_set)
    print("BernoulliNB_clf Algo accuracy score:", nltk.classify.accuracy(BernoulliNB_clf, testing_set))

    LogisticRegression_clf = SklearnClassifier(LogisticRegression())
    LogisticRegression_clf.train(training_set)
    print("LogisticRegression_clf Algo accuracy score:", nltk.classify.accuracy(LogisticRegression_clf, testing_set))

    SGDClassifier_clf = SklearnClassifier(SGDClassifier())
    SGDClassifier_clf.train(training_set)
    print("SGDClassifier_clf Algo accuracy score:", nltk.classify.accuracy(SGDClassifier_clf, testing_set))

    # SVC_clf = SklearnClassifier(SVC())
    # SVC_clf.train(training_set)
    # print("SVC_clf Algo accuracy score:", nltk.classify.accuracy(SVC_clf, testing_set))

    LinearSVC_clf = SklearnClassifier(LinearSVC())
    LinearSVC_clf.train(training_set)
    print("LinearSVC_clf Algo accuracy score:", nltk.classify.accuracy(LinearSVC_clf, testing_set))

    NuSVC_clf = SklearnClassifier(NuSVC())
    NuSVC_clf.train(training_set)
    print("NuSVC_clf Algo accuracy score:", nltk.classify.accuracy(NuSVC_clf, testing_set))

    vote_clf = VoteClassifier(clf, MNB_clf, BernoulliNB_clf, LogisticRegression_clf,
                              SGDClassifier_clf, LinearSVC_clf, NuSVC_clf)
    print("vote_clf Algo accuracy score:", nltk.classify.accuracy(vote_clf, testing_set))
print("Starting the first round of training") print("Length of featureset is:", len(featureset)) for i in range(0, 30): random.shuffle(featureset) testing_set = featureset[(int(len(featureset) * 0.9)):] training_set = featureset[:(int(len(featureset) * 0.9))] start_time = gettime.time() NuSVClassifier = SklearnClassifier( NuSVC(nu=0.8, decision_function_shape="ovr")) NuSVClassifier.train(training_set) NuSVClassifier_accuracy = nltk.classify.accuracy(NuSVClassifier, testing_set) print("NuSVC done.") RFC = SklearnClassifier( RandomForestClassifier(n_estimators=25, min_samples_leaf=6)) RFC.train(training_set) RFC_accuracy = nltk.classify.accuracy(RFC, testing_set) print("RFC done.") # OLC = OpinionLexiconClassifier() # OLC_accuracy = nltk.classify.accuracy(OLC, testing_set)
testing_set = featuresets[:100]

# using the Naive Bayes algorithm to classify pos or neg movie reviews
classifier = nltk.NaiveBayesClassifier.train(training_set)

# classifier_f = open("NaiveBayesSentiment.pickle", "rb")  # loading the trained model using pickle
# classifier = pickle.load(classifier_f)
# classifier_f.close()

# calculating accuracy of the model
print("Original Naive Bayes Algorithm Accuracy Percent : ",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(30)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("Multinomial Naive Bayes Algorithm Accuracy Percent : ",
      (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

B_classifier = SklearnClassifier(BernoulliNB())
B_classifier.train(training_set)
print("Bernoulli Naive Bayes Algorithm Accuracy Percent : ",
      (nltk.classify.accuracy(B_classifier, testing_set)) * 100)

# LogisticRegression, SGDClassifier
# SVC, LinearSVC, NuSVC
fe = find_features(movie_reviews.words('neg/cv000_29416.txt'))

featuresets = [(find_features(rev), category) for (rev, category) in documents]
# for rev in documents:
#     featuresets.append(find_features(rev))

training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("accuracy : ", nltk.classify.accuracy(classifier, testing_set))
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
# evaluate on the held-out set; the original measured accuracy on training_set
print("mnb classifier accuracy is: ",
      nltk.classify.accuracy(MNB_classifier, testing_set))

# Gau_classifier = SklearnClassifier(GaussianNB())
# Gau_classifier.train(training_set)
# print("gaussian classifier accuracy is:", nltk.classify.accuracy(Gau_classifier, testing_set))

bernoulliNB_classifier = SklearnClassifier(BernoulliNB())
bernoulliNB_classifier.train(training_set)
print("bernoulli classifier accuracy is:",
      nltk.classify.accuracy(bernoulliNB_classifier, testing_set))

# LogisticRegression, SGDClassifier
# (LinearRegression is a regressor and cannot back an NLTK classifier)
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
def train_and_test_classifiers(train_set, test_set):
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Classic Naive Bayes Classifier accuracy percent:",
          (nltk.classify.accuracy(classifier, test_set)) * 100)
    # classifier.show_most_informative_features(15)

    MNB_classifier = SklearnClassifier(MultinomialNB(alpha=0.01, fit_prior=False))
    MNB_classifier.train(train_set)
    print("Multinomial Naive Bayes Classifier accuracy percent:",
          (nltk.classify.accuracy(MNB_classifier, test_set)) * 100)

    print("Skipping Gaussian Bayes Classifier accuracy percent")
    # GNB_classifier = SklearnClassifier(GaussianNB())
    # GNB_classifier.train(train_set)
    # print("Gaussian Naive Bayes Classifier accuracy percent:", (nltk.classify.accuracy(GNB_classifier, test_set)) * 100)

    BNB_classifier = SklearnClassifier(BernoulliNB(alpha=.01))
    BNB_classifier.train(train_set)
    print("Bernoulli Naive Bayes Classifier accuracy percent:",
          (nltk.classify.accuracy(BNB_classifier, test_set)) * 100)

    LG_classifier = SklearnClassifier(LogisticRegression(random_state=42))
    LG_classifier.train(train_set)
    print("Logistic Regression Classifier accuracy percent:",
          (nltk.classify.accuracy(LG_classifier, test_set)) * 100)

    # Train SGD with hinge penalty
    SGD_classifier1 = SklearnClassifier(
        SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                      random_state=42, max_iter=1000, tol=None))
    SGD_classifier1.train(train_set)
    print("Stochastic Gradient Descent Classifier 1 accuracy percent:",
          (nltk.classify.accuracy(SGD_classifier1, test_set)) * 100)

    # Train SGD with Elastic Net penalty
    SGD_classifier2 = SklearnClassifier(
        SGDClassifier(alpha=1e-3, random_state=42, penalty="elasticnet",
                      max_iter=1000, tol=None))
    SGD_classifier2.train(train_set)
    print("Stochastic Gradient Descent Classifier 2 accuracy percent:",
          (nltk.classify.accuracy(SGD_classifier2, test_set)) * 100)

    # train() returns the classifier itself, so the second .train() call was redundant
    SVC_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)
    print("C-Support Vector Classifier accuracy percent:",
          (nltk.classify.accuracy(SVC_classifier, test_set)) * 100)

    LinearSVC_classifier1 = SklearnClassifier(
        SVC(kernel='linear', probability=True, tol=1e-3))
    LinearSVC_classifier1.train(train_set)
    print("Linear Support Vector Classifier 1 accuracy percent:",
          (nltk.classify.accuracy(LinearSVC_classifier1, test_set)) * 100)

    LinearSVC_classifier2 = SklearnClassifier(
        LinearSVC(penalty="l1", dual=False, tol=1e-3))
    LinearSVC_classifier2.train(train_set)
    print("Linear Support Vector Classifier 2 accuracy percent:",
          (nltk.classify.accuracy(LinearSVC_classifier2, test_set)) * 100)

    LinearSVC_classifier3 = SklearnClassifier(
        LinearSVC(penalty="l2", dual=False, tol=1e-3))
    LinearSVC_classifier3.train(train_set)
    print("Linear Support Vector Classifier 3 accuracy percent:",
          (nltk.classify.accuracy(LinearSVC_classifier3, test_set)) * 100)

    NuSVC_classifier = SklearnClassifier(NuSVC())
    NuSVC_classifier.train(train_set)
    print("Nu-Support Vector Classifier accuracy percent:",
          (nltk.classify.accuracy(NuSVC_classifier, test_set)) * 100)

    # Train NearestCentroid (aka Rocchio classifier) without threshold
    Nearest_Centroid_classifier = SklearnClassifier(NearestCentroid())
    Nearest_Centroid_classifier.train(train_set)
    print("Nearest Centroid Classifier accuracy percent:",
          (nltk.classify.accuracy(Nearest_Centroid_classifier, test_set)) * 100)

    Ridge_classifier = SklearnClassifier(
        RidgeClassifier(alpha=0.5, tol=1e-2, solver="sag"))
    Ridge_classifier.train(train_set)
    print("Ridge Classifier accuracy percent:",
          (nltk.classify.accuracy(Ridge_classifier, test_set)) * 100)

    Perceptron_classifier = SklearnClassifier(Perceptron(max_iter=1000))
    Perceptron_classifier.train(train_set)
    print("Perceptron Classifier accuracy percent:",
          (nltk.classify.accuracy(Perceptron_classifier, test_set)) * 100)

    Passive_Aggressive_classifier = SklearnClassifier(
        PassiveAggressiveClassifier(max_iter=1000))
    Passive_Aggressive_classifier.train(train_set)
    print("Passive-Aggressive Classifier accuracy percent:",
          (nltk.classify.accuracy(Passive_Aggressive_classifier, test_set)) * 100)

    kNN_classifier = SklearnClassifier(KNeighborsClassifier(n_neighbors=10))
    kNN_classifier.train(train_set)
    print("kNN Classifier accuracy percent:",
          (nltk.classify.accuracy(kNN_classifier, test_set)) * 100)

    voted_classifier = VoteClassifier(classifier, MNB_classifier, BNB_classifier,
                                      LG_classifier, SGD_classifier2,
                                      LinearSVC_classifier2, NuSVC_classifier)
    print("Voted Classifier Classifier accuracy percent:",
          (nltk.classify.accuracy(voted_classifier, test_set)) * 100)
    print("Classification: ", voted_classifier.classify(test_set[0][0]),
          "Confidence: %", voted_classifier.confidence(test_set[0][0]) * 100)
    print("Classification: ", voted_classifier.classify(test_set[2][0]),
          "Confidence: %", voted_classifier.confidence(test_set[2][0]) * 100)
    print("Classification: ", voted_classifier.classify(test_set[3][0]),
          "Confidence: %", voted_classifier.confidence(test_set[3][0]) * 100)
    print("Classification: ", voted_classifier.classify(test_set[4][0]),
          "Confidence: %", voted_classifier.confidence(test_set[4][0]) * 100)