def train(self, observations, k=5):
    """Fit an ensemble of random forests, one per cross-validation fold.

    ``observations`` is cut into ``k`` consecutive chunks with
    ``np.array_split``.  For fold ``i`` a fresh
    ``SklearnClassifier(RandomForestClassifier())`` is trained on every
    chunk except chunk ``i`` and scored on chunk ``i``.  When ``k == 1``
    there is nothing to hold out, so the single model is trained and scored
    on the full data.  Every fitted model is appended to ``self.forest`` and
    the mean fold accuracy is printed; nothing is returned.
    """
    self.forest = []
    folds = np.array_split(observations, k)
    total_accuracy = 0
    for held_out in range(k):
        if k == 1:
            fit_rows, eval_rows = observations, observations
        else:
            # Everything except the held-out chunk, in original order.
            fit_rows = list(itertools.chain.from_iterable(
                fold for pos, fold in enumerate(folds) if pos != held_out))
            eval_rows = folds[held_out]
        model = SklearnClassifier(RandomForestClassifier())
        model.train(fit_rows)
        total_accuracy += nltk.classify.accuracy(model, eval_rows)
        self.forest.append(model)
    print('Accuracy on Train data(Using K fold)= ', total_accuracy / k)
class SentimentMNB(SentimentClassifier):
    """Sentiment classifier: chi-squared feature selection + multinomial NB.

    ``chiK`` is the number of features kept by ``SelectKBest``.
    """

    def __init__(self, chiK=3368):
        # chiK is stored before delegating to the base constructor, exactly
        # as in the original ordering.
        self.chiK = chiK
        super(SentimentMNB, self).__init__()

    def initPipeline(self):
        """Build ``self.pipeline``: chi2 feature selection, then the NB estimator."""
        selector = SelectKBest(chi2, k=self.chiK)
        self.pipeline = Pipeline([('chi2', selector),
                                  ('nb', MultinomialNB())])

    def trainClassifier(self):
        """Rebuild the pipeline, wrap it for NLTK and fit it on ``self.trainingSet``."""
        self.initPipeline()
        self.classifier = SklearnClassifier(self.pipeline)
        self.classifier.train(self.trainingSet)
class SVCModel(SKLearnModel):
    """Classify tweets with a support vector machine.

    Tweets are tokenised with NLTK's ``TweetTokenizer`` (lower-cased,
    elongated words shortened, @handles stripped), turned into token
    frequency distributions, TF-IDF weighted and fed to ``SVC``.
    """

    def __init__(self, kernel: str = "") -> None:
        # NOTE(review): ``kernel`` is passed to SVC unchanged, including the
        # empty-string default -- confirm callers always supply a real kernel.
        tweet_tokenizer = TweetTokenizer(preserve_case=False,
                                         reduce_len=True,
                                         strip_handles=True)
        self.tokenizer = tweet_tokenizer.tokenize

        # The SVC step is named after the kernel, e.g. "linearsvc".
        steps = [('tfidf', TfidfTransformer()),
                 ('{}svc'.format(kernel), SVC(kernel=kernel))]
        self.classif = SklearnClassifier(Pipeline(steps))

    def train(self, tweets: List[Tweet]) -> None:
        """Fit the classifier on (token frequencies, emoji) pairs built from ``tweets``."""
        corpus = ((FreqDist(self.tokenizer(tweet.text)), tweet.emoji)
                  for tweet in tweets)
        self.classif.train(corpus)

    def predict(self, text):
        """Return the predicted label for a raw text string."""
        frequencies = FreqDist(self.tokenizer(text))
        return self.classif.classify(frequencies)

    def tokenize(self, text):
        """Tokenise ``text`` with this model's tweet tokenizer."""
        return self.tokenizer(text)
def evaluate_classifier(featx):
    """Train Naive Bayes, MaxEnt and linear-SVM classifiers on a 3/4 split.

    ``featx`` converts one tokenised document into an NLTK feature dict.
    The first three quarters of the negative and of the positive documents
    form the training set, the remainder the test set.  For every
    classifier the reference/predicted index sets and the accuracy on the
    test set are computed.
    """
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    # Floor division keeps the cut-offs integral on Python 3 as well; a
    # float cannot be used as a slice bound.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            # Only the documented cut-off keywords are passed.
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0,
                                                encoding=None, labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            print(trainfeats)
            classifier = NaiveBayesClassifier.train(trainfeats)

        # Index sets per label: gold labels vs. predicted labels.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
def bag_of_words_model(df, column_name, target='label', k=1000):
    """Train a TF-IDF -> chi2(k) -> MultinomialNB classifier on a DataFrame.

    Rows whose ``target`` column equals 1 are labelled 1, rows equal to 0
    are labelled 0.  Each cell of ``column_name`` is turned into a
    ``FreqDist`` and used as the feature set.  Returns whatever
    ``SklearnClassifier.train`` returns.
    """
    positives = df[df[target] == 1][column_name].values
    negatives = df[df[target] == 0][column_name].values

    steps = [('tfidf', TfidfTransformer()),
             ('chi2', SelectKBest(chi2, k=k)),
             ('nb', MultinomialNB())]
    clf = SklearnClassifier(Pipeline(steps))

    # Positive examples first, then negatives.
    labelled = [(FreqDist(word_list), 1) for word_list in positives]
    labelled += [(FreqDist(word_list), 0) for word_list in negatives]

    return clf.train(labelled)
def evaluate_classifier(featx):
    """Score three sentiment classifiers on a single split and with 5-fold CV.

    ``featx`` converts one tokenised document into an NLTK feature dict.
    Naive Bayes, Maximum Entropy (GIS, one iteration) and a linear SVM are
    each trained twice: once on the first 3/4 of the negative and positive
    documents (tested on the rest), and once per fold of a 5-fold
    cross-validation over the shuffled full set.  Accuracy and the
    pos/neg-averaged precision, recall and F-measure are printed; nothing
    is returned.
    """

    def train_named(kind, feats):
        """Return (display name, trained classifier) for a classifier key."""
        if kind == 'maxent':
            return 'Maximum Entropy', MaxentClassifier.train(
                feats, 'GIS', trace=0, encoding=None, labels=None,
                gaussian_prior_sigma=0, max_iter=1)
        if kind == 'svm':
            svm = SklearnClassifier(LinearSVC(), sparse=False)
            svm.train(feats)
            return 'SVM', svm
        return 'Naive Bayes', NaiveBayesClassifier.train(feats)

    def score(classifier, feats):
        """Return accuracy and per-label precision/recall/F-measure as a dict."""
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for idx, (fs, label) in enumerate(feats):
            refsets[label].add(idx)
            testsets[classifier.classify(fs)].add(idx)

        result = {'accuracy': nltk.classify.util.accuracy(classifier, feats)}
        metrics = (('precision', precision),
                   ('recall', recall),
                   ('fmeasure', f_measure))
        for label in ('pos', 'neg'):
            for metric_name, metric in metrics:
                value = metric(refsets[label], testsets[label])
                # The NLTK metrics return None when a set is empty (e.g. a
                # label that was never predicted); count that as 0.0 so the
                # averages below cannot fail.
                result[label + '_' + metric_name] = (
                    0.0 if value is None else value)
        return result

    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        classifierName, classifier = train_named(cl, trainfeats)
        single = score(classifier, testfeats)

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', single['accuracy'])
        print('precision',
              (single['pos_precision'] + single['neg_precision']) / 2)
        print('recall', (single['pos_recall'] + single['neg_recall']) / 2)
        print('f-measure',
              (single['pos_fmeasure'] + single['neg_fmeasure']) / 2)
        print('')

    ## CROSS VALIDATION
    trainfeats = negfeats + posfeats

    # Shuffle first: without it a test chunk could contain only negative or
    # only positive documents.
    random.shuffle(trainfeats)

    n = 5  # 5-fold cross-validation
    subset_size = len(trainfeats) // n

    for cl in classifier_list:
        folds = []
        for fold in range(n):
            start = fold * subset_size
            stop = start + subset_size
            testing_this_round = trainfeats[start:stop]
            training_this_round = trainfeats[:start] + trainfeats[stop:]

            classifierName, classifier = train_named(cl, training_this_round)
            folds.append(score(classifier, testing_this_round))

        def mean(key):
            return sum(fold_scores[key] for fold_scores in folds) / n

        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', mean('accuracy'))
        print('precision', (mean('pos_precision') + mean('neg_precision')) / 2)
        print('recall', (mean('pos_recall') + mean('neg_recall')) / 2)
        print('f-measure', (mean('pos_fmeasure') + mean('neg_fmeasure')) / 2)
        print('')
print len(per[0]), len(per[1]), len(per[2]), len(per[3]), len(per[4]), train1 = (9*len(per[0]))/10 train2 = (9*len(per[1]))/10 train3 = (9*len(per[2]))/10 train4 = (9*len(per[3]))/10 train5 = (9*len(per[4]))/10 ones = [FreqDist(x) for x in per[0]] twos = [FreqDist(x) for x in per[1]] threes = [FreqDist(x) for x in per[2]] fours = [FreqDist(x) for x in per[3]] fives = [FreqDist(x) for x in per[4]] print "Starting to train" classif.train(add_label(ones[:train1], '1') + add_label(twos[:train2], '2') + add_label(threes[:train3], '3') + add_label(fours[:train4], '4') + add_label(fives[:train5], '5')) print "Done learning" l_ones = np.array(classif.batch_classify(ones[train1:])) print "one done" l_twos = np.array(classif.batch_classify(twos[train2:])) print "two done" l_threes = np.array(classif.batch_classify(threes[train3:])) print "three done" l_fours = np.array(classif.batch_classify(fours[train4:])) print "four done" l_fives = np.array(classif.batch_classify(fives[train5:])) print "five done" con_ma = [[(l_ones == '1').sum(), (l_ones == '2').sum(), (l_ones == '3').sum(), (l_ones == '4').sum(), (l_ones == '5').sum()], [(l_twos == '1').sum(), (l_twos == '2').sum(), (l_twos == '3').sum(), (l_twos == '4').sum(), (l_twos == '5').sum()], [(l_threes == '1').sum(), (l_threes == '2').sum(), (l_threes == '3').sum(), (l_threes == '4').sum(), (l_threes == '5').sum()],
lines[i].remove('$') #print(lines[i]) positive_vocab = [] negative_vocab = [] neutral_vocab = [] for l in lines: k = float(l[2]) if(k > 0.2): positive_vocab.append(l[0]) elif(k < -0.1): negative_vocab.append(l[0]) else: neutral_vocab.append(l[0]) # print(neutral_vocab) # print(positive_vocab) positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab] negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab] neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab] train_set = positive_features + negative_features + neutral_features random.shuffle(train_set) cls = SklearnClassifier(SGDClassifier()) classifier = cls.train(train_set) sent = '☺😅ðŸ˜' w = word_tokenize(sent) print(len(sent))
def find_feature(document):
    """Return a ``{word: bool}`` dict over the ``words_feature`` vocabulary.

    A word maps to True when it occurs in ``document`` (an iterable of
    tokens) and to False otherwise.
    """
    words = set(document)
    # Membership test, not identity: ``w is words`` compared a word with the
    # whole set and was therefore always False.
    return {w: (w in words) for w in words_feature}


features = [(find_feature(rev), category) for (rev, category) in documents]

testing_set = features[1900:]
training_set = features[:1900]

# Train the NLTK Naive Bayes model once and cache it on disk.  A cached
# model is only valid for the feature extraction it was trained with:
# delete the pickle after changing find_feature.
if not os.path.isfile(naivebayes):
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    with open(naivebayes, "wb") as save_classifier:
        pickle.dump(classifier, save_classifier)
else:
    # NOTE: pickle.load runs arbitrary code from the file; only load
    # pickles this script wrote itself.
    with open(naivebayes, "rb") as classifier_f:
        classifier = pickle.load(classifier_f)

print("Original Naive Bayes Classifier accuracy precent:",
      (nltk.classify.accuracy(classifier, testing_set) * 100))

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
# Score the multinomial model itself, not the NLTK one trained above.
print("Multinomial Naive Bayes Classifier accuracy precent:",
      (nltk.classify.accuracy(MNB_classifier, testing_set) * 100))
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC

testing_set = x_test
training_set = x_train

from nltk import NaiveBayesClassifier

# Baseline: NLTK's own Naive Bayes.  The accuracy is printed once as a
# fraction and once as a percentage.
classifier = NaiveBayesClassifier.train(x_train)
nb_accuracy = nltk.classify.accuracy(classifier, testing_set)
print(nb_accuracy)
print("Original Naive Bayes Algo accuracy percent:", nb_accuracy * 100)
classifier.show_most_informative_features(15)

# Each scikit-learn model is wrapped for NLTK, trained on the same data and
# reported under two headings that show the same number.
Random_Forest_Classifier = SklearnClassifier(RandomForestClassifier())
Random_Forest_Classifier.train(training_set)
rf_percent = nltk.classify.accuracy(Random_Forest_Classifier, testing_set) * 100
print("Random Forest Classifier After Ontology Matching percent:", rf_percent)
print("Random Forest Classifier :", rf_percent)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
mnb_percent = nltk.classify.accuracy(MNB_classifier, testing_set) * 100
print("MNB_classifier accuracy After Ontology Matching percent:", mnb_percent)
print("MNB_classifier accuracy percent:", mnb_percent)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
bnb_percent = nltk.classify.accuracy(BernoulliNB_classifier, testing_set) * 100
print("BernoulliNB_classifier accuracy After Ontology Matching percent:", bnb_percent)
print("BernoulliNB_classifier accuracy percent:", bnb_percent)
out = out + l[c] return out neudata = resample(neudata, n_samples=len(negdata)) posdata = resample(posdata, n_samples=len(negdata)) negfeats = [(word_feats(f), 'neg') for f in word_split(negdata)] posfeats = [(word_feats(f), 'pos') for f in word_split(posdata)] neufeats = [(word_feats(f), 'neu') for f in word_split(neudata)] alldata = negdata + posdata + neudata allfeats = negfeats + posfeats + neufeats classifier = SklearnClassifier(LinearSVC(), sparse=False) classifier.train(allfeats) es = Elasticsearch(['http://localhost:9200/']) doc = { "query": { "bool": { "must_not": [ { "exists": { "field": "likes" } }, { "exists": { "field": "replies" }
#randomly shuffle the features random.shuffle(features) #splitting into training and testing sets train_set = features[:5000] test_set = features[10000:] #print len(train_set),len(test_set) import nltk nltk_nb_classifier = nltk.NaiveBayesClassifier.train(train_set) print "NLTK NB classifier score : ", nltk.classify.accuracy( nltk_nb_classifier, test_set) * 100.0 mnb_classifier = SklearnClassifier(MultinomialNB()) mnb_classifier.train(train_set) print "mnb_classfier score : ", nltk.classify.accuracy(mnb_classifier, test_set) * 100.0 bnb_classifier = SklearnClassifier(BernoulliNB()) bnb_classifier.train(train_set) print "bnb_classfier score : ", nltk.classify.accuracy(bnb_classifier, test_set) * 100.0 svc = SklearnClassifier(SVC(kernel='rbf')) svc.train(train_set) print "SVC : ", nltk.classify.accuracy(svc, test_set) * 100.0 lin_svc = SklearnClassifier(LinearSVC()) lin_svc.train(train_set) print "Linear SCV : ", nltk.classify.accuracy(lin_svc, test_set) * 100.0
mec = nltk.classify.MaxentClassifier.train(train_features, 'GIS', trace=0, max_iter=1000) from sklearn import cross_validation cv = cross_validation.KFold(len(train_features), n_folds=10, indices=True, shuffle=False, random_state=None) for traincv, evalcv in cv: classifier = nltk.NaiveBayesClassifier.train( train_features[traincv[0]:traincv[len(traincv) - 1]]) print 'accuracy: %.3f' % nltk.classify.util.accuracy( classifier, train_features[evalcv[0]:evalcv[len(evalcv) - 1]]) import sklearn from sklearn.svm import LinearSVC from nltk.classify.scikitlearn import SklearnClassifier from sklearn.feature_extraction.text import TfidfTransformer from sklearn.feature_selection import SelectKBest, chi2 from sklearn.naive_bayes import MultinomialNB from sklearn.pipeline import Pipeline pipeline = Pipeline([('tfidf', TfidfTransformer()), ('chi2', SelectKBest(chi2, k=2000)), ('nb', MultinomialNB())]) pipecl = SklearnClassifier(pipeline) pipecl.train(train_features)
def evaluate_classifier(featx):
    """Score MaxEnt and linear-SVM classifiers on a three-class 3/4 split.

    ``featx`` converts one split document into an NLTK feature dict.  The
    first three quarters of each class train the model and the rest test
    it.  For each classifier the accuracy and the three-class averages of
    precision, recall and F-measure are printed and appended to the
    module-level lists ``acrcy``, ``prcsn``, ``rcall`` and ``fmsr``.  A
    metric that comes back as None is counted as 0.0.
    """
    negfeats = [(featx(doc), 'negative') for doc in splitter(negative)]
    posfeats = [(featx(doc), 'positive') for doc in splitter(positive)]
    neautralfeats = [(featx(doc), 'neautral') for doc in splitter(neautral)]

    # Same 3/4 cut for every class, concatenated negative, positive, neutral.
    trainfeats, testfeats = [], []
    for class_feats in (negfeats, posfeats, neautralfeats):
        cutoff = int(len(class_feats) * 3 / 4)
        trainfeats += class_feats[:cutoff]
        testfeats += class_feats[cutoff:]

    # Max Entropy and SVM classifiers
    for cl in ['maxent', 'svm']:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0,
                                                encoding=None, labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)

        # Index sets per label: gold labels vs. predictions.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for position, (feats, label) in enumerate(testfeats):
            refsets[label].add(position)
            testsets[classifier.classify(feats)].add(position)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)

        # (precision, recall, f-measure) per label, with None mapped to 0.0.
        scores = {}
        for label in ('positive', 'neautral', 'negative'):
            reference, predicted = refsets[label], testsets[label]
            raw = (precision(reference, predicted),
                   recall(reference, predicted),
                   f_measure(reference, predicted))
            scores[label] = tuple(0.0 if value is None else value
                                  for value in raw)

        mean_precision, mean_recall, mean_fmeasure = [
            (scores['positive'][k] + scores['negative'][k]
             + scores['neautral'][k]) / 3
            for k in range(3)
        ]

        print('\n')
        print(classifierName)
        print('accuracy:', accuracy)
        acrcy.append(accuracy)
        print('precision', mean_precision)
        prcsn.append(mean_precision)
        print('recall', mean_recall)
        rcall.append(mean_recall)
        print('f-measure', mean_fmeasure)
        fmsr.append(mean_fmeasure)
def trainClassifier(trainData):
    """Fit a ``LinearSVC`` wrapped for NLTK on ``trainData``.

    Returns the value of ``SklearnClassifier.train``.
    """
    return SklearnClassifier(LinearSVC()).train(trainData)
# Everything before the split index trains, everything after it tests.
split_index = int(TRAIN_TEST_RATIO * (NUMBER_OF_POS_AND_NEG_COMMENTS * 2))
testingData = featuresContainer[split_index:]
trainingData = featuresContainer[:split_index]

# train naive bayes classifier
naiveBayesClassifier = nltk.classify.NaiveBayesClassifier.train(trainingData)

# print Naive Bayes accuracy
print("Naive Bayes accuracy in percent:",
      (nltk.classify.util.accuracy(naiveBayesClassifier, testingData)) * 100)

# save trained naive bayes classifier; the context manager closes the file
# even when pickling fails
with open("naiveBayes.pickle", "wb") as classifier_to_save:
    pickle.dump(naiveBayesClassifier, classifier_to_save)

# train multinomial naive bayes classifier
multinomial_naive_bayes_classifier = SklearnClassifier(MultinomialNB())
multinomial_naive_bayes_classifier.train(trainingData)

# print multinomial Naive Bayes classifier accuracy
print("multinomial naive bayes accuracy in percent:",
      (nltk.classify.util.accuracy(multinomial_naive_bayes_classifier,
                                   testingData)) * 100)

# save trained multinomial naive bayes classifier
with open("multiNaiveBayes.pickle", "wb") as classifier_to_save:
    pickle.dump(multinomial_naive_bayes_classifier, classifier_to_save)

# train Bernoulli naive bayes classifier
bernoulli_naive_bayes_classifier = SklearnClassifier(BernoulliNB())
bernoulli_naive_bayes_classifier.train(trainingData)

# print Bernoulli Naive Bayes classifier accuracy
print("Bernoulli naive bayes accuracy in percent",
      (nltk.classify.util.accuracy(bernoulli_naive_bayes_classifier,
                                   testingData)) * 100)
rating_names = [student['name'] for student in ratings] data_names = list(set([student['Name'] for student in data])) #cleans text for classifying for i,student in enumerate(data): text = tech.cleanse(student['Student Comment']) data[i]['Student Comment'] = text #split into testing and training sets n = len(data) test_idx = random.sample(xrange(n),int(n*0.5)) train_idx = set(xrange(n))-set(test_idx) test_set = filter(lambda item: item[1] ,map(extract_featurelabel,[data[i] for i in test_idx])) train_set = filter(lambda item: item[1] ,map(extract_featurelabel,[data[i] for i in train_idx])) #classifier = NaiveBayesClassifier.train(train_set) classif.train(test_set) #Compute accuracy test_data,test_label = zip(*test_set) train_data,train_label = zip(*train_set) predictions = classif.classify_many(test_data) print confusion_matrix(test_label,predictions) print matthews_corrcoef(test_label,predictions) ''' #Only work if using built-in NLTK classifier print ('Accuracy: {0:.2f}%'.format(100 * nltk.classify.accuracy(classif, test_set))) classif.show_most_informative_features(20) '''
def train(self, features_label):
    """Fit an SVC (C=10.0, gamma=0.0001) wrapped for NLTK on ``features_label``.

    The value returned by ``SklearnClassifier.train`` is kept in
    ``self._classifier``; the method itself returns None.
    """
    wrapped_svc = SklearnClassifier(SVC(C=10.0, gamma=0.0001))
    self._classifier = wrapped_svc.train(features_label)
import scipy from nltk.classify import maxent nltk.classify.MaxentClassifier.ALGORITHMS # ['GIS','IIS','CG','BFGS','Powell','LBFGSB','Nelder-Mead','MEGAM','TADM'] # MEGAM or TADM are not rec'd for text classification mec = nltk.classify.MaxentClassifier.train(train_features, 'GIS', trace=0, max_iter=1000) from sklearn import cross_validation cv = cross_validation.KFold(len(train_features), n_folds=10, indices=True, shuffle=False, random_state=None) for traincv, evalcv in cv: classifier = nltk.NaiveBayesClassifier.train(train_features[traincv[0]:traincv[len(traincv)-1]]) print 'accuracy: %.3f' % nltk.classify.util.accuracy(classifier, train_features[evalcv[0]:evalcv[len(evalcv)-1]]) import sklearn from sklearn.svm import LinearSVC from nltk.classify.scikitlearn import SklearnClassifier from sklearn.feature_extraction.text import TfidfTransformer from sklearn.feature_selection import SelectKBest, chi2 from sklearn.naive_bayes import MultinomialNB from sklearn.pipeline import Pipeline pipeline = Pipeline([('tfidf', TfidfTransformer()), ('chi2', SelectKBest(chi2, k=2000)), ('nb', MultinomialNB())]) pipecl = SklearnClassifier(pipeline) pipecl.train(train_features)
return features featureset = [(find_features(rev), category) for (rev, category) in dataset] training_set = featureset[:1900] testing_set = featureset[1900:] classifier_f = open("naive_bayes.pickle", "rb") classifier = pickle.load(classifier_f) classifier_f.close() print("Inbuilt Naive Bayes accuracy = ", (nltk.classify.accuracy(classifier, testing_set)) * 100) # classifier.show_most_informative_features() MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) print("MN Naive Bayes accuracy = ", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100) BE_classifier = SklearnClassifier(BernoulliNB()) BE_classifier.train(training_set) print("BE Naive Bayes accuracy = ", (nltk.classify.accuracy(BE_classifier, testing_set)) * 100) LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) print("LogisticRegression accuracy = ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100) SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
class MyClassifier:
    """SVM classifier that labels tweets "VER" (verifiable) or "NVER".

    Every text is reduced to a count vector over the part-of-speech tags
    listed in ``verifiability_features.txt``.  Initial training labels are
    derived heuristically from those counts (see
    ``__get_training_target``) and can later be overridden through
    ``update_pred_into_training``.

    ``self.training_data[i]`` is kept aligned with ``self.all_tweets[i]``;
    ``update_pred_into_training`` relies on that to replace a label.
    """

    def __init__(self, load_clf=False, load_tr_data=False):
        """Load features and tweets; optionally load a saved model / dataset.

        :param load_clf: load ``twitterClassifier.pkl`` instead of creating
            a fresh, untrained ``SklearnClassifier(SVC())``
        :param load_tr_data: load the newest saved training-data JSON file
        """
        self.features = self.__load_support_vector_features()
        self.training_data = []
        self.n_samples = 0
        self.all_tweets = self.__load_tweets_from_file()  # list not dict

        # Classifier loading
        if load_clf:
            self.load_clf()
        else:
            self.clf = SklearnClassifier(SVC(), sparse=False)

        # Training Data loading
        if load_tr_data:
            self.__load_training_data()

    def __load_tweets_from_file(self):
        """Return ``[(tweet_text, tweet_id), ...]`` from the newest raw file.

        Each line is split on the literal separator ``%<TAB>%``.
        """
        list_of_files = glob.glob(
            "datasets_twitter/twitter_training_data_raw*.txt")
        latest_file = max(list_of_files, key=os.path.getctime)

        tweet_list = []
        with open(latest_file, "r", encoding="UTF-8") as f:
            for line in f:
                fields = line.split("%\t%")
                tweet_list.append((fields[0], fields[1]))
        return tweet_list

    def __load_support_vector_features(self):
        """Return the feature names, one per line of the features file."""
        with open("verifiability_features.txt", "r") as feature_f:
            return [line_f.replace("\n", "") for line_f in feature_f]

    def __get_sample(self, text_str):
        """Turn ``text_str`` into a count vector aligned with ``self.features``.

        Every token is POS-tagged; the first feature equal to the tag gets
        +1.  The last feature is special: it is never incremented, and is
        decremented when the token text is "?".

        :param text_str: a string of text which is to be verified
        :return: a list of int, one entry per feature
        """
        # NOTE(review): the source of this method had lost its indentation.
        # The ``else`` is attached to the "is this the last feature" test,
        # the reading under which a negative count can only come from a "?"
        # (as __get_training_target assumes) -- confirm against the
        # original file.
        tokens = pos_tag(word_tokenize(text_str))
        curr_sample = [0] * len(self.features)

        for t_text, t_feature in tokens:
            for index, feature in enumerate(self.features):
                if t_feature != feature:
                    continue
                if feature == self.features[-1]:
                    if t_text == "?":
                        curr_sample[index] -= 1
                        break
                else:
                    curr_sample[index] += 1
                    break
        return curr_sample

    def __to_feature_dict(self, sample):
        """Pair each feature name with its count: ``{feature: count}``."""
        return dict(zip(self.features, sample))

    def __get_training_target(self, sample):
        """Return the heuristic label for a count vector.

        :param sample: int[] from self.__get_sample()
        :return: "NVER" if any count is negative or the counts do not sum to
            a positive number, otherwise "VER"
        """
        t_sum = 0
        for v in sample:
            if v < 0:
                # a negative count forces the non-verifiable label
                t_sum = -1
                break
            t_sum += v

        return "VER" if t_sum > 0 else "NVER"

    def __assemble_training_data(self):
        """Add heuristic training examples for tweets that have none yet.

        Only tweets beyond ``len(self.training_data)`` are processed, so the
        method can be called before every (re)training without duplicating
        examples or re-adding heuristic labels for tweets whose label was
        set through ``update_pred_into_training``.
        """
        for tweet in self.all_tweets[len(self.training_data):]:
            curr_sample = self.__get_sample(tweet[0])
            curr_target = self.__get_training_target(curr_sample)
            self.training_data.append(
                (self.__to_feature_dict(curr_sample), curr_target))

    def __save_training_data(self):
        """Write ``self.training_data`` as JSON to a timestamped file."""
        timestamp = '{:%Y_%m_%d_%H_%M_%S}'.format(datetime.datetime.now())
        path = ("datasets_twitter/twitter_training_dataset" + timestamp
                + ".json")
        with open(path, "w+") as f:
            f.write(json.dumps(self.training_data))

    def __load_training_data(self):
        """Append the examples stored in the newest training-data JSON file."""
        list_of_files = glob.glob(
            "datasets_twitter/twitter_training_dataset*.json")
        latest_file = max(list_of_files, key=os.path.getctime)
        with open(latest_file, "r") as f:
            # __save_training_data writes the whole dataset on one line.
            js = json.loads(f.readline())
        for i in js:
            self.training_data.append((i[0], i[1]))  # sample, target

    def train_with_svc(self):
        """Assemble missing training examples, fit the SVC and save it."""
        self.__assemble_training_data()
        self.clf.train(self.training_data)
        # save classifier as soon as it is trained
        self.save_clf()

    def predict_single(self, test_text):
        """Predict the label of one text.

        :param test_text: raw text
        :return: ``(predicted_label, count_vector)``
        """
        test_sample = self.__get_sample(test_text)
        pred = self.clf.classify_many([self.__to_feature_dict(test_sample)])
        return (pred[0], test_sample)

    def predict_multiple(self, test_list):
        """Predict the labels of several texts at once.

        :param test_list: iterable of raw texts
        :return: the result of ``classify_many`` for those texts
        """
        test_data = [self.__to_feature_dict(self.__get_sample(text))
                     for text in test_list]
        return self.clf.classify_many(test_data)

    def update_pred_into_training(self, test_tweet, pred_val):
        """Store ``pred_val`` as the label of ``test_tweet`` and retrain.

        If a tweet with the same text is already known, its training
        example is replaced; otherwise the tweet and its example are
        appended.  The training data is then saved and the classifier
        retrained.

        :param test_tweet: a tweet in the form of (tweet_text, tweet_id)
        :param pred_val: the label to store for this tweet
        """
        # Make sure every known tweet has a training example so the index
        # used below is always valid.
        self.__assemble_training_data()

        test_tweet_text = test_tweet[0]
        tup = (self.__to_feature_dict(self.__get_sample(test_tweet_text)),
               pred_val)

        for i, tweet in enumerate(self.all_tweets):
            if test_tweet_text == tweet[0]:
                # only the first tweet with this text is updated
                self.training_data[i] = tup
                break
        else:
            self.all_tweets.append(test_tweet)
            self.training_data.append(tup)

        # save the training data to file
        self.__save_training_data()

        # train the classifier again
        self.train_with_svc()

    def load_clf(self):
        """Load a previously trained and saved classifier."""
        self.clf = joblib.load("twitterClassifier.pkl")

    def save_clf(self):
        """Save the current classifier to file."""
        joblib.dump(self.clf, "twitterClassifier.pkl")
shuffle(neg_tweets_set)

# The first 2500 tweets of each polarity are held out for testing; the rest
# train the three classifiers.
test_set = pos_tweets_set[:2500] + neg_tweets_set[:2500]
train_set = pos_tweets_set[2500:] + neg_tweets_set[2500:]

ME_classifier = MaxentClassifier.train(
    train_set, 'GIS',
    trace=0, encoding=None, labels=None,
    gaussian_prior_sigma=0, max_iter=1,
)
NB_classifier = NaiveBayesClassifier.train(train_set)
SVM_classifier = SklearnClassifier(LinearSVC(), sparse=False)
SVM_classifier.train(train_set)

# Per-label index sets for the Naive Bayes model: gold labels on one side,
# predictions on the other.
actual_set = defaultdict(set)
predicted_set = defaultdict(set)

for index, (feature, actual_label) in enumerate(test_set):
    predicted_label = NB_classifier.classify(feature)
    actual_set[actual_label].add(index)
    predicted_set[predicted_label].add(index)

accuracy = classify.accuracy(NB_classifier, test_set)
def word_feats(words):
    """Map every item of ``words`` to True (an NLTK feature dict)."""
    return {word: True for word in words}


def create_word_features(words):
    """Like ``word_feats`` but without English stop words."""
    # Load the stop-word list once, as a set, instead of once per word.
    stop_words = set(stopwords.words("english"))
    return {word: True for word in words if word not in stop_words}


positive_features = [(word_feats(pos), 'pos') for pos in pos_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in neg_vocab]

train_set = negative_features + positive_features

LRclassifier = SklearnClassifier(LogisticRegression())
LRclassifier.train(train_set)


def pre(text):
    """Return the share of tokens classified positive and negative.

    ``text`` is joined into one string, lower-cased and tokenised; every
    token is classified with ``LRclassifier``.

    :return: ``{'pos': str, 'neg': str}`` where each value is the string
        form of a float ratio; both are ``'0.0'`` when there are no tokens
    """
    tokens = word_tokenize(''.join(text).lower())
    if not tokens:
        # Nothing to classify; avoid dividing by zero.
        return {'pos': '0.0', 'neg': '0.0'}

    neg = 0
    pos = 0
    for word in tokens:
        # word_feats is applied to the token exactly as it was applied to
        # each vocabulary entry when the training set was built.
        classResult = LRclassifier.classify(word_feats(word))
        if classResult == 'neg':
            neg += 1
        elif classResult == 'pos':
            pos += 1

    return {'pos': str(float(pos) / len(tokens)),
            'neg': str(float(neg) / len(tokens))}
def evaluate_classifier(featx): negfeats = [(featx(f), 'neg') for f in word_split(negdata)] posfeats = [(featx(f), 'pos') for f in word_split(posdata)] negcutoff = len(negfeats)*3/4 poscutoff = len(posfeats)*3/4 trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] #testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] print 'Reading Tweets\n' tweets_data_path = '20161019_202620.txt' tweets_data = [] tweets_file = open(tweets_data_path, "r") for line in tweets_file: try: tweet = json.loads(line) tweets_data.append(tweet) except: continue tweets = pd.DataFrame() tweets['text'] = [tweet.get('text','') for tweet in tweets_data] tdata = tweets['text'] negfeats = [(featx(f), 'neg') for f in word_split(tdata)] testfeats = negfeats print np.shape(testfeats) #testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] #print np.shape(testfeats) # using 3 classifiers classifier_list = ['nb', 'maxent', 'svm'] for cl in classifier_list: if cl == 'maxent': classifierName = 'Maximum Entropy' classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0, encoding=None, labels=None, gaussian_prior_sigma=0, max_iter = 1) elif cl == 'svm': classifierName = 'SVM' classifier = SklearnClassifier(LinearSVC(), sparse=False) classifier.train(trainfeats) else: classifierName = 'Naive Bayes' classifier = NaiveBayesClassifier.train(trainfeats) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) for i, (feats, label) in enumerate(testfeats): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) print testsets[observed] accuracy = nltk.classify.util.accuracy(classifier, testfeats) #pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos']) #pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos']) #pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos']) #neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg']) #neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg']) 
#neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg']) print '' print '---------------------------------------' print 'SINGLE FOLD RESULT ' + '(' + classifierName + ')' print '---------------------------------------' print 'accuracy:', accuracy
def evaluate_mult_classifiers(feature_x, n_folds=5):
    """Cross-validate Naive Bayes and a linear SVM on the pos/neg data.

    All 'neg' and 'pos' feature sets are pooled and shuffled once, then each
    classifier is evaluated with n_folds-fold cross-validation. Per-fold and
    averaged accuracy, precision, recall and F-measure are printed.

    Args:
        feature_x: callable turning a token sequence into a feature dict.
        n_folds: number of cross-validation folds (default 5).

    Raises:
        ValueError: if n_folds is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError('n_folds must be at least 1')

    def _defined(metric):
        # nltk's precision/recall/f_measure return None when a label never
        # occurs in a fold; count that as 0.0 so formatting and averaging
        # do not crash.
        return 0.0 if metric is None else metric

    neg_feats = [(feature_x(i), 'neg') for i in word_split(neg_data)]
    pos_feats = [(feature_x(i), 'pos') for i in word_split(pos_data)]

    ## CROSS VALIDATION
    train_feats = neg_feats + pos_feats
    # Shuffle so a fold is unlikely to contain a single class only.
    random.shuffle(train_feats)
    subset_size = len(train_feats) // n_folds

    for cl in ['NB', 'SVM']:
        classifierName = 'Naive Bayes' if cl == 'NB' else 'SVM'

        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []

        print('--------------------------')
        print('Beginning Cross-validation')
        print('--------------------------')

        for fold in range(n_folds):
            start = fold * subset_size
            stop = start + subset_size
            testing_this_round = train_feats[start:stop]
            training_this_round = train_feats[:start] + train_feats[stop:]

            if cl == 'NB':
                # Using NLTK NaiveBayesClassifier
                classifier = NaiveBayesClassifier.train(training_this_round)
            else:
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)

            ref_sets = collections.defaultdict(set)
            test_sets = collections.defaultdict(set)
            # A separate index name keeps the fold counter intact.
            for idx, (feats, label) in enumerate(testing_this_round):
                ref_sets[label].add(idx)
                observed = classifier.classify(feats)
                test_sets[observed].add(idx)

            cv_count = fold + 1
            cv_accuracy = nltk.classify.util.accuracy(classifier,
                                                      testing_this_round)
            cv_pos_precision = _defined(
                nltk.precision(ref_sets['pos'], test_sets['pos']))
            cv_pos_recall = _defined(
                nltk.recall(ref_sets['pos'], test_sets['pos']))
            cv_pos_fmeasure = _defined(
                nltk.f_measure(ref_sets['pos'], test_sets['pos']))
            cv_neg_precision = _defined(
                nltk.precision(ref_sets['neg'], test_sets['neg']))
            cv_neg_recall = _defined(
                nltk.recall(ref_sets['neg'], test_sets['neg']))
            cv_neg_fmeasure = _defined(
                nltk.f_measure(ref_sets['neg'], test_sets['neg']))

            print('Fold: {} Acc : {:.4F}'.format(cv_count, cv_accuracy))
            print('Fold: {} pos_prec : {:.4F} neg_prec : {:.4F}'.format(
                cv_count, cv_pos_precision, cv_neg_precision))
            print('Fold: {} pos_recall: {:.4F} neg_recall: {:.4F}'.format(
                cv_count, cv_pos_recall, cv_neg_recall))
            print('Fold: {} pos_fmeas : {:.4F} neg_fmeas : {:.4F}'.format(
                cv_count, cv_pos_fmeasure, cv_neg_fmeasure))
            print('--')

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)

        print('----------------------------------------------------------')
        print('{}-Fold Cross Validation results for {} Classifier'.format(
            n_folds, classifierName))
        print('----------------------------------------------------------')
        print('accuracy : {:.4F}'.format(sum(accuracy) / n_folds))
        print('precision: {:.4F}'.format(
            (sum(pos_precision) / n_folds + sum(neg_precision) / n_folds) / 2))
        print('recall : {:.4F}'.format(
            (sum(pos_recall) / n_folds + sum(neg_recall) / n_folds) / 2))
        print('f-measure: {:.4F}'.format(
            (sum(pos_fmeasure) / n_folds + sum(neg_fmeasure) / n_folds) / 2))
        print('\n')
def runClassifiers(positives, negatives, featuresToUse, outFile, verbose, classifiersToUse):
    """Train the selected classifiers and collect their scores in a table.

    Args:
        positives: data sets labelled True.
        negatives: data sets labelled False.
        featuresToUse: feature-configuration mapping; the keys "max_ent",
            "laugh_count" and "Dim Reduction" are read here.
        outFile: path the result table is appended to; with "" only the
            feature description is printed.
        verbose: when true, print extra diagnostics for some classifiers.
        classifiersToUse: sequence of flags in the order Naive Bayes,
            Decision Tree, Maximum Entropy, Linear SVC, AdaBoost, Random
            Forest, Combo. Missing trailing flags count as False. The
            sequence is not modified (the original padded it in place).

    Returns:
        list with one row per classifier that ran.
    """
    def _ratio(numerator, denominator):
        # An empty denominator (e.g. no positive predictions) yields 0.0
        # instead of ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    def _f1(precision, recall):
        return 2 * _ratio(precision * recall, precision + recall)

    def _wrap(clf, reducer_name, reducer_cls):
        # Put the reducer in front of the estimator only when requested.
        if featuresToUse["Dim Reduction"]:
            pipeline = Pipeline([(reducer_name, reducer_cls()),
                                 ('classifier', clf)])
            return SklearnClassifier(pipeline)
        return SklearnClassifier(clf)

    table = []

    # Pad a private copy so the caller's list is left untouched.
    enabled = list(classifiersToUse)
    enabled.extend([False] * (NUM_CLASSIFIERS - len(enabled)))

    # print which features we are using
    print("Using these features: ", FeatureExtractor.featuresToString(featuresToUse))

    pos = [(FeatureExtractor.langFeatures(data, featuresToUse), True)
           for data in positives]
    neg = [(FeatureExtractor.langFeatures(data, featuresToUse), False)
           for data in negatives]
    random.shuffle(pos)
    random.shuffle(neg)

    # Testing is 1/4 of the data set, so we will cut it off there
    minLen = min(len(pos), len(neg))
    posCut = minLen//4
    negCut = posCut*2

    # splits training and test sets
    # NOTE(review): neg[posCut:negCut] ends up in neither set - confirm that
    # this gap is intentional.
    train_data = pos[posCut:] + neg[negCut:]
    test_data = pos[:posCut] + neg[:posCut]

    maxEntSupport = featuresToUse["max_ent"]

    if enabled[0]:
        print("Running Naive Bayes classifier")
        timeStart = time.time()
        # NLTK's built-in implementation of the Naive Bayes classifier is trained
        classifier = nltk.NaiveBayesClassifier.train(train_data)
        # get the time it takes to train Naive Bayes
        print ("\nTime to train in seconds: ", time.time() - timeStart)
        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Naive Bayes", maxEntSupport))
        if verbose:
            # reports the most impactful features the NB classifier found;
            # the method prints by itself, so its return value is not printed
            print("\n\n")
            classifier.show_most_informative_features(20)

    if enabled[1]:
        print("Running Decision Tree classifier")
        timeStart = time.time()
        # NLTK's built-in implementation of the Decision Tree classifier is trained
        classifier = nltk.DecisionTreeClassifier.train(train_data)
        # get the time to train Decision tree
        print ("\nTime to train in seconds: ", time.time() - timeStart)
        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Decision Tree"))
        if verbose:
            print("Printing tree")
            for (feats, cor) in test_data[:20]:
                classification = classifier.classify(feats)
                print("Correct: ", cor, " Result: ", classification)

    if enabled[2]:
        print("Running Maximum Entropy classifier")
        timeStart = time.time()
        # NLTK's built-in implementation of the Max Entropy classifier is trained
        classifier = nltk.MaxentClassifier.train(train_data, max_iter=25)
        if featuresToUse["laugh_count"]:
            DataCreator.pickleData("pickled_data/MaxEnt_Full", classifier)
        else:
            DataCreator.pickleData("pickled_data/MaxEnt_Part", classifier)
        # get the time to train Maximum Entropy (pickling is included)
        print ("\nTime to train in seconds: ", time.time() - timeStart)
        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Maximum Entropy"))
        if verbose:
            # reports the most impactful features the classifier found
            classifier.show_most_informative_features(20)

    if enabled[3]:
        print("Running SVM classifier")
        timeStart = time.time()
        # Scikit-learn's LinearSVC classifier, wrapped up in NLTK's wrapper class
        classifier = _wrap(LinearSVC(), 'PCA', PCA)
        classifier.train(train_data)
        # get the time to train a Support Vector Machine
        print ("\nTime to train in seconds: ", time.time() - timeStart)
        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Linear SVC"))

    if enabled[4]:
        numEstimators = 50
        print("Running AdaBoost classifier")
        timeStart = time.time()
        # Scikit-learn's AdaBoost classifier wrapped up in NLTK's wrapper class
        # The main parameters to tune to obtain good results are:
        # n_estimators and the complexity of the base estimators
        classifier = _wrap(AdaBoostClassifier(n_estimators=numEstimators),
                           'TruncatedSVD', TruncatedSVD)
        classifier.train(train_data)
        # get the time to train
        print ("\nTime to train in seconds: ", time.time() - timeStart)
        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "AdaBoost(" + str(numEstimators) + ")", maxEntSupport))

    if enabled[5]:
        print("Running Random Forest Classifier classifier")
        timeStart = time.time()
        # Scikit-learn's Random Forest classifier wrapped up in NLTK's
        # wrapper class
        # The main parameters to tune to obtain good results are: n_estimators
        classifier = _wrap(RandomForestClassifier(), 'TruncatedSVD', TruncatedSVD)
        classifier.train(train_data)
        # get the time to train
        print ("\nTime to train in seconds: ", time.time() - timeStart)
        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Random Forest", maxEntSupport))

    if enabled[6]:
        numEstimators = 50
        print("Running Combo classifier")
        timeStart = time.time()
        adaclf = SklearnClassifier(AdaBoostClassifier(n_estimators=numEstimators))
        adaclf.train(train_data)
        naive = nltk.NaiveBayesClassifier.train(train_data)
        # get the time to train
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # The combo predicts True only when both models predict True.
        TP = TN = FP = FN = 0
        for feats, label in test_data:
            observed = bool(naive.classify(feats) and adaclf.classify(feats))
            if label == observed:
                if observed:
                    TP += 1
                else:
                    TN += 1
            elif observed:
                FP += 1
            else:
                FN += 1

        accuracy = _ratio(TP + TN, TP + FP + TN + FN)
        p_prec = _ratio(TP, TP + FP)
        p_rec = _ratio(TP, TP + FN)
        f1Pos = _f1(p_prec, p_rec)
        n_prec = _ratio(TN, TN + FN)
        n_rec = _ratio(TN, TN + FP)
        f1Neg = _f1(n_prec, n_rec)
        # store the scores in the table
        table.append(["COMBO", accuracy, p_prec, p_rec, f1Pos, n_prec, n_rec, f1Neg])

    if (outFile == ""):
        print("\n", FeatureExtractor.featuresToString(featuresToUse))
    else:
        with open(outFile, 'a') as out:
            out.write("\n")
            out.write(FeatureExtractor.featuresToString(featuresToUse))
            out.write(tabulate(table, headers=["Classifier", "accuracy", "pos precision", "pos recall", "pos f1", "neg precision", "neg recall", "neg f1"]))
            out.write("\n")
    return table
def evaluate_classifier(featx, balance=False): global negdata global neudata global posdata if balance: neudata = resample(neudata, n_samples=len(negdata)) posdata = resample(posdata, n_samples=len(negdata)) # using 3 classifiers classifier_list = ['svm', 'nb', 'maxent'] negfeats = [(featx(f), 'neg') for f in word_split(negdata)] posfeats = [(featx(f), 'pos') for f in word_split(posdata)] neufeats = [(featx(f), 'neu') for f in word_split(neudata)] alldata = negdata + posdata + neudata allfeats = negfeats + posfeats + neufeats #10-fold cross-validation correct = [] incorrect = [] for n in [10]: #range(2,6): negfeatssplit = chunkIt(negfeats, n) negdatasplit = chunkIt(negdata, n) posfeatssplit = chunkIt(posfeats, n) posdatasplit = chunkIt(posdata, n) neufeatssplit = chunkIt(neufeats, n) neudatasplit = chunkIt(neudata, n) for cl in classifier_list: accuracy = [] pos_precision = [] pos_recall = [] neg_precision = [] neg_recall = [] neu_precision = [] neu_recall = [] pos_fmeasure = [] neg_fmeasure = [] neu_fmeasure = [] cv_count = 1 res = {} res["neg"] = 0 res["pos"] = 0 res["neu"] = 0 for i in range(n): testing_this_round = negfeatssplit[i - 1] + posfeatssplit[ i - 1] + neufeatssplit[i - 1] training_this_round = gettrainfeat( negfeatssplit, i) + gettrainfeat( posfeatssplit, i) + gettrainfeat(neufeatssplit, i) if cl == 'maxent': classifierName = 'Maximum Entropy' classifier = MaxentClassifier.train(training_this_round, 'GIS', trace=0, encoding=None, labels=None, gaussian_prior_sigma=0, max_iter=1) elif cl == 'svm': classifierName = 'SVM' classifier = SklearnClassifier(LinearSVC(), sparse=False) classifier.train(training_this_round) else: classifierName = 'Naive Bayes' classifier = NaiveBayesClassifier.train( training_this_round) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) aux_test = {} auxFP_test = {} aux_test['pos'] = 0 aux_test['neu'] = 0 aux_test['neg'] = 0 auxFP_test['pos'] = 0 auxFP_test['neu'] = 0 auxFP_test['neg'] = 0 for ii, 
(feats, label) in enumerate(testing_this_round): refsets[label].add(ii) observed = classifier.classify(feats) testsets[observed].add(ii) res[observed] = res[observed] + 1 auxFP_test[observed] = auxFP_test[observed] + 1 if (observed == label): correct.append((feats, label)) aux_test[label] = aux_test[label] + 1 else: incorrect.append((feats, label)) cv_accuracy = nltk.classify.util.accuracy( classifier, testing_this_round) cv_neg_precision = float(aux_test['neg']) / float( len(negfeatssplit[i - 1])) print cv_neg_precision cv_neg_recall = float(aux_test['neg']) / float( auxFP_test['neg']) cv_neg_fmeasure = 2 * ((cv_neg_precision * cv_neg_recall) / (cv_neg_precision + cv_neg_recall)) cv_pos_precision = float(aux_test['pos']) / float( len(posfeatssplit[i - 1])) cv_pos_recall = float(aux_test['pos']) / float( auxFP_test['pos']) cv_pos_fmeasure = 2 * ((cv_pos_precision * cv_pos_recall) / (cv_pos_precision + cv_pos_recall)) cv_neu_precision = float(aux_test['neu']) / float( len(neufeatssplit[i - 1])) cv_neu_recall = float(aux_test['neu']) / float( auxFP_test['neu']) cv_neu_fmeasure = 2 * ((cv_neu_precision * cv_neu_recall) / (cv_neu_precision + cv_neu_recall)) #cv_accuracy = float(aux_test['neg'] + aux_test['pos']+ aux_test['neu'])/float(len(testing_this_round)) accuracy.append(cv_accuracy) pos_precision.append(cv_pos_precision) neg_precision.append(cv_neg_precision) neu_precision.append(cv_neu_precision) pos_recall.append(cv_pos_recall) neg_recall.append(cv_neg_recall) neu_recall.append(cv_neu_recall) pos_fmeasure.append(cv_pos_fmeasure) neg_fmeasure.append(cv_neg_fmeasure) neu_fmeasure.append(cv_neu_fmeasure) cv_count += 1 print "Balance = ", balance print '---------------------------------------' print str( n ) + '-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')' print "Nbr = ", res print 'accuracy:', sum(accuracy) / n print 'precision', ((sum(pos_precision) / n) + (sum(neg_precision) / n) + (sum(neu_precision) / n)) / 3.0 print sum(pos_precision) / n, 
sum(neg_precision) / n, sum( neu_precision) / n print 'recall', (sum(pos_recall) / n + sum(neg_recall) / n + sum(neu_recall) / n) / 3.0 print sum(pos_recall) / n, sum(neg_recall) / n, sum(neu_recall) / n print 'f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n + sum(neu_fmeasure) / n) / 3.0 print sum(pos_fmeasure) / n, sum(neg_fmeasure) / n, sum( neu_fmeasure) / n print "*********CORRECT****" print(len(correct), len(incorrect)) #print (correct,incorrect) for tt in correct: print(tt[1], alldata[allfeats.index(tt)]) print "***INCORRECT**********" for tt in incorrect: print(tt[1], alldata[allfeats.index(tt)]) #.index(correct[0])) print "..."
def evaluate_classifier(featx):
    """Evaluate a linear SVM with a 75/25 split and 5-fold cross-validation.

    Prints accuracy plus the pos/neg-averaged precision, recall and
    F-measure, first for the single split and then averaged over the folds.

    Args:
        featx: callable turning a token sequence into a feature dict.
    """
    classifierName = 'SVM'

    def _defined(metric):
        # nltk returns None when a metric is undefined (a label is absent);
        # count it as 0.0 so the averages below do not raise TypeError.
        return 0.0 if metric is None else metric

    def _fit(feats):
        model = SklearnClassifier(LinearSVC(), sparse=False)
        model.train(feats)
        return model

    def _score(model, feats):
        # Returns accuracy followed by precision, recall and F-measure for
        # 'pos' and then for 'neg'.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for idx, (sample, label) in enumerate(feats):
            refsets[label].add(idx)
            testsets[model.classify(sample)].add(idx)
        scores = [nltk.classify.util.accuracy(model, feats)]
        for lab in ('pos', 'neg'):
            scores.append(_defined(nltk.precision(refsets[lab], testsets[lab])))
            scores.append(_defined(nltk.recall(refsets[lab], testsets[lab])))
            scores.append(_defined(nltk.f_measure(refsets[lab], testsets[lab])))
        return scores

    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    (accuracy, pos_precision, pos_recall, pos_fmeasure,
     neg_precision, neg_recall, neg_fmeasure) = _score(_fit(trainfeats),
                                                       testfeats)

    print('')
    print('---------------------------------------')
    print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
    print('---------------------------------------')
    print('accuracy:', accuracy)
    print('precision', (pos_precision + neg_precision) / 2)
    print('recall', (pos_recall + neg_recall) / 2)
    print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)
    print('')

    ## CROSS VALIDATION
    trainfeats = negfeats + posfeats
    # SHUFFLE TRAIN SET
    # Without shuffling, a test chunk might hold only one class.
    random.shuffle(trainfeats)

    n = 5  # 5-fold cross-validation
    subset_size = len(trainfeats) // n

    accuracy = []
    pos_precision = []
    pos_recall = []
    neg_precision = []
    neg_recall = []
    pos_fmeasure = []
    neg_fmeasure = []

    for fold in range(n):
        start = fold * subset_size
        stop = start + subset_size
        testing_this_round = trainfeats[start:stop]
        training_this_round = trainfeats[:start] + trainfeats[stop:]

        (cv_accuracy, cv_pos_precision, cv_pos_recall, cv_pos_fmeasure,
         cv_neg_precision, cv_neg_recall, cv_neg_fmeasure) = _score(
             _fit(training_this_round), testing_this_round)

        accuracy.append(cv_accuracy)
        pos_precision.append(cv_pos_precision)
        pos_recall.append(cv_pos_recall)
        neg_precision.append(cv_neg_precision)
        neg_recall.append(cv_neg_recall)
        pos_fmeasure.append(cv_pos_fmeasure)
        neg_fmeasure.append(cv_neg_fmeasure)

    print('---------------------------------------')
    print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
    print('---------------------------------------')
    print('accuracy:', sum(accuracy) / n)
    print('precision', (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
    print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
    print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)
    print('')
def evaluate_classifier(featx): #negfeats = [(featx(mark_negation(f)), 'neg') for f in word_split(negdata)] #posfeats = [(featx(mark_negation(f)), 'pos') for f in word_split(posdata)] negfeats = [(featx(f), 'neg') for f in word_split(negdata)] #print negfeats[1:25] #raw_input('>') posfeats = [(featx(f), 'pos') for f in word_split(posdata)] negcutoff = len(negfeats) * 3 / 4 poscutoff = len(posfeats) * 3 / 4 trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] print "No of training reviews:", len(trainfeats) #print trainfeats testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] print "No of testing reviews:", len(testfeats) # using 3 classifiers classifier_list = ['nb', 'svm', 'maxent'] # NB_pred = [] new_label = [] for cl in classifier_list: if cl == 'maxent': classifierName = 'Maximum Entropy' classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0, encoding=None, labels=None, gaussian_prior_sigma=0, max_iter=1) elif cl == 'svm': classifierName = 'SVM' classifier = SklearnClassifier(LinearSVC(), sparse=False) classifier.train(trainfeats) else: classifierName = 'Naive Bayes' classifier = NaiveBayesClassifier.train(trainfeats) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) original_label = [] for i, (feats, label) in enumerate(testfeats): refsets[label].add(i) original_label.append(label) #print feats #raw_input('> ') observed = classifier.classify(feats) NB_pred.append(observed) testsets[observed].add(i) #print refsets['pos'] #print testsets['pos'] #print original_label #print NB_Pred #cm = confusion_matrix(original_label,NB_pred) #print cm #print "The accuracy score is {:.2%}".format(accuracy_score(original_label,NB_pred)) new_label = original_label accuracy = nltk.classify.util.accuracy(classifier, testfeats) pos_precision = nltk.precision(refsets['pos'], testsets['pos']) pos_recall = nltk.recall(refsets['pos'], testsets['pos']) pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos']) neg_precision = 
nltk.precision(refsets['neg'], testsets['neg']) neg_recall = nltk.recall(refsets['neg'], testsets['neg']) neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg']) print('') print('---------------------------------------') print('SINGLE FOLD RESULT ' + '(' + classifierName + ')') print('---------------------------------------') print('accuracy:', accuracy) print('precision', (pos_precision + neg_precision) / 2) print('recall', (pos_recall + neg_recall) / 2) print('f-measure', (pos_fmeasure + neg_fmeasure) / 2) #classifier.show_most_informative_features(50) print('') #print len(NB_pred) ME_pred = NB_pred[982:] SVM_pred = NB_pred[491:982] NB_pred = NB_pred[0:491] #print NB_pred #print "-----------------------" #print ME_pred #print "-----------------------" #print SVM_pred #print "-----------------------" #cm = confusion_matrix(SVM_pred,NB_pred) #print cm #print "The accuracy score is {:.2%}".format(accuracy_score(SVM_pred,NB_pred)) #cm = confusion_matrix(ME_pred,NB_pred) #print cm #print "The accuracy score is {:.2%}".format(accuracy_score(ME_pred,NB_pred)) #cm = confusion_matrix(SVM_pred,ME_pred) #print cm #print "The accuracy score is {:.2%}".format(accuracy_score(SVM_pred,ME_pred)) final_pred = [] for i in range(0, 491): c1 = 0 if NB_pred[i] == 'pos': c1 = c1 + 1 if ME_pred[i] == 'pos': c1 = c1 + 1 if SVM_pred[i] == 'pos': c1 = c1 + 1 #print i if c1 == 3 or c1 == 2: final_pred.append('pos') else: final_pred.append('neg') print "-----------------------" #print final_pred print "-----------------------" #print new_label print "Results of ensemble: NB + SVM + ME::" print "----------Confusion Matrix--------------" cm = confusion_matrix(final_pred, new_label) print cm print "" print "The accuracy score of ensemble is {:.2%}".format( accuracy_score(final_pred, new_label)) print "##############################################" ## CROSS VALIDATION trainfeats = negfeats + posfeats # SHUFFLE TRAIN SET # As in cross validation, the test chunk might have only negative 
or only positive data random.shuffle(trainfeats) n = 5 # 5-fold cross-validation for cl in classifier_list: subset_size = len(trainfeats) / n accuracy = [] pos_precision = [] pos_recall = [] neg_precision = [] neg_recall = [] pos_fmeasure = [] neg_fmeasure = [] cv_count = 1 for i in range(n): testing_this_round = trainfeats[i * subset_size:][:subset_size] training_this_round = trainfeats[:i * subset_size] + trainfeats[ (i + 1) * subset_size:] if cl == 'maxent': classifierName = 'Maximum Entropy' classifier = MaxentClassifier.train(training_this_round, 'GIS', trace=0, encoding=None, labels=None, gaussian_prior_sigma=0, max_iter=1) elif cl == 'svm': classifierName = 'SVM' classifier = SklearnClassifier(LinearSVC(), sparse=False) classifier.train(training_this_round) else: classifierName = 'Naive Bayes' classifier = NaiveBayesClassifier.train(training_this_round) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) for i, (feats, label) in enumerate(testing_this_round): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round) cv_pos_precision = nltk.precision(refsets['pos'], testsets['pos']) cv_pos_recall = nltk.recall(refsets['pos'], testsets['pos']) cv_pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos']) cv_neg_precision = nltk.precision(refsets['neg'], testsets['neg']) cv_neg_recall = nltk.recall(refsets['neg'], testsets['neg']) cv_neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg']) accuracy.append(cv_accuracy) pos_precision.append(cv_pos_precision) pos_recall.append(cv_pos_recall) neg_precision.append(cv_neg_precision) neg_recall.append(cv_neg_recall) pos_fmeasure.append(cv_pos_fmeasure) neg_fmeasure.append(cv_neg_fmeasure) cv_count += 1 print('---------------------------------------') print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')') print('---------------------------------------') 
print('accuracy:', sum(accuracy) / n) print('precision', (sum(pos_precision) / n + sum(neg_precision) / n) / 2) print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2) print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2) if cl == 'maxent': maxent_accuracy_next = (sum(accuracy) / n) maxent_accuracy.append(maxent_accuracy_next) elif cl == 'svm': svm_accuracy_next = (sum(accuracy) / n) svm_accuracy.append(svm_accuracy_next) else: nb_accuracy_next = (sum(accuracy) / n) nb_accuracy.append(nb_accuracy_next)
# Degree-5 polynomial-kernel SVM, wrapped for NLTK and trained in one go.
poly_svc = SVC(
    gamma='scale',
    kernel='poly',
    coef0=5.0,
    degree=5,
    C=5.0,
    shrinking=True,
    probability=False,
    tol=1e-3,
)
nonlinear_svm = SklearnClassifier(poly_svc, sparse=False).train(train_set)
print("Accuracy - Nonlinear SVM: ")
print(nltk.classify.accuracy(nonlinear_svm, test_set))

# Random forest with every hyper-parameter spelled out.
forest_params = dict(
    n_estimators=100,
    criterion='gini',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=25,
    max_leaf_nodes=20,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    random_state=None,
)
random_forest = SklearnClassifier(RandomForestClassifier(**forest_params),
                                  sparse=False)
random_forest.train(train_set)
print("Accuracy - Random Forest Classifier: ")
print(nltk.classify.accuracy(random_forest, test_set))

# Spot-check one tweet with both SVM variants.
# NOTE(review): the linear model receives raw=False features and the
# nonlinear one raw=True - confirm the difference is intended.
test_tweet = "75% of illegal Aliens commit Felons such as ID, SSN and Welfare Theft Illegal #Immigration is not a Victimless Crime !"
linear_features = extract_features_of_tweet(test_tweet, raw=False)
print(linear_svm_classifier.classify(linear_features))
nonlinear_features = extract_features_of_tweet(test_tweet, raw=True)
print(nonlinear_svm.classify(nonlinear_features))
class MyClassifier:
    """SVC-based tweet classifier with the labels "VER" and "NVER".

    Features are names read from ``verifiability_features.txt``; a sample is
    a per-feature count of the part-of-speech tags found in a text.
    """

    def __init__(self):
        # The loaders are name-mangled private methods, so they must be
        # called with the double-underscore name.
        self.features = self.__get_support_vector_features()
        self.training_data = []
        self.n_samples = 0
        self.tweets_from_file = self.__get_tweets_from_file()
        self.clf = None

    def __get_tweets_from_file(self):
        """Read (tweet_text, tweet_id) pairs from the newest data-set file.

        Each line is expected to look like ``<text>%\\t%<id>``; lines without
        the separator are skipped.

        :return: list of (tweet_text, tweet_id) tuples
        :raises ValueError: if no data-set file exists
        """
        list_of_files = glob.glob(
            "datasets_twitter/twitter_training_data_set*.txt")
        if not list_of_files:
            raise ValueError(
                "no file matches datasets_twitter/twitter_training_data_set*.txt")
        # open latest file
        latest_file = max(list_of_files, key=os.path.getctime)
        tweet_list = []
        with open(latest_file, "r") as f:
            for line in f:
                parts = line.rstrip("\n").split("%\t%")
                if len(parts) < 2:
                    continue
                tweet_list.append((parts[0], parts[1]))
        return tweet_list

    def __get_support_vector_features(self):
        """Return the feature names, one per line of the features file."""
        with open("verifiability_features.txt", "r") as feature_f:
            return [line_f.replace("\n", "") for line_f in feature_f]

    def __get_sample(self, text_str):
        """
        Changes the text_str into a sample of data in the form of [0, 0, 0, ...]

        Every token is POS-tagged and matched against self.features. A match
        increments that feature's count, except for the last feature: there
        only a "?" token has an effect, and it decrements the count.

        :param text_str: a string of text which is to be verified
        :return: curr_sample, a list of int aligned with self.features
        """
        tokens = pos_tag(word_tokenize(text_str))
        curr_sample = [0] * len(self.features)
        for t_text, t_feature in tokens:
            for index, feature in enumerate(self.features):
                if t_feature != feature:
                    continue
                if feature == self.features[-1]:
                    # the last feature only reacts to a "?" in the text
                    if t_text == "?":
                        curr_sample[index] -= 1
                        break
                else:
                    curr_sample[index] += 1
                    break
        return curr_sample

    def __get_training_target(self, sample):
        """
        Returns the label depending on the sample given.

        :param sample: int[] from self.__get_sample()
        :return: "VER" if no value is negative and the sum is positive,
            otherwise "NVER"
        """
        t_sum = 0
        for v in sample:
            if v < 0:
                # a negative value marks a "?" in the sample text
                t_sum = -1
                break
            t_sum += v
        return "VER" if t_sum > 0 else "NVER"

    def assemble_training_data(self):
        pass

    def train_with_SVC(self):
        """Fit a fresh SVC on self.training_data and store it in self.clf."""
        self.clf = SklearnClassifier(SVC(), sparse=False)
        self.clf.train(self.training_data)

    def predict_single(self, test_text):
        """Classify one text, ask for feedback, and store the labelled sample.

        If the user rejects the prediction, the opposite label is stored.
        """
        test_sample = self.__get_sample(test_text)
        test_dict = dict(zip(self.features, test_sample))
        # SklearnClassifier exposes classify(), not predict().
        pred = self.clf.classify(test_dict)
        print("Prediction:", pred)
        feedback = input("Is this prediction correct? Y/N")
        if feedback == "Y" or feedback == "y":
            target = pred
        elif pred == "VER":
            target = "NVER"
        else:
            target = "VER"
        # add tuple to training data
        self.training_data.append((test_dict, target))

    def predict_multiple(self, test_list):
        """Classify several texts; store them only if all are confirmed."""
        test_data = []
        for text in test_list:
            curr_test_sample = self.__get_sample(text)
            test_data.append(dict(zip(self.features, curr_test_sample)))
        pred = self.clf.classify_many(test_data)
        print("Prediction:", pred)
        feedback = input("Are these predictions correct? Y/N")
        if feedback == "Y" or feedback == "y":
            # Add every (sample, label) pair to the training data
            self.training_data.extend(zip(test_data, pred))
        else:
            # must correct test data manually before adding into training data
            print(
                "Please predict each separately to add samples into training dataset."
            )
## split the training sets into training and validation sets training_set, validation_set = train_test_split(features_sets, test_size=0.2) ###create all the classifier we gonna use classifier = nltk.NaiveBayesClassifier.train(training_set) print "Original Naive Bayes Algo accuracy:", nltk.classify.accuracy(classifier, validation_set)*100 classifier.show_most_informative_features(15) ## save the classifier save_classifier = open("pickled_algos/naivebayes5k.pickle", "wb") pickle.dump(classifier, save_classifier) save_classifier.close() MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) # print "MNB_classifier accuracy:", nltk.classify.accuracy(MNB_classifier, testing_set)*100 # MNB_classifier.show_most_informative_features(15) ##show which features are most distinctive save_classifier = open("pickled_algos/MNB_5k.pickle", "wb") pickle.dump(MNB_classifier, save_classifier) save_classifier.close() BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) # print "BernoulliNB_classifier accuracy:", nltk.classify.accuracy(BernoulliNB_classifier, testing_set)*100 save_classifier = open("pickled_algos/BernoulliNB_5k.pickle", "wb") pickle.dump(BernoulliNB_classifier, save_classifier) save_classifier.close() # LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
def train(self, features_label):
    """Fit an SVC-backed NLTK classifier and keep it on ``self._classifier``.

    ``features_label`` is passed unchanged to ``SklearnClassifier.train``;
    whatever that call returns is what gets stored. Returns None.
    """
    estimator = SVC(C=1000.0, gamma=0.0001)
    self._classifier = SklearnClassifier(estimator).train(features_label)
random.shuffle(rawData) testData = rawData[:size] trainData = rawData[size:] random.shuffle(trainData) # Generate TermFrequency for each doc trainTF = [(FreqDist(tokenize(text)), tag) for text, tag in trainData] testTF = [(FreqDist(tokenize(text)), tag) for text, tag in testData] # Create classifier pipeline = Pipeline([('tfidf', TfidfTransformer()), ('chi2', SelectKBest(chi2, k=1000)), ('nb', MultinomialNB())]) classif = SklearnClassifier(pipeline) # Train classifier classif.train(trainTF) # Evaluate testTags = [tag for tf, tag in testTF] testResults = classif.batch_classify([tf for tf, tag in testTF]) right = 0 for i, tg in enumerate(testTags): if testResults[i] == tg: right += 1 print 'Results: ------------------------------------' print testResults print 'Accuracy:', right / float(len(testTags)) print '---------------------------------------------'
def _load_or_train_classifier(kind, train_feats, pickle_path):
    """Return the classifier cached at *pickle_path*, training and caching it if absent.

    *kind* is 'svm' for a LinearSVC wrapped in SklearnClassifier; anything
    else trains an NLTK NaiveBayesClassifier.
    """
    if os.path.exists(pickle_path):
        # NOTE: pickle.load is only safe for cache files this script wrote itself.
        with open(pickle_path, 'rb') as handle:
            return pickle.load(handle)

    if kind == 'svm':
        classifier = SklearnClassifier(LinearSVC(), sparse=False)
        classifier.train(train_feats)
    else:
        classifier = NaiveBayesClassifier.train(train_feats)

    # Make sure the cache directory exists before writing into it.
    directory = os.path.dirname(pickle_path)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    with open(pickle_path, 'wb') as handle:
        pickle.dump(classifier, handle)
    return classifier


def _score_classifier(classifier, labelled_feats):
    """Return accuracy plus precision/recall/f-measure for the 'positive' and 'negative' labels."""
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for index, (feats, label) in enumerate(labelled_feats):
        refsets[label].add(index)
        testsets[classifier.classify(feats)].add(index)

    return {
        'accuracy': nltk.classify.util.accuracy(classifier, labelled_feats),
        'pos_precision': precision(refsets['positive'], testsets['positive']),
        'pos_recall': recall(refsets['positive'], testsets['positive']),
        'pos_fmeasure': f_measure(refsets['positive'], testsets['positive']),
        'neg_precision': precision(refsets['negative'], testsets['negative']),
        'neg_recall': recall(refsets['negative'], testsets['negative']),
        'neg_fmeasure': f_measure(refsets['negative'], testsets['negative']),
    }


def evaluate_classifier(data, n_folds=5):
    """Evaluate Naive Bayes and linear SVM on *data* and print the results.

    *data* is split 70/30 (fixed random_state) for a single-fold report, then
    the 70% training part is re-used for an ``n_folds``-fold cross-validation.
    Classifiers are cached as pickles under ``pkl/``.

    :param data: labelled featuresets, passed to ``train_test_split``
    :param n_folds: number of cross-validation folds (default 5)
    :raises ValueError: if ``n_folds`` is smaller than 1
    """
    if n_folds < 1:
        raise ValueError('n_folds must be at least 1')

    trainfeats, testfeats = train_test_split(data, test_size=0.3, random_state=0)

    # using 2 classifiers
    classifier_list = ['nb', 'svm']
    classifier_dict = {'nb': 'Naive Bayes', 'svm': 'SVM'}

    for cl in classifier_list:
        classifier = _load_or_train_classifier(
            cl, trainfeats, os.path.join('pkl', cl + ".pkl"))
        scores = _score_classifier(classifier, testfeats)

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifier_dict[cl] + ')')
        print('---------------------------------------')
        print('accuracy:', scores['accuracy'])
        print('precision', (scores['pos_precision'] + scores['neg_precision']) / 2)
        print('recall', (scores['pos_recall'] + scores['neg_recall']) / 2)
        print('f-measure', (scores['pos_fmeasure'] + scores['neg_fmeasure']) / 2)
        #classifier.show_most_informative_features()
        print('')

    # n-fold cross-validation over the training part; leftover samples beyond
    # n_folds * subset_size always stay on the training side.
    subset_size = len(trainfeats) // n_folds
    for cl in classifier_list:
        fold_scores = []
        for fold in range(n_folds):
            start = fold * subset_size
            stop = start + subset_size
            testing_this_round = trainfeats[start:stop]
            training_this_round = trainfeats[:start] + trainfeats[stop:]

            # One cache file per fold: a single shared file would make every
            # later fold reload the first fold's model and score it on data
            # it was trained on.
            pickle_path = os.path.join('pkl', '%s_cv_%d.pkl' % (cl, fold))
            classifier = _load_or_train_classifier(cl, training_this_round, pickle_path)
            fold_scores.append(_score_classifier(classifier, testing_this_round))

        averages = {
            key: sum(scores[key] for scores in fold_scores) / n_folds
            for key in fold_scores[0]
        }

        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifier_dict[cl] + ')')
        print('---------------------------------------')
        print('accuracy:', averages['accuracy'])
        print('precision', (averages['pos_precision'] + averages['neg_precision']) / 2)
        print('recall', (averages['pos_recall'] + averages['neg_recall']) / 2)
        print('f-measure', (averages['pos_fmeasure'] + averages['neg_fmeasure']) / 2)
        print('')
# https://pythonprogramming.net/sklearn-scikit-learn-nltk-tutorial/
import nltk
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.classify import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


def _word_features(text):
    """Return a bag-of-words presence featureset ({token: True}) for *text*."""
    return {token: True for token in str(text).lower().split()}


data = pd.read_csv('reviews.csv')
print(data[['text', 'sentiment']])
data = data[['text', 'sentiment']]

# SklearnClassifier.train() and nltk.classify.accuracy() both expect a
# sequence of (featureset_dict, label) pairs, not a DataFrame, so convert the
# rows before splitting.
labeled_featuresets = [
    (_word_features(text), sentiment)
    for text, sentiment in zip(data['text'], data['sentiment'])
]
training_set, testing_set = train_test_split(labeled_featuresets, test_size=0.1)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:",
      (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)
# Report the accuracy of the classifier built just before this point.
print("Accuracy :")
accuracy = nltk.classify.util.accuracy(classifier, test_set)
print(accuracy * 100)

print("Building model : Naive Bayes")
classifier = NaiveBayesClassifier.train(train_set)

import pickle

# `with` closes the file even if pickling raises.
with open('naive_classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)

print("Accuracy :")
accuracy = nltk.classify.util.accuracy(classifier, test_set)
print(accuracy * 100)

print("Building model : SVM")
classifier = SklearnClassifier(LinearSVC(), sparse=True)
classifier.train(train_set)

with open('svm_classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)

print("Accuracy :")
accuracy = nltk.classify.util.accuracy(classifier, test_set)
print(accuracy * 100)
print("model saved")
for f in word_split(posdata)] negcutoff = int(len(negfeats) * 10 / 11) poscutoff = int(len(posfeats) * 10 / 11) trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] ######################################################################################## ######################################################################################## refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) str = 'SINGLE FOLD RESULT ' + '(' + 'linear-svc' + ')' #training with LinearSVC classifier = SklearnClassifier(LinearSVC()) classifier.train(trainfeats) for i, (feats, label) in enumerate(testfeats): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) accuracy = nltk.classify.util.accuracy(classifier, testfeats) * 100 pos_precision = nltk.precision(refsets['pos'], testsets['pos']) pos_recall = recall(refsets['pos'], testsets['pos']) pos_fmeasure = f_measure(refsets['pos'], testsets['pos']) neg_precision = precision(refsets['neg'], testsets['neg']) neg_recall = recall(refsets['neg'], testsets['neg']) neg_fmeasure = f_measure(refsets['neg'], testsets['neg']) print('') print('---------------------------------------') print(str)
# just break on gaps -- note that this doesn't filter out punctuation tokenizer = RegexpTokenizer('[\w\d]+') training_set = [] with open('train_jlm.csv', 'rb') as f: reader = csv.reader(f) for row in reader: if row[0] != 'OrganisationId': # header words = tokenizer.tokenize(row[1]) if row[4] == 'Academic': training_set.append((words, 'academic')) else: training_set.append((words, 'private')) classif.train([(FreqDist(words), label) for (words, label) in training_set]) training_set_classification = classif.batch_classify( [FreqDist(words) for (words, label) in training_set]) print "training set score:", sum([training_set[i][1] == training_set_classification[i] for i in range(len(training_set))]), '/', len(training_set) training_set_prob_classification = classif.batch_prob_classify( [FreqDist(words) for (words, label) in training_set]) full_set = [] with open('organisations.csv', 'rb') as f: reader = csv.reader(f) for row in reader: if row[0] != 'OrganisationId': # header organisation_ids = row[0] words = tokenizer.tokenize(row[1])