def set_classifier(chosen_classifier, train_set, sentence):
    """Train a classifier and tally per-word sentiment votes for a sentence.

    ``chosen_classifier`` is wrapped in ``SklearnClassifier`` and trained on
    ``train_set``.  Every element of ``sentence`` is converted with
    ``word_feats`` and classified on its own; 'pos' and 'neg' results are
    counted (any other label is ignored).

    Returns:
        A tuple ``(posPercent, negPercent, pos, neg)``.  The first two are
        the fractions ``pos / len(sentence)`` and ``neg / len(sentence)``
        rendered as strings; the last two are the integer vote counts.
        For an empty ``sentence`` both fractions are ``'0.0'`` (this used
        to raise ZeroDivisionError).
    """
    classifier = SklearnClassifier(chosen_classifier)
    classifier.train(train_set)

    pos = 0
    neg = 0
    for word in sentence:
        label = classifier.classify(word_feats(word))
        if label == 'pos':
            pos += 1
        elif label == 'neg':
            neg += 1

    total = len(sentence)
    if total:
        pos_fraction = float(pos) / total
        neg_fraction = float(neg) / total
    else:
        # Nothing was classified, so there is no ratio to compute.
        pos_fraction = neg_fraction = 0.0

    return str(pos_fraction), str(neg_fraction), pos, neg
def getSubjObj(self, text):
    """Label ``text`` as "neutral", "negative" or "positive".

    The text is split on single spaces, wrapped in ``Text`` and converted to
    bigrams.  The subjectivity model decides first: an "objective" result
    yields "neutral".  Otherwise the polarity model is consulted and anything
    other than "negative" is reported as "positive".
    """
    tokens = Text(text.split(" "))
    bigram_feats = self.getBigrams(tokens)

    # Both models are loaded up front, in the same order as before.
    subjectivity_model = self.loadSOClsssifier()
    polarity_model = self.loadPNClsssifier()

    if SklearnClassifier.classify(subjectivity_model, bigram_feats) == "objective":
        return "neutral"

    polarity = SklearnClassifier.classify(polarity_model, bigram_feats)
    return "negative" if polarity == "negative" else "positive"
def evaluate(classifier_alo):
    """Train ``classifier_alo`` on ``trainFeatures`` and print test metrics.

    The estimator is wrapped in NLTK's scikit-learn interface, trained on the
    module-level ``trainFeatures`` and evaluated on the module-level
    ``testFeatures``, whose items are indexed as ``item[0]`` (featureset) and
    ``item[1]`` (gold label).

    Prints one space-separated line of six values with three decimals:
    pos precision, pos recall, neg precision, neg recall, pos F1, neg F1.
    A precision or recall that NLTK reports as ``None`` (empty predicted or
    reference set) is printed as ``nan``.  F1 is ``nan`` when either input is
    undefined and ``0.000`` when precision + recall is zero; previously
    these cases raised TypeError / ZeroDivisionError.
    """
    classifier = SklearnClassifier(classifier_alo)  # NLTK's interface to scikit-learn
    classifier.train(trainFeatures)  # train the classifier

    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, item in enumerate(testFeatures):
        referenceSets[item[1]].add(i)
        testSets[classifier.classify(item[0])].add(i)

    def _as_float(value):
        # nltk.metrics.precision / recall return None when undefined.
        return float('nan') if value is None else float(value)

    def _f1(precision, recall):
        if precision is None or recall is None:
            return float('nan')
        precision = float(precision)
        recall = float(recall)
        if precision + recall == 0:
            return 0.0
        return 2 * (precision * recall) / (recall + precision)

    pos_pre = nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
    pos_recall = nltk.metrics.recall(referenceSets['pos'], testSets['pos'])
    neg_pre = nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
    neg_recall = nltk.metrics.recall(referenceSets['neg'], testSets['neg'])

    values = [
        _as_float(pos_pre),
        _as_float(pos_recall),
        _as_float(neg_pre),
        _as_float(neg_recall),
        _f1(pos_pre, pos_recall),
        _f1(neg_pre, neg_recall),
    ]
    print(" ".join('{0:.3f}'.format(value) for value in values))
class SKClassifier:
    """Thin wrapper around NLTK's ``SklearnClassifier``.

    ``cls`` selects the scikit-learn estimator by name: 'SVC',
    'LogisticRegression' or 'BernoulliNB'.  Any other name falls back to
    ``SVC``.  That fallback was the evident intent of the original code, but
    it could never run because the dictionary lookup raised KeyError first.
    """

    classifier = None

    def __init__(self, cls='SVC'):
        # Map names to classes (not instances) so that only the selected
        # estimator is actually constructed.
        estimators = {
            'SVC': SVC,
            'LogisticRegression': LogisticRegression,
            'BernoulliNB': BernoulliNB,
        }
        self.classifier = SklearnClassifier(estimators.get(cls, SVC)())

    def train(self, trainset):
        """Train the wrapped classifier on ``trainset``."""
        self.classifier.train(trainset)

    def test(self, tagged, featuresets):
        """Classify ``featuresets``, print the predictions and return
        ``accuracy_score(tagged, predictions)``."""
        predict = self.classifier.classify_many(featuresets)
        # Parenthesised single-argument form prints the same on Python 2 and 3.
        print(predict)
        return accuracy_score(tagged, predict)

    def classify(self, featureset):
        """Return the predicted label for a single featureset."""
        return self.classifier.classify(featureset)

    def classify_many(self, featuresets):
        """Return the predicted labels for a sequence of featuresets."""
        return self.classifier.classify_many(featuresets)
def classification(value):
    """Train an SGD linear model on the NER training CSV and tag the test CSV.

    Reads ``ner_dataset.csv`` and ``ner_test.csv`` from the working
    directory, builds feature sets with ``obtain_training_set`` /
    ``obtain_testset``, trains ``SGDClassifier`` through NLTK's scikit-learn
    wrapper and prints its accuracy measured on the training feature set
    itself.  ``value`` is not used.

    Returns:
        ``(0, 0, 0, accuracy, solution)`` where ``solution`` is a list of
        ``(word, predicted_tag)`` pairs, one per row of the test CSV's
        'Word' column.
    """
    training_frame = pd.read_csv("ner_dataset.csv")
    test_frame = pd.read_csv("ner_test.csv")
    print("Done it")

    labelled_feats = obtain_training_set(training_frame)
    test_feats = obtain_testset(test_frame)

    linear_model = SklearnClassifier(SGDClassifier()).train(labelled_feats)
    train_accuracy = nltk.classify.accuracy(linear_model, labelled_feats)
    print("Accuracy of linear model", train_accuracy)

    words = test_frame['Word']
    solution = []
    for row in range(len(words)):
        tag = str(linear_model.classify(test_feats[row]))
        solution.append((words[row], tag))

    # The three leading zeros are placeholders kept for callers.
    return 0, 0, 0, train_accuracy, solution
def randomforests(num_folds, featuresets, label_list):
    """Cross-validate a random forest over ``num_folds`` folds and print metrics.

    ``featuresets`` is sliced into ``num_folds`` consecutive folds of
    ``int(len(featuresets) / num_folds)`` items.  Each fold in turn is held
    out while a fresh ``RandomForestClassifier`` (wrapped in
    ``SklearnClassifier``) is trained on the rest.  Per-fold accuracy is
    printed, then the mean accuracy and the precision/recall report
    produced by ``eval_measures`` / ``print_evaluation`` over all held-out
    predictions.
    """
    fold_size = int(len(featuresets) / num_folds)

    gold_labels = []       # reference label of every held-out instance
    predicted_labels = []  # matching predictions, in the same order
    fold_accuracies = []

    print("Random Forests Classifier")
    for fold in range(num_folds):
        print('Start Fold', fold)
        start = fold * fold_size
        stop = (fold + 1) * fold_size
        held_out = featuresets[start:stop]
        training = featuresets[:start] + featuresets[stop:]

        forest = SklearnClassifier(RandomForestClassifier())
        forest.train(training)

        fold_accuracy = nltk.classify.accuracy(forest, held_out)
        print(fold, fold_accuracy)
        fold_accuracies.append(fold_accuracy)

        # Keep gold and predicted labels aligned across all folds.
        for features, label in held_out:
            gold_labels.append(label)
            predicted_labels.append(forest.classify(features))

    print('Done with cross-validation')
    print('mean accuracy-', sum(fold_accuracies) / num_folds)

    precision_list, recall_list = eval_measures(gold_labels, predicted_labels, label_list)
    print_evaluation(precision_list, recall_list, label_list)
    print(" ")
def main():
    """Train a dialogue-act classifier on NPS Chat and tag ``test-inputs.txt``.

    A ``LinearSVC`` (via NLTK's scikit-learn wrapper) is trained on 99% of the
    NPS Chat posts and its accuracy on the remaining 1% is printed.  Every
    line of ``test-inputs.txt`` is then classified; ``"<line>, 1"`` is
    appended to ``output.txt`` when the predicted class name contains
    'Question' and ``"<line>, 0"`` otherwise.  The raw line and the predicted
    class are also written to the 'Group A' sheet of ``QuestionAnalysis.xls``.
    """
    posts = nltk.corpus.nps_chat.xml_posts()
    print(len(posts))
    print(sorted(nltk.FreqDist(p.attrib['class'] for p in posts).keys()))

    featuresets = []
    prev_ = None
    for post in posts:
        post_class = post.get('class')
        featuresets.append((dialogue_act_features(post.text, prev_), post_class))
        # During training only non-'Statement' classes are carried forward
        # as the "previous class" context feature.
        if post_class != 'Statement':
            prev_ = post_class

    size = int(len(featuresets) * 0.01)
    train_set, test_set = featuresets[size:], featuresets[:size]

    # Linear support vector classification.
    classif = SklearnClassifier(LinearSVC())
    classif.train(train_set)
    print("Accuracy : ", nltk.classify.accuracy(classif, test_set) * 100)

    book = xlwt.Workbook()
    sh1 = book.add_sheet('Group A')

    classAprev = None
    # Both files are managed by ``with`` so the output handle is closed too
    # (it used to be left open).
    with open("output.txt", "a", encoding='utf-8') as openFile, \
            open('test-inputs.txt', 'r', encoding='utf-8') as groupA:
        for index, text in enumerate(groupA):
            label = classif.classify(dialogue_act_features(text, classAprev))
            # NOTE(review): unlike the training loop above, every predicted
            # class (including 'Statement') becomes the next line's context.
            # Behaviour is kept as-is; confirm whether that is intended.
            classAprev = label
            is_question = "1" if label.find('Question') != -1 else "0"
            openFile.write(text.rstrip() + ", " + is_question + "\n")
            sh1.write(index, 0, text)
            sh1.write(index, 1, label)

    book.save('QuestionAnalysis.xls')
def mnb_classifier(dataset):
    """Train a Multinomial Naive Bayes model on ``dataset`` and report on it.

    Labelled feature sets are built with ``label_feats_from_data`` using the
    ``bag_of_non_stopwords`` extractor and split 70/30 by
    ``train_test_split``.  The held-out predictions are handed to
    ``generate_report`` under the name 'bow_mnb'.
    """
    label_feats = label_feats_from_data(dataset, bag_of_non_stopwords)
    train_feats, test_feats = train_test_split(
        label_feats, train_size=0.7, test_size=0.3)

    mnb_classify = SklearnClassifier(MultinomialNB())
    mnb_classify.train(train_feats)

    # ``classify`` accepts a single featureset.  The held-out split has the
    # same (featureset, label) pair shape that ``train`` requires, so strip
    # the labels and classify the whole batch with ``classify_many``.
    # NOTE(review): assumes generate_report expects the list of predicted
    # labels as its first argument; confirm against its definition.
    result = mnb_classify.classify_many([feats for feats, _label in test_feats])
    generate_report(result, 'bow_mnb', class_list)
def create_bnb_classifier(trainingset, testingset):
    """Train a Bernoulli Naive Bayes classifier and print its test accuracy.

    Args:
        trainingset: labelled feature sets passed to ``SklearnClassifier.train``.
        testingset: iterable of items whose element 0 is a featureset and
            element 1 the expected label.

    Returns:
        The trained ``SklearnClassifier``.

    The accuracy is printed as a percentage.  It is reported as 0.0 for an
    empty ``testingset`` instead of raising ZeroDivisionError.
    """
    print("\nBernoulli Naive Bayes classifier is being trained and created...")
    BNB_classifier = SklearnClassifier(BernoulliNB())
    BNB_classifier.train(trainingset)

    correct = 0
    total = 0
    for sample in testingset:
        total += 1
        if BNB_classifier.classify(sample[0]) == sample[1]:
            correct += 1

    # float() keeps this a true division on Python 2 as well.
    accuracy = float(correct) / total * 100 if total else 0.0
    print("BernoulliNB accuracy percent = " + str(accuracy))
    return BNB_classifier
def create_logistic_regression_classifier(trainingset, testingset):
    """Train a Logistic Regression classifier and print its test accuracy.

    Args:
        trainingset: labelled feature sets passed to ``SklearnClassifier.train``.
        testingset: iterable of items whose element 0 is a featureset and
            element 1 the expected label.

    Returns:
        The trained ``SklearnClassifier``.

    The accuracy is printed as a percentage.  It is reported as 0.0 for an
    empty ``testingset`` instead of raising ZeroDivisionError.
    """
    print("\nLogistic Regression classifier is being trained and created...")
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(trainingset)

    correct = 0
    total = 0
    for sample in testingset:
        total += 1
        if LogisticRegression_classifier.classify(sample[0]) == sample[1]:
            correct += 1

    # float() keeps this a true division on Python 2 as well.
    accuracy = float(correct) / total * 100 if total else 0.0
    print("Logistic Regression classifier accuracy = " + str(accuracy))
    return LogisticRegression_classifier
def handle(self, *args, **options):
    """Continuously classify unlabelled tweets with Naive Bayes and LinearSVC.

    Both classifiers are trained on the current training tweets, then the
    method loops forever: tweets with ``train=False`` and ``klass=None`` are
    classified by both models and saved, with running per-class counts
    printed on a single console line.

    Raises:
        CommandError: if ``get_train_tweets()`` returns nothing truthy.

    NOTE(review): this block was reconstructed from a flattened source line.
    The nesting of the retrain check and of the final ``else`` (attached here
    to ``if total_count > 0``) should be confirmed against the original file.
    """
    trains = get_train_tweets()
    if not trains:
        raise CommandError('No train data, please add some from the admin page!')
    # Remembered so a change in the number of training tweets can be detected.
    train_count = trains.count()
    train_set = generate_trainset(trains)
    nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
    sci_classifier = SklearnClassifier(LinearSVC())
    sci_classifier.train(train_set)
    while True:
        unclassified_tweets = Tweet.objects.filter(train=False, klass=None)
        total_count = unclassified_tweets.count()
        if total_count > 0:
            print('Classifying %d tweets...' % total_count)
            counts_nb = defaultdict(int)
            counts_svm = defaultdict(int)
            start_time = time.time()
            for tweet in unclassified_tweets:
                feature_vect = get_feature_vector(process_tweet(tweet.body))
                features = extract_features(feature_vect)
                sentiment_nb = nb_classifier.classify(features)
                sentiment_svm = sci_classifier.classify(features)
                counts_nb[sentiment_nb] += 1
                counts_svm[sentiment_svm] += 1
                tweet.klass = sentiment_nb
                tweet.klass_svm = sentiment_svm
                # '\r' with end='' redraws the running totals on one line.
                msg_nb = ['%d %s' % (counts_nb[k], v) for k, v in Tweet.CLASSES]
                msg_svm = ['%d %s' % (counts_svm[k], v) for k, v in Tweet.CLASSES]
                print('\rNB: ' + ', '.join(msg_nb) + ';\tSVM: ' + ', '.join(msg_svm), end='')
                tweet.save()
                # presumably clears Django's debug query log so it does not
                # grow in this endless loop; confirm
                if settings.DEBUG:
                    db.reset_queries()
            elapsed = int(time.time() - start_time)
            print('\nClassifying finished in %d seconds.' % elapsed)
            # Retrain both models when the number of training tweets changed.
            new_trains = get_train_tweets()
            if new_trains.count() != train_count:
                print('Train set has been changed, retraining...')
                trains = new_trains
                train_count = new_trains.count()
                train_set = generate_trainset(trains)
                nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
                sci_classifier = SklearnClassifier(LinearSVC())
                sci_classifier.train(train_set)
        else:
            print('Waiting...')
            time.sleep(3)
def create_mnb_classifier(trainingset, testingset):
    """Train a Multinomial Naive Bayes classifier and print its test accuracy.

    Args:
        trainingset: labelled feature sets passed to ``SklearnClassifier.train``.
        testingset: iterable of items whose element 0 is a featureset and
            element 1 the expected label.

    Returns:
        The trained ``SklearnClassifier``.

    The accuracy is printed as a percentage.  It is reported as 0.0 for an
    empty ``testingset`` instead of raising ZeroDivisionError.
    """
    print(
        "\nMultinomial Naive Bayes classifier is being trained and created...")
    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(trainingset)

    correct = 0
    total = 0
    for sample in testingset:
        total += 1
        if MNB_classifier.classify(sample[0]) == sample[1]:
            correct += 1

    # float() keeps this a true division on Python 2 as well.
    accuracy = float(correct) / total * 100 if total else 0.0
    print("MultinomialNB Classifier accuracy = " + str(accuracy))
    return MNB_classifier
class Bernoulli:
    """Bernoulli Naive Bayes classifier separating "selected" from "rejected" tweets.

    Tweets are token sequences.  Each becomes a dict of boolean
    ``contains(<word>)`` features over the vocabulary learned in ``train``.
    """

    def __init__(self):
        self.classifier = None
        self.word_features = None

    def train(self, listaTweets, listaTweets2):
        """Learn the vocabulary and fit the model.

        ``listaTweets`` holds the selected tweets, ``listaTweets2`` the
        rejected ones.
        """
        self.word_features = self.features(listaTweets, listaTweets2)
        labelled = self.get_training_set(listaTweets, listaTweets2)
        self.classifier = SklearnClassifier(BernoulliNB())
        self.classifier.train(labelled)

    def features(self, selected_tweets, rejected_tweets):
        """Return the words of both groups, each group ordered by frequency.

        The result is the selected group's words followed by the rejected
        group's words, so a word present in both appears twice.
        """
        def ranked_words(tweets):
            # Flatten the tweets into one token array before counting.
            flat = np.hstack(np.array(tweets, dtype=object).flat)
            ranked, _counts = zip(*nltk.FreqDist(flat).most_common())
            return ranked

        return ranked_words(selected_tweets) + ranked_words(rejected_tweets)

    def extract_features(self, tweet):
        """Map ``tweet`` to ``{'contains(word)': bool}`` over the vocabulary.

        Prints a message and exits the process with status 1 when the model
        has not been trained yet.
        """
        if self.word_features is None:
            print("Bernoulli must be trained before classifying")
            sys.exit(1)

        present = set(tweet)
        return {
            'contains(%s)' % word: (word in present)
            for word in self.word_features
        }

    def get_training_set(self, selected_tweets, rejected_tweets):
        """Return ``(features, label)`` pairs: selected tweets first, then rejected."""
        labelled = [
            (self.extract_features(tweet), "selected")
            for tweet in selected_tweets
        ]
        labelled.extend(
            (self.extract_features(tweet), "rejected")
            for tweet in rejected_tweets
        )
        return labelled

    def classify(self, inputs):
        """Return the predicted label for ``inputs``, or None when untrained."""
        if self.classifier is None:
            return None
        return self.classifier.classify(self.extract_features(inputs))
def train_Classifier(self, posfeats, negfeats, index):
    """Train one of three classifiers on the first 70% of each class.

    Args:
        posfeats: labelled feature sets of the positive class.
        negfeats: labelled feature sets of the negative class.
        index: 0 selects NLTK's Maxent classifier (GIS, 5 iterations),
            1 selects Bernoulli Naive Bayes, any other value Logistic
            Regression (the latter two through ``SklearnClassifier``).

    Returns:
        The trained classifier.

    The remaining 30% of each class is not used here.  The evaluation that
    consumed it only fed print statements that were commented out, so the
    per-instance classification pass over it has been removed.

    TODO: the training-set percentage should be passed as an argument.
    """
    # 70/30 split per class; integer arithmetic avoids float rounding.
    posCutoff = int(math.floor(len(posfeats) * 7 / 10))
    negCutoff = int(math.floor(len(negfeats) * 7 / 10))
    trainFeatures = posfeats[:posCutoff] + negfeats[:negCutoff]

    if index == 0:
        # Maximum Entropy
        classifier = nltk.classify.maxent.MaxentClassifier.train(
            trainFeatures, 'GIS', trace=3, encoding=None, labels=None,
            gaussian_prior_sigma=0, max_iter=5)
    elif index == 1:
        # Bernoulli Naive Bayes
        classifier = SklearnClassifier(BernoulliNB())
        classifier.train(trainFeatures)
    else:
        # Logistic Regression
        classifier = SklearnClassifier(LogisticRegression())
        classifier.train(trainFeatures)

    return classifier
def logistic_classifier(file):
    """Train Logistic Regression on ``train_data`` and tag every line of ``file``.

    The model's accuracy on the module-level ``test_data`` is printed as a
    percentage.  Each line of ``file`` is lower-cased, tokenized and turned
    into a ``{feature: present?}`` dict over ``word_features``; the predicted
    label is written, one per line, to the module-level ``results`` handle.

    Args:
        file: path of the text file to tag (converted with ``str``).
    """
    file = str(file)
    logistic_model = SklearnClassifier(LogisticRegression())
    # Train the model on the training data.
    logistic_model.train(train_data)
    accuracy = nltk.classify.accuracy(logistic_model, test_data) * 100
    print("Logistic Regression Classifier Accuracy: {}".format(accuracy))

    # Tag the test file.
    with open(file, 'r') as fin:
        for test_sentence in fin:
            # A set gives constant-time membership for each vocabulary word
            # instead of rescanning the token list every time.
            tokens = set(word_tokenize(test_sentence.lower()))
            featurized_doc = {
                feature: (feature in tokens) for feature in word_features
            }
            tagged_label = logistic_model.classify(featurized_doc)
            results.write(str(tagged_label) + '\n')
def naive_classifier(file):
    """Train Multinomial Naive Bayes on ``train_data`` and tag every line of ``file``.

    The model's accuracy on the module-level ``test_data`` is printed as a
    percentage.  Each line of ``file`` is lower-cased, tokenized and turned
    into a ``{feature: present?}`` dict over ``word_features``; the predicted
    label is written, one per line, to the module-level ``results`` handle.

    Args:
        file: path of the text file to tag (converted with ``str``).
    """
    file = str(file)
    naive_bayes_model = SklearnClassifier(MultinomialNB())
    # Train the model on the training data.
    naive_bayes_model.train(train_data)
    accuracy = nltk.classify.accuracy(naive_bayes_model, test_data) * 100
    print("Naive Bayes Classifier Accuracy: {}".format(accuracy))

    # Tag the test file.
    with open(file, 'r') as fin:
        for test_sentence in fin:
            # A set gives constant-time membership for each vocabulary word
            # instead of rescanning the token list every time.
            tokens = set(word_tokenize(test_sentence.lower()))
            featurized_doc = {
                feature: (feature in tokens) for feature in word_features
            }
            tagged_label = naive_bayes_model.classify(featurized_doc)
            results.write(str(tagged_label) + '\n')
def ImplementBNB(self):
    """Train BernoulliNB on ``trainFeatures`` and print a test report.

    The model is evaluated on ``testFeatures``.  The output covers
    accuracy, precision/recall for the 'Positive' and 'Negative' labels, a
    scikit-learn classification report and a confusion matrix.
    """
    print("~~~~~~~~~~~~~~~ BernoulliNB Classifier ~~~~~~~~~~~~~~~\n")
    model = SklearnClassifier(BernoulliNB())
    model.train(trainFeatures)
    print("BernoulliNB Classifier Training Completed")

    # Index sets per label: gold labels on one side, predictions on the other.
    gold_sets = collections.defaultdict(set)
    guess_sets = collections.defaultdict(set)
    gold_labels = []
    guess_labels = []
    for position, (feats, gold) in enumerate(testFeatures):
        guess = model.classify(feats)
        gold_sets[gold].add(position)
        guess_sets[guess].add(position)
        gold_labels.append(gold)
        guess_labels.append(guess)

    # Metrics showing how well the feature selection did.
    print("BernoulliNB Classifier Test Results ")
    print("")
    print("Length of Training Features" + str(len(trainFeatures)))
    print("Length of Test Features" + str(len(testFeatures)))
    print('Accuracy:' + str(nltk.classify.util.accuracy(model, testFeatures)))
    print('Positive precision:',
          str(precision(gold_sets['Positive'], guess_sets['Positive'])))
    print('Positive recall:',
          str(recall(gold_sets['Positive'], guess_sets['Positive'])))
    print('Negative precision:',
          str(precision(gold_sets['Negative'], guess_sets['Negative'])))
    print('Negative recall:',
          str(recall(gold_sets['Negative'], guess_sets['Negative'])))
    print("~~~~~~~~~~~~~~~Classification report~~~~~~~~~~~~~~~\n",
          classification_report(gold_labels, guess_labels))
    print("~~~~~~~~~~~~~~~Confusion matrix~~~~~~~~~~~~~~~\n",
          confusion_matrix(gold_labels, guess_labels))
    print("")
def multinomial_bayes_nltk_wrapper(corpus, documents_training, documents_test, words_features, smoothing, kbest):
    """
    Multinomial Naive Bayes Algorithm using wrapper NLTK SklearnClassifier

    Trains a chi2 feature selection -> TF-IDF -> MultinomialNB pipeline on the
    training documents and classifies the test documents.
    Memory problems can occur if very large dataset

    :param corpus: corpus handle forwarded to the ``util_classify`` helpers
    :param documents_training: iterable of ``(id, category, annotations)``
        triples used for training
    :param documents_test: iterable of ``(id, category, annotations)``
        triples to classify
    :param words_features: feature vocabulary forwarded to
        ``util_classify.transform_document_in_dict``
    :param smoothing: ``alpha`` of ``MultinomialNB``
    :param kbest: number of features kept by ``SelectKBest``; 0 keeps all
    :return: ``(original_categories, estimated_categories)``, two parallel
        lists of indices into ``util_classify.get_categories(corpus)``
    """
    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print("")
    print("----- Multinomial Bayes with wrapper nltk Algorithm------")
    print("Creating Training Feature Vectors...")
    array_features_training = [
        (util_classify.transform_document_in_dict(annotations, words_features, corpus),
         original_category)
        for (_doc_id, original_category, annotations) in documents_training
    ]

    print("Training algorithm...")
    if kbest == 0:
        kbest = "all"
    pipeline = Pipeline([('chi2', SelectKBest(chi2, k=kbest)),
                         ('tfidf', TfidfTransformer()),
                         ('nb', MultinomialNB(alpha=smoothing))])
    classifier = SklearnClassifier(pipeline)
    classifier.train(array_features_training)

    print("Calculating metrics ...")
    categories = util_classify.get_categories(corpus)
    estimated_categories = []
    original_categories = []
    for (_doc_id, cat_original, annotations) in documents_test:
        cat_estimated = classifier.classify(
            util_classify.transform_document_in_dict(annotations, words_features, corpus))
        estimated_categories.append(categories.index(cat_estimated))
        original_categories.append(categories.index(cat_original))
    return original_categories, estimated_categories
class Classifier:
    """The Classifier"""

    # Column names assigned to the tab-separated data files.
    _COLUMNS = (
        'polarity_label', 'aspect_category', 'term', 'char_term_offset',
        'sentence',
    )

    def _load_feature_set(self, path):
        """Read ``path`` and return its NLTK-compatible feature set."""
        frame = pd.read_csv(path, delimiter='\t', names=list(self._COLUMNS))
        frame, feat_list = preprocessor(frame)
        return nltk_compatible(frame, feat_list)

    #############################################
    def train(self, trainfile):
        """Trains the classifier model on the training set stored in file trainfile

        Only the first 75% of the feature set is used for fitting; the
        remainder is left untouched.
        """
        feat_set = self._load_feature_set(trainfile)
        split = int(len(feat_set) * 0.75)
        self.main_classifier = SklearnClassifier(svm.LinearSVC())
        self.main_classifier.train(feat_set[:split])

    def predict(self, datafile):
        """Predicts class labels for the input instances in file 'datafile'
        Returns the list of predicted labels
        """
        return [
            self.main_classifier.classify(sentence)
            for (sentence, _label) in self._load_feature_set(datafile)
        ]
def linear_support_vector_machines_tf_idf(corpus, documents_training, documents_test, words_features, kbest):
    """
    Linear Support Vector Machines Algorithm.

    The Support Vector Machines algorithm with a linear kernel and using TF/IDF.
    Trains a chi2 feature selection -> TF-IDF -> LinearSVC pipeline on the
    training documents and classifies the test documents.

    :param corpus: corpus handle forwarded to the ``util_classify`` helpers
    :param documents_training: iterable of ``(id, category, annotations)``
        triples used for training
    :param documents_test: iterable of ``(id, category, annotations)``
        triples to classify
    :param words_features: feature vocabulary forwarded to
        ``util_classify.transform_document_in_dict``
    :param kbest: number of features kept by ``SelectKBest``; 0 keeps all
    :return: ``(original_categories, estimated_categories)``, two parallel
        lists of indices into ``util_classify.get_categories(corpus)``
    """
    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print("")
    print("----- Linear Support Vector Machines with tfidf algorithm ------")
    print("Creating Features Training Vectors...")
    categories = util_classify.get_categories(corpus)
    array_features_training = [
        (util_classify.transform_document_in_dict(annotations, words_features, corpus),
         original_category)
        for (_doc_id, original_category, annotations) in documents_training
    ]

    print("Training algorithm...")
    if kbest == 0:
        kbest = "all"
    pipeline = Pipeline([('chi2', SelectKBest(chi2, k=kbest)),
                         ('tfidf', TfidfTransformer()),
                         ('svc', LinearSVC())])
    classifier = SklearnClassifier(pipeline)
    classifier.train(array_features_training)

    print("Calculating metrics...")
    estimated_categories = []
    original_categories = []
    for (_doc_id, cat_original, annotations) in documents_test:
        cat_estimated = classifier.classify(
            util_classify.transform_document_in_dict(annotations, words_features, corpus))
        estimated_categories.append(categories.index(cat_estimated))
        original_categories.append(categories.index(cat_original))
    return original_categories, estimated_categories
def train_Classifier(posfeats, negfeats, index):
    """Train one of three classifiers on a 70/30 split and print test metrics.

    Args:
        posfeats: ``(featureset, label)`` pairs of the positive class.
        negfeats: ``(featureset, label)`` pairs of the negative class.
        index: 0 selects NLTK's Maxent classifier (GIS, 5 iterations),
            1 selects Bernoulli Naive Bayes, any other value Logistic
            Regression (the latter two through ``SklearnClassifier``).

    Returns:
        The trained classifier.

    The first 70% of each class is used for training, the rest for testing.
    Accuracy and precision/recall for the 'pos' and 'neg' labels are printed.
    """
    # Divide the dataset into train and validation sets.
    posCutoff = int(math.floor(len(posfeats) * 7 / 10))
    negCutoff = int(math.floor(len(negfeats) * 7 / 10))
    trainFeatures = posfeats[:posCutoff] + negfeats[:negCutoff]
    testFeatures = posfeats[posCutoff:] + negfeats[negCutoff:]

    if index == 0:
        # Maximum Entropy
        classifier = nltk.classify.maxent.MaxentClassifier.train(
            trainFeatures, 'GIS', trace=3, encoding=None, labels=None,
            gaussian_prior_sigma=0, max_iter=5)
    elif index == 1:
        # Bernoulli Naive Bayes
        classifier = SklearnClassifier(BernoulliNB())
        classifier.train(trainFeatures)
    else:
        # Logistic Regression
        classifier = SklearnClassifier(LogisticRegression())
        classifier.train(trainFeatures)

    # Instance indices per gold label and per predicted label.
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    # Each print takes one pre-formatted string, so the output is the same on
    # Python 2 and Python 3 ('%s' renders values exactly as ``print`` would).
    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision: %s' % nltk.metrics.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall: %s' % nltk.metrics.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision: %s' % nltk.metrics.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall: %s' % nltk.metrics.recall(referenceSets['neg'], testSets['neg']))
    return classifier
def main():
    """Train Multinomial Naive Bayes on tweet features and print sample results.

    ``features[9500:]`` is the training slice and ``features[:1406]`` the
    test slice of ``load_features()``.  The module-level ``word_features`` is
    rebuilt from the training slice because ``extract_features`` reads it.
    Four sample tweets are classified and printed, followed by the accuracy
    on the test slice as a percentage.
    """
    global word_features

    features = load_features()
    train_set = features[9500:]
    test_set = features[:1406]

    sample_tweets = (
        "Just started using @zoho email client on #ios and must admit that it's much better than @gmail from @Google.Better #UI, #UX and faster sync",
        "What the hell, @firefox and @Apple? Implement damn date\/time inputs. Chrome has supported for 5 years, Opera for 8.\u2026 https:\/\/t.co\/ZiyAQH8sBt",
        "Lovely @google celebration of Iraqi architect Zaha Hadid today https:\/\/t.co\/FrsJUt3RF5 via @\/google.com\/doodles",
        "#Apple pay usage peaked in March 2015. Adoption rate is declining. One of the major concerns: security (despite Apple Pay being very secure)",
    )
    processed_samples = [processTweet(tweet) for tweet in sample_tweets]

    word_features = get_word_features(get_words_in_tweets(train_set))
    training_set = nltk.classify.apply_features(extract_features, train_set)
    classifier = SklearnClassifier(MultinomialNB())
    classifier.train(training_set)

    for sample in processed_samples:
        print(classifier.classify(extract_features(sample)))

    testing_set = nltk.classify.apply_features(extract_features, test_set)
    # One formatted string keeps the output identical on Python 2 and 3; the
    # two-argument form printed a tuple under Python 2.
    print("MNB_classifier accuracy percent: %s"
          % (nltk.classify.accuracy(classifier, testing_set) * 100))
class Swinger(object):
    """Sentiment classifier that loads or trains an NLTK-wrapped scikit-learn model.

    ``load`` restores a pickled feature list and classifier, or rebuilds
    both from positive/negative JSON corpora when the local pickles cannot
    be read.  ``swing`` / ``swingList`` classify a sentence or token list.
    """

    BASEDIR = os.path.dirname(__file__)
    # Estimators selectable by name in ``load``.  The instances live on the
    # class, so they are shared by every Swinger object.
    classifier_table = {
        'SVC': SVC(probability=False),
        'LinearSVC': LinearSVC(),
        'NuSVC': NuSVC(probability=False),
        'MultinomialNB': MultinomialNB(),
        'BernoulliNB': BernoulliNB(),
        'LogisticRegression': LogisticRegression()
    }

    def __init__(self):
        self.train = []
        self.test = []
        self.classifier = ''

    @staticmethod
    def _load_pickle(path):
        """Return the object pickled at ``path``, closing the file afterwards."""
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    @staticmethod
    def _load_json(path):
        """Return the parsed JSON document at ``path``, closing the file afterwards."""
        with open(path, 'r') as handle:
            return json.load(handle)

    def load(self, model, useDefault=True, pos=None, neg=None, BestFeatureVec=700):
        """Load (or build) the feature list and the classifier named ``model``.

        Args:
            model: key of ``classifier_table``; also the pickle file prefix.
            useDefault: load the pickles stored next to this module when
                true, otherwise look in the working directory and rebuild
                on failure.
            pos, neg: paths of JSON corpora used when rebuilding.
            BestFeatureVec: feature-vector size; part of the pickle names.
        """
        BestFeatureVec = int(BestFeatureVec)
        features_file = 'bestMainFeatures.pickle.{}'.format(BestFeatureVec)
        model_file = '{}.pickle.{}'.format(model, BestFeatureVec)

        if useDefault:
            print('load default bestMainFeatures')
            self.bestMainFeatures = self._load_pickle(
                os.path.join(self.BASEDIR, features_file))
            print('load default bestMainFeatures success!!')
            self.classifier = self._load_pickle(
                os.path.join(self.BASEDIR, model_file))
            print("load model from {}".format(model))
            return

        try:
            print('load local bestMainFeatures')
            self.bestMainFeatures = self._load_pickle(features_file)
            print('load local bestMainFeatures success!!')
            self.classifier = self._load_pickle(model_file)
            print("load model from {}".format(model))
        except Exception:
            # Any failure to load the local pickles triggers a full rebuild.
            print(
                'load bestMainFeatures failed!!\nstart creating bestMainFeatures ...'
            )
            self._rebuild(model, pos, neg, BestFeatureVec, model_file)

    def _rebuild(self, model, pos, neg, BestFeatureVec, model_file):
        """Build features and train ``model`` from the JSON corpora, then pickle it."""
        self.pos_origin = self._load_json(pos)
        self.neg_origin = self._load_json(neg)
        shuffle(self.pos_origin)
        shuffle(self.neg_origin)
        poslen = len(self.pos_origin)
        neglen = len(self.neg_origin)

        # Build train and test data: 90% / 10% of each polarity.
        self.pos_review = self.pos_origin[:int(poslen * 0.9)]
        self.pos_test = self.pos_origin[int(poslen * 0.9):]
        self.neg_review = self.neg_origin[:int(neglen * 0.9)]
        self.neg_test = self.neg_origin[int(neglen * 0.9):]

        # Use words and bigram collocations as features.
        self.bestMainFeatures = create_Mainfeatures(
            pos_data=self.pos_review,
            neg_data=self.neg_review,
            BestFeatureVec=BestFeatureVec)

        # Build the model through NLTK's scikit-learn interface.
        print('start building {} model!!!'.format(model))
        self.classifier = SklearnClassifier(self.classifier_table[model])
        if len(self.train) == 0:
            print('build training data')
            posFeatures = self.emotion_features(
                self.best_Mainfeatures, self.pos_review, 'pos')
            negFeatures = self.emotion_features(
                self.best_Mainfeatures, self.neg_review, 'neg')
            self.train = posFeatures + negFeatures
        self.classifier.train(self.train)  # train the classifier
        with open(model_file, 'wb') as handle:
            pickle.dump(self.classifier, handle)

    def buildTestData(self, pos_test, neg_test):
        """Return labelled feature pairs built from two JSON test corpora."""
        pos_data = self._load_json(pos_test)
        neg_data = self._load_json(neg_test)
        posFeatures = self.emotion_features(self.best_Mainfeatures, pos_data, 'pos')
        negFeatures = self.emotion_features(self.best_Mainfeatures, neg_data, 'neg')
        return posFeatures + negFeatures

    def best_Mainfeatures(self, word_list):
        """Return ``{word: True}`` for the words of ``word_list`` in the feature list."""
        return {
            word: True
            for word in word_list if word in self.bestMainFeatures
        }

    def score(self, pos_test, neg_test):
        """Print and return the ROC AUC of the classifier on the test corpora."""
        from sklearn.metrics import roc_curve
        from sklearn.metrics import auc

        # Build the test data set once and reuse it on later calls.
        if len(self.test) == 0:
            self.test = self.buildTestData(pos_test, neg_test)

        test, test_tag = zip(*self.test)
        # Classify the test data and map labels to 1 ('pos') / 0 (other).
        pred = [
            1 if label == 'pos' else 0
            for label in self.classifier.classify_many(test)
        ]
        tag = [1 if label == 'pos' else 0 for label in test_tag]

        # ROC AUC
        fpr, tpr, _ = roc_curve(tag, pred, pos_label=1)
        area = auc(fpr, tpr)
        print("ROC AUC: %.2f" % area)
        return area

    def emotion_features(self, feature_extraction_method, data, emo):
        """Return ``[features, emo]`` pairs, labelling every item of ``data`` with ``emo``."""
        return [[feature_extraction_method(item), emo] for item in data]

    def swing(self, sentence):
        """Classify a raw sentence after cutting it and removing stop words."""
        sentence = self.best_Mainfeatures(CutAndrmStopWords(sentence))
        return self.classifier.classify(sentence)

    def swingList(self, sentenceList):
        """Classify an already tokenized sentence."""
        sentence = self.best_Mainfeatures(sentenceList)
        return self.classifier.classify(sentence)
def SingleFold(train_group, k=8):
    """Do a single fold of different classifiers

    For classifiers, I've written my own NaiveBayes Classifier and I also
    considered several available classifiers in nltk and sklearn like
    ['Maximum Entropy', 'DecisionTree', 'BernoulliNB', 'LogisticRegression',
    'SVC', 'LinearSVC', 'NuSVC']. I want to compare performances of these
    classifiers and output their accuracy, precision, recall, F1.

    Args:
        train_group: The original training set contains all the news related
            with the stock and its label. For example:
            ([[title1],[content1],[title2],[content2],...],'+1')
            The list is shuffled in place.
        k: Title's weight

    Returns:
        It doesn't return things, instead it prints the result and pickles
        every trained classifier to './<name>.pkl'. For each classifier,
        for example:
        ---------------------------------------
        SINGLE FOLD RESULT (NaiveBayes)
        ---------------------------------------
        accuracy: 0.6479463537300922
        precision 0.6505853139411139
        recall 0.965771458662454
        f-measure 0.7774480712166171

    A metric whose denominator is zero (empty test set, no positive labels
    or no positive predictions) is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # Guarded true division; 0.0 when the ratio is undefined.
        return float(numerator) / denominator if denominator else 0.0

    print('Preparing...')
    random.shuffle(train_group)
    cutoff = int(math.floor(len(train_group) * 3 / 4))
    # NOTE(review): the last quarter is passed first and the first three
    # quarters second; confirm this matches PrepareSets' parameter order.
    train_set, test_set = PrepareSets(train_group[cutoff:],
                                      train_group[:cutoff], k)

    # NOTE(review): 'SVC' has always been trained with LinearSVC here, which
    # duplicates the 'LinearSVC' entry.  Kept unchanged; confirm whether
    # sklearn.svm.SVC was intended.
    sklearn_estimators = {
        'BernoulliNB': BernoulliNB,
        'LogisticRegression': LogisticRegression,
        'SVC': LinearSVC,
        'LinearSVC': LinearSVC,
        'NuSVC': NuSVC,
    }
    classifier_list = [
        'NaiveBayes', 'BernoulliNB', 'LogisticRegression', 'SVC', 'LinearSVC',
        'NuSVC'
    ]

    for cl in classifier_list:
        if cl == 'NaiveBayes':
            print('Training...')
            classifier = nltk.NaiveBayesClassifier.train(train_set)
        else:
            classifier = SklearnClassifier(sklearn_estimators[cl]())
            print('Training...')
            classifier.train(train_set)

        print('Testing...')
        TP = 0
        FN = 0
        FP = 0
        TN = 0
        for feats, label in test_set:
            observed = classifier.classify(feats)
            if label == '+1' and observed == '+1':
                TP += 1
            elif label == '-1' and observed == '+1':
                FP += 1
            elif label == '+1' and observed == '-1':
                FN += 1
            elif label == '-1' and observed == '-1':
                TN += 1

        accuracy = _ratio(TP + TN, len(test_set))
        recall = _ratio(TP, TP + FN)
        precision = _ratio(TP, TP + FP)
        F1 = _ratio(2 * precision * recall, precision + recall)

        # ``with`` closes the pickle file; it used to be left open.
        with open('./' + cl + '.pkl', 'wb') as model_file:
            pickle.dump(classifier, model_file)

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + cl + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
        print('precision', precision)
        print('recall', recall)
        print('f-measure', F1)
# Train the classifier, then vote on the sentiment of ``sentence`` word by
# word: each token is classified on its own and 'pos' / 'neg' results are
# counted.
classifier.train(train_set)

neg = 0
pos = 0
sentence = sentence.lower()
print(sentence)
words = nltk.word_tokenize(sentence)
print(words)
for word in words:
    feats = word_feats(word)
    classResult = classifier.classify(feats)
    print(feats)
    print(classResult)
    if classResult == 'neg':
        neg = neg + 1
    if classResult == 'pos':
        pos = pos + 1

# An empty token list used to raise ZeroDivisionError; report 0.0 instead.
word_count = len(words)
pos_fraction = float(pos) / word_count if word_count else 0.0
neg_fraction = float(neg) / word_count if word_count else 0.0
print('Positive: ' + str(pos_fraction))
print('Negative: ' + str(neg_fraction))
print('Pos', pos)
print('Neg', neg)
def linear_support_vector_machines_cross_language_tf_idf(corpus_training, corpus_test, documents_training, documents_test, words_features, kbest): """ Cross Language linear Support Vector Machines algorithm. The Support Vector Machines algorithm with a linear kernel. An implementation of linear SVM to conduct cross-language experiments. :param corpus_training: :param corpus_test: :param documents_training: :param documents_test: :param words_features: :return: """ print print "----- Cross-Language Support Vector Machines algorithm------" print "Creating Training Vectors..." categories = util_classify.get_categories(corpus_training) ids_documents_test = [] original_cats = [] array_cats_names = [] array_features_training = [] array_vector_training = [] array_categories = [] for (id, original_category, annotations) in documents_training: array_features_training.append((util_classify.transform_document_in_dict(annotations, words_features, corpus_training), original_category)) array_categories.append(util_classify.get_categories(corpus_training).index(original_category)) for x in array_categories: array_cats_names.append(categories[x]) print "Training algorithm..." if kbest == 0: kbest = "all" pipeline = Pipeline([('chi2', SelectKBest(chi2, k=kbest)), ('tfidf', TfidfTransformer()), ('svc', LinearSVC())]) classifier = SklearnClassifier(pipeline) classifier.train(array_features_training) print "Calculating metrics..." 
estimated_categories = [] original_categories = [] categories = util_classify.get_categories(corpus_test) for (id, cat_original, annotations) in documents_test: cat_estimated = classifier.classify((util_classify.transform_document_in_dict(annotations, words_features, corpus_test))) estimated_categories.append(categories.index(cat_estimated)) original_categories.append(categories.index(cat_original)) ''' categories_names = util_classify.get_categories(corpus_test) array_cats_names = [] for x in estimated_categories: array_cats_names.append(categories_names[x]) # Storage process predicted categories in DB util_classify.set_database_session(corpus_test) for document in Session.query(Document): if document.id in ids_documents_test: pos = ids_documents_test.index(document.id) document.classified_in_category = array_cats_names[pos] Session.commit() # End storage process predicted categories in DB ''' return original_categories, estimated_categories
# Evaluate the already-trained `classifier` on the held-out tweets.
testing_set = nltk.classify.apply_features(extract_features, test_tweets)
for tweet, sentiment in test_tweets:
    print(classifier.classify(extract_features(tweet)))
print(nltk.classify.accuracy(classifier, testing_set))
classifier.show_most_informative_features(5)

# Same evaluation for a scikit-learn pipeline: TF-IDF weighting, chi-squared
# selection (keeping every feature) and a multinomial naive Bayes model.
tfidf_step = ('tfidf', TfidfTransformer())
chi2_step = ('chi2', SelectKBest(chi2, k='all'))
nb_step = ('nb', MultinomialNB())
pipeline = Pipeline([tfidf_step, chi2_step, nb_step])

classif = SklearnClassifier(pipeline)
classif.train(training_set)
print(classif.labels())
for tweet, sentiment in test_tweets:
    print(classif.classify(extract_features(tweet)))
print(nltk.classify.accuracy(classif, testing_set))
class YoutubeVideoClassifier(Utility):
    """ Use the collected data as training set and classify test data

    Three line-oriented files (movies, actors, TV shows) are turned into
    bag-of-words training examples labelled "movie", "celebrity" and
    "tvshow". A Naive Bayes and a linear SVM classifier are trained on them,
    and one predicted label per test instance is written to each output file.

    NOTE(review): self.config, self.output_dir, self.movies_file,
    self.actors_file, self.tvshows_file and self.test_file are not set in
    this class; presumably Utility.__init__ provides them.
    """

    def __init__(self):
        Utility.__init__(self)
        self.nb_output_file_name = self.config.get("GLOBAL", "nb_output_file")
        self.svm_output_file_name = self.config.get("GLOBAL", "svm_output_file")
        self.nb_output = os.path.join(self.output_dir, self.nb_output_file_name)
        self.svm_output = os.path.join(self.output_dir, self.svm_output_file_name)
        self.train_features = []
        self.stopwords_set = set(stopwords.words("english"))

    def run_main(self):
        """Run the whole pipeline: load, extract features, train, test."""
        self.pre_processing()
        self.feature_extraction()
        self.classification()
        self.testing()

    def pre_processing(self):
        self.load_data()

    def load_data(self):
        """Load the three training lists and the JSON test data."""
        self.load_movies()
        self.load_actors()
        self.load_tvshows()
        self.load_test_data()

    def _read_lines(self, path):
        """Return the non-empty lines of *path*; the file is closed even on error."""
        with codecs.open(path) as source:
            return [line for line in source.readlines() if line]

    def load_movies(self):
        self.movies_list = self._read_lines(self.movies_file)

    def load_actors(self):
        self.actors_list = self._read_lines(self.actors_file)

    def load_tvshows(self):
        self.tvshows_list = self._read_lines(self.tvshows_file)

    def load_test_data(self):
        """Parse the JSON test file into self.test_data."""
        with open(self.test_file) as json_data:
            self.test_data = json.load(json_data)

    def feature_selection(self, features_list):
        """Map every usable token to True.

        Tokens are stripped and lower-cased *before* the stop-word test. The
        test used to run on the raw token, so the last word of a line (which
        still carries its newline) slipped past the stop-word filter.

        :param features_list: iterable of raw string tokens
        :return: dict {normalised token: True}
        """
        selected_features = {}
        for feat in features_list:
            if not feat:
                continue
            token = feat.strip().lower()
            if token and token not in self.stopwords_set:
                selected_features[token] = True
        return selected_features

    def feature_extraction(self):
        """Append one (features, label) training example per loaded line."""
        labelled_sources = (
            (self.tvshows_list, "tvshow"),
            (self.movies_list, "movie"),
            (self.actors_list, "celebrity"),
        )
        for items, label in labelled_sources:
            for item in items:
                if not item:
                    continue
                tokens = item.replace("_", " ").split(" ")
                self.train_features.append((self.feature_selection(tokens), label))

    def classification(self):
        """Train both classifiers on self.train_features."""
        # Training NB Classifier
        self.nb_classifier = NaiveBayesClassifier.train(self.train_features)

        # Training SVM classifier
        self.svm_classifier = SklearnClassifier(LinearSVC())
        self.svm_classifier.train(self.train_features)

    def testing(self):
        """Write one predicted label per test instance to each output file.

        Instances that cannot be processed are logged and skipped.
        """
        with codecs.open(self.nb_output, "w", "utf-8") as nb_fd, \
                codecs.open(self.svm_output, "w", "utf-8") as svm_fd:
            for instance in self.test_data:
                try:
                    if not instance:
                        continue
                    test_features = instance.get("title").split(" ")
                    test_features.extend(instance.get("description").split(" "))
                    selected_features = self.feature_selection(test_features)
                    nb_label = self.nb_classifier.classify(selected_features)
                    svm_label = self.svm_classifier.classify(selected_features)
                except Exception:
                    # Best effort per instance, but no longer swallowing
                    # KeyboardInterrupt / SystemExit.
                    logging.info("Exception in test data ")
                    continue
                # Both labels are computed before anything is written, so a
                # failing instance cannot leave the two files misaligned.
                nb_fd.write("%s\n" % (nb_label))
                svm_fd.write("%s\n" % (svm_label))
neg_train, neg_test = neg_tweets[:negcutoff], neg_tweets[negcutoff:] neg_feats_train = get_train_features_from_tweets(neg_train, 'neg') pos_feats_train = get_train_features_from_tweets(pos_train, 'pos') train_feats = neg_feats_train + pos_feats_train svm_classifier = SklearnClassifier(LinearSVC()) svm_classifier.train(train_feats) # Evaluation correct, wrong = 0, 0 for tweet in neg_test: features = get_features_from_tweet(tweet) result = svm_classifier.classify(features) if result == "neg": correct += 1 else: wrong += 1 for tweet in pos_test: features = get_features_from_tweet(tweet) result = svm_classifier.classify(features) if result == "pos": correct += 1 else: wrong += 1 print "Accuracy: {}".format(correct / float(correct + wrong))
def _count_confusion(classifier, test_set):
    """Classify every (features, label) pair; return (tp, fp, tn, fn) with 1 as the positive class."""
    tp = fp = tn = fn = 0
    for each in test_set:
        predicted = int(classifier.classify(each[0]))
        actual = int(each[1])
        if actual == predicted:
            if actual == 1:
                tp += 1
            elif actual == 0:
                tn += 1
        elif actual == 1:
            fn += 1
        elif actual == 0:
            fp += 1
    return tp, fp, tn, fn


def _summarize_confusion(tp, fp, tn, fn):
    """Return (precision, recall, F-measure, accuracy); a ratio with a zero denominator is 0.0."""
    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
    if precision and recall:
        f_measure = 2.0 / ((1 / precision) + (1 / recall))
    else:
        f_measure = 0.0
    total = tp + fp + tn + fn
    accuracy = float(tp + tn) / total if total else 0.0
    return precision, recall, f_measure, accuracy


def AccuracyByClassifier(classifier_model, pos_wordlist, neg_wordlist, mode='normal', best_topwords=None):
    """Train the chosen classifier and measure it on held-out data.

    :param classifier_model: 'svm', 'mb', 'bb', 'dt' or 'nn' (all-lower or
        all-upper case); anything else selects logistic regression.
    :param pos_wordlist: list of (features, label) pairs of the positive class.
    :param neg_wordlist: list of (features, label) pairs of the negative class.
        Labels must convert with int(); 1 is positive, 0 negative.
        In 'normal' mode both lists are shuffled in place.
    :param mode: 'normal' or ('normal',) for a 70/30 split, or
        ('k-cross', k) for k-fold cross-validation.
    :param best_topwords: word list handed to Network; only used for 'nn'.
    :return: 'normal': (precision, recall, F_measure, accuracy, classifier);
        'k-cross': the four metrics averaged over the folds;
        any other mode: (0, 0, 0, 0, 0).

    Fixes compared with the previous version: the default mode='normal' is
    recognised (it used to fall through to the all-zero result because only
    mode[0] was compared); each fold is scored on its own predictions instead
    of on counts accumulated over all earlier folds; metrics whose
    denominator is zero are 0.0 instead of raising ZeroDivisionError.
    """
    if best_topwords is None:
        best_topwords = []

    is_network = False
    # Pick the model to classify with
    if classifier_model in ('svm', 'SVM'):
        # SVM with a linear kernel
        classifier_model = LinearSVC()
    elif classifier_model in ('mb', 'MB'):
        # Multinomial naive Bayes
        classifier_model = MultinomialNB()
    elif classifier_model in ('bb', 'BB'):
        # Bernoulli naive Bayes
        classifier_model = BernoulliNB()
    elif classifier_model in ('dt', 'DT'):
        # Decision tree
        classifier_model = DecisionTreeClassifier(criterion='entropy')
    # elif classifier_model in ('gbdt', 'GBDT'):
    #     # Gradient boosting decision tree (GBDT)
    #     classifier_model = GradientBoostingClassifier()
    elif classifier_model in ('nn', 'NN'):
        # Neural network
        is_network = True
        # [len(best_topwords), 30, 2], 30, 5, 0.3 -> 82.6% accuracy
        # Network arguments: neurons per layer, number of iterations,
        # size of each data slice, learning rate
        classifier_model = Network([len(best_topwords), 30, 2], 30, 10, 0.3, best_topwords)
    else:
        # Logistic regression by default
        classifier_model = LogisticRegression()
    classifier = SklearnClassifier(classifier_model) if not is_network else classifier_model

    # Accept the bare mode name as well as the (name, arg...) sequence form.
    if isinstance(mode, (list, tuple)):
        mode_name = mode[0] if mode else None
        mode_args = mode[1:]
    else:
        mode_name, mode_args = mode, ()

    if mode_name == 'k-cross' and len(mode_args) == 1:
        knum = int(mode_args[0])
        all_wordlist = pos_wordlist + neg_wordlist
        shuffle(all_wordlist)
        piece_len = len(all_wordlist) // knum
        precision = recall = F_measure = accuracy = 0.0
        for i in range(knum):
            fold_start = piece_len * i
            fold_end = piece_len * (i + 1)
            train_set = all_wordlist[:fold_start] + all_wordlist[fold_end:]
            test_set = all_wordlist[fold_start:fold_end]
            classifier.train(train_set)
            fold = _summarize_confusion(*_count_confusion(classifier, test_set))
            precision += fold[0]
            recall += fold[1]
            F_measure += fold[2]
            accuracy += fold[3]
        # DrawPrecisionRecallCurve(real, pred)
        return (precision/knum, recall/knum, F_measure/knum, accuracy/knum)
    elif mode_name == 'normal':
        # Split into training and test sets
        pos_len = len(pos_wordlist)
        neg_len = len(neg_wordlist)
        # Shuffle the data sets
        shuffle(pos_wordlist)
        shuffle(neg_wordlist)
        pos_cut = int(0.7*pos_len)
        neg_cut = int(0.7*neg_len)
        train_set = pos_wordlist[:pos_cut] + neg_wordlist[:neg_cut]
        test_set = pos_wordlist[pos_cut:] + neg_wordlist[neg_cut:]
        classifier.train(train_set)
        precision, recall, F_measure, accuracy = _summarize_confusion(
            *_count_confusion(classifier, test_set))
        # DrawPrecisionRecallCurve(real, pred)
        return (precision, recall, F_measure, accuracy, classifier)
    else:
        return (0, 0, 0, 0, 0)
print "creating feature sets..." tweetlist = tweetTest.loadTwitterCSV('trainingandtestdata/testdata.csv') labeld_features = label_feats_from_tweets(tweetlist) #labeld_features = label_feats_from_corpus(movie_reviews) training_set, test_set = split_label_feats(labeld_features) # tweetlist = tweetTest.loadTwitterCSV('trainingandtestdata/training.1600000.processed.noemoticon.csv') # training_set = label_feats_from_tweets(tweetlist) # training_set, garbage = split_label_feats(training_set, 1.0) # test_set, garbage = split_label_feats(labeld_features, 1.0) print "training set length: %i test set length: %i" % (len(training_set), len(test_set)) print prettifyFeatureSet(test_set) print "training classifier..." #classifier = NaiveBayesClassifier.train(training_set) #classifier = MaxentClassifier.train(training_set, algorithm='iis', max_iter=99, min_lldelta=0.01) #classifier = MaxentClassifier.train(training_set) classifier = SklearnClassifier(LogisticRegression()).train(training_set) print "calculating accuracy..." print 'accuracy:', nltk.classify.util.accuracy(classifier, test_set) #classifier.show_most_informative_features(30) negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous']) print classifier.classify(negfeat) probdist = classifier.prob_classify(negfeat) print "pos: ", probdist.prob('pos'), " neg: ", probdist.prob('neg') print classifier.labels() classify_tweet(classifier, "I love this movie!", True) classify_tweet(classifier, "!!!", True)
class TestCorpus(): # static variables common to all instances feature_words = stopwords.words('english') feature_types = {'BOOLEAN':0, 'FREQUENCY':1, 'FREQUENCY_NORMALIZED':2} # default feature type is FREQUENCY_NORMALIZED feature_type = 2 classifier_types = {'NAIVE_BAYES':0, 'SVM_LINEAR':1, 'SVM_POLY':2} # default classifier is polynomial SVM classifier_type = 1 # boolean values (feature word occurs or does not occur in text) @classmethod def features_boolean(cls, text, features=[]): if not features: features = cls.feature_words return dict((word, int(word in text)) for word in features) # frequency values, normalized (how many times a feature word occurs in text, normalized by text length) @classmethod def features_frequency_normalized(cls, text, features=[]): if not features: features = cls.feature_words # multiply normalized frequency count by 1000 to avoid very small numbers return dict((word, 1000.0*text.count(word)/float(len(text))) for word in features) # frequency values, not normalized @classmethod def features_frequency(cls, text, features=[]): if not features: features = cls.feature_words return dict((word, text.count(word)) for word in features) # takes as input lists of (text, label) pairs for each class (1/2), for training/testing def __init__(self, train_set_class1, train_set_class2, test_set_class1, test_set_class2): self.train_set_class1 = train_set_class1 self.train_set_class2 = train_set_class2 self.test_set_class1 = test_set_class1 self.test_set_class2 = test_set_class2 self.train_set = self.train_set_class1 + self.train_set_class2 self.test_set = self.test_set_class1 + self.test_set_class2 # use default feature type to compute list of features and labels for training set and test set self.train_feature_set = [(self.features_frequency_normalized(word_tokenize(text)), label) for (text,label) in self.train_set] self.test_feature_set = [(self.features_frequency_normalized(word_tokenize(text)), label) for (text,label) in self.test_set] # custom 
feature_sets initialized with defaut self.train_feature_set_custom = self.train_feature_set self.test_feature_set_custom = self.test_feature_set self.feature_words_custom = self.feature_words # recompute featuresets with current parameters def compute_featuresets(self): if (self.feature_type == 0): self.train_feature_set_custom = [(self.features_boolean(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set] self.test_feature_set_custom = [(self.features_boolean(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.test_set] if (self.feature_type == 1): self.train_feature_set_custom = [(self.features_frequency(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set] self.test_feature_set_custom = [(self.features_frequency(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.test_set] if (self.feature_type == 2): self.train_feature_set_custom = [(self.features_frequency_normalized(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set] self.test_feature_set_custom = [(self.features_frequency_normalized(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.test_set] # compute featuresets with current parameters separately for each (training) class def compute_class_featuresets(self): if (self.feature_type == 0): self.train_feature_set_class1_custom = [(self.features_boolean(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set_class1] self.train_feature_set_class2_custom = [(self.features_boolean(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set_class2] if (self.feature_type == 1): self.train_feature_set_class1_custom = [(self.features_frequency(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set_class1] self.train_feature_set_class2_custom = 
[(self.features_frequency(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set_class2] if (self.feature_type == 2): self.train_feature_set_class1_custom = [(self.features_frequency_normalized(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set_class1] self.train_feature_set_class2_custom = [(self.features_frequency_normalized(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set_class2] # mean frequency of feature words in texts def mean_features(self, featureset=[]): mean_features = {} if not featureset: featureset = self.train_feature_set + self.test_feature_set nr_ex = len(featureset) for stopword in featureset[0][0]: mean_features[stopword] = 0 for train_example in featureset: mean_features[stopword] += train_example[0][stopword]/float(nr_ex) return mean_features # top feature words occuring in whole corpus def top_features(self, how_many=127): features = self.mean_features() # return same format dictionary only with top occuring features (according to mean_features result) return dict(sorted(features.iteritems(), key=operator.itemgetter(1), reverse = True)[:how_many]) def set_nr_features(self, nr_features): top_swords = self.top_features(nr_features) self.feature_words_custom = top_swords.keys() # recompute featuresets for new feature vector self.compute_featuresets() print 'Nr of features: ', len(self.feature_words_custom) def set_feature_type(self, feature_type): self.feature_type = feature_type self.compute_featuresets() print 'Feature type: ', [name for name, value in self.feature_types.iteritems() if value==feature_type][0] def train_classifier(self, trainset=[], svm_param=1.0): # default train set is class field (all train files) if not trainset: trainset = self.train_feature_set_custom if (self.classifier_type == 0): self.classifier = SklearnClassifier(MultinomialNB()) print "Training Naive Bayes classifier..." 
if (self.classifier_type == 1): self.classifier = SklearnClassifier(LinearSVC(penalty='l2', loss='l2', dual=False, C=svm_param, class_weight='auto')) print "Training Linear SVM classifier..." if (self.classifier_type == 2): self.classifier = SklearnClassifier(SVC(kernel='poly', C=svm_param, class_weight='auto')) print "Training Polynomial SVM classifier..." self.classifier.train(self.train_feature_set_custom) def testall_accuracy(self, testset=[]): # default test set is class field (all test files) if not testset: testset = self.test_feature_set_custom print 'Measuring classifier performance...' acc = accuracy(self.classifier, self.test_feature_set_custom) print 'Overall accuracy:', acc return acc def results_per_file(self, filenames=[]): # if no filenames are given as parameters just use numbers from 1 to nr_of_files if not filenames: filenames = range(len(self.test_feature_set_custom) + 1)[1:] print 'Results per file:' findex = 0 # first index - element to be tested # second index - 0 = index of feature dictionary for text in self.test_feature_set_custom: predicted_label = self.classifier.classify(text[0]) actual_label = text[1] print filenames[findex], predicted_label, predicted_label == actual_label findex += 1 def classify_this(self, text): return self.classifier.classify(text) # TODO: easier computation of top_features and set_feature_nr. \ # maybe just get first elements of sorted featuresets, not compute them again everytime def leave_one_out(self, feature_type=2, classifier_type=1, C=1.0, nr_features=127): print '\nCross-validating with leave-one-out...' 
# set parameters # if (nr_features != 127): # self.set_nr_features(nr_features) # if (feature_type !=2): # self.set_feature_type(feature_type) # faster: don't recompute featuresets everytime: self.feature_type = feature_type if (nr_features != 127): top_swords = self.top_features(nr_features) self.feature_words_custom = top_swords.keys() if (nr_features != 127 or feature_type != 2): self.compute_featuresets() print '\nNr features: ', nr_features print 'Feature type: ', \ [name for name, value in self.feature_types.iteritems() if value==feature_type][0], '(%d)'%feature_type, '\n' self.classifier_type = classifier_type # cross-validate nrcorrect = 0 total = len(self.train_feature_set_custom) for i in range (total): trainset = self.train_feature_set_custom[:i] + self.train_feature_set_custom[i+1:] self.train_classifier(trainset=trainset, svm_param=C) label = self.classify_this(self.train_feature_set_custom[i][0]) print 'Testing on: file', i+1 print 'actual: ', self.train_feature_set_custom[i][1] print 'predicted: ', label print "--------------------------------" if (label== self.train_feature_set_custom[i][1]): nrcorrect += 1 print 'Correctly classified: ', nrcorrect, '/', total, '\n' return float(nrcorrect)/total # cross-validate results with leave-one-out for different parameters def cross_validate(self, validate_type=0): # validate_type = # 0: nr of features # 1: feature_type # 2: classifier_type # 3: classifier_parameter if (validate_type==0): # cross-validate for nr of features: # [stopwords, accuracies] = self.nrstopwords_experiment(True) # results = dict((stopwords[i], accuracies[i]) for i in range(len(stopwords))) nr_stopwords = range(1,100,10) results = dict((nr,0) for nr in nr_stopwords) for nr in nr_stopwords: acc = self.leave_one_out(nr_features=nr) results[nr] = acc # TODO: strange results for this? 
too accurate; weak methods too successful if (validate_type==1): # cross-validate for feature type results = dict((feat,0) for feat in self.feature_types) for feat in self.feature_types: acc = self.leave_one_out(feature_type=self.feature_types[feat]) results[feat] = acc if (validate_type==2): # cross-validate for classifier type results = dict((cl, 0) for cl in self.classifier_types) for cl in self.classifier_types: acc = self.leave_one_out(classifier_type=self.classifier_types[cl]) results[cl] = acc if (validate_type==3): # cross-validate for classifier parameter Cs = [10**(-10), 10**(-5), 10**(-3), 10**(-1), 1.0, 1.5, 10, 100, 1000, 10**5, 10**10] results = dict((C, 0) for C in Cs) for C in Cs: acc = self.leave_one_out(C=C) results[C] = acc return results # accuracy vs number of stopwords used def nrstopwords_experiment(self, validate=False): #TODO: not sure about these results, maybe test some more accuracies = [] for nr_stopwords in range(1,10): if (validate): acc = self.leave_one_out(nr_features=nr_stopwords) else: self.set_nr_features(nr_stopwords) self.train_classifier() acc = self.testall_accuracy() accuracies.append(acc) print nr_stopwords, acc for nr_stopwords in range(10,40,5): if (validate): acc = self.leave_one_out(nr_features=nr_stopwords) else: self.set_nr_features(nr_stopwords) self.train_classifier() acc = self.testall_accuracy() accuracies.append(acc) print nr_stopwords, acc for nr_stopwords in range(40,127,25): if (validate): acc = self.leave_one_out(nr_features=nr_stopwords) else: self.set_nr_features(nr_stopwords) self.train_classifier() acc = self.testall_accuracy() accuracies.append(acc) print nr_stopwords, acc stopwords = range(1,10) + range(10,40,5) + range(40,127,25) return [stopwords, accuracies] def plot_stopwords_vs_accuracy(self, validate=False): [stopwords, accuracies] = self.nrstopwords_experiment(validate) plt.plot(stopwords, accuracies, label='Circle') plt.xlabel('Nr of stopwords') plt.ylabel('Accuracy') plt.title('Performance of 
algorithm versus number of stopwords used in classification') plt.show() # save to disk #plt.savefig('stopwords_experiment2.png') def plot_featureword_distribution(self, nr_swords=25): self.compute_class_featuresets() # for test set and each class of train set Utils.bar_graph(self.mean_features(self.test_feature_set_custom), graph_title='%d stop words for test set - mean occurences'%nr_swords, output_name='test%d.png'%nr_swords) Utils.bar_graph(self.mean_features(self.train_feature_set_custom), graph_title='%d stop words for train set - mean occurences'%nr_swords, output_name='train%d.png'%nr_swords) Utils.bar_graph(self.mean_features(self.train_feature_set_class1_custom), graph_title='%(nr)d stop words for %(class)s set - mean occurences'%{'class':self.train_feature_set_class1_custom[0][1], 'nr':nr_swords}, output_name='hamilton%d.png'%nr_swords) Utils.bar_graph(self.mean_features(self.train_feature_set_class2_custom), graph_title='%(nr)d stop words for %(class)s set - mean occurences'%{'class':self.train_feature_set_class2_custom[0][1], 'nr':nr_swords}, output_name='madison%d.png'%nr_swords)
neg_train, neg_test = neg_tweets[:negcutoff], neg_tweets[negcutoff:] neg_feats_train = get_train_features_from_tweets(neg_train, 'neg') pos_feats_train = get_train_features_from_tweets(pos_train, 'pos') train_feats = neg_feats_train + pos_feats_train svm_classifier = SklearnClassifier(LinearSVC()) svm_classifier.train(train_feats) # Evaluation correct, wrong = 0, 0 for tweet in neg_test: features = get_features_from_tweet(tweet) result = svm_classifier.classify(features) if result == "neg": correct += 1 else: wrong += 1 for tweet in pos_test: features = get_features_from_tweet(tweet) result = svm_classifier.classify(features) if result == "pos": correct += 1 else: wrong += 1 print "Accuracy: {}".format(correct / float(correct + wrong))
""" This is a demo of the Scikit-learn Classifier from the NLTK package using the movie reviews corpus """
from nltk.corpus import movie_reviews
from featx import *
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from nltk.classify.util import accuracy
from nltk import word_tokenize

# Labelled (neg/pos) feature sets for every review, split 75% / 25% into
# training and test data (both helpers come from featx.py).
lfeats = label_feats_from_corpus(movie_reviews)
train_feats, test_feats = split_label_feats(lfeats, split=0.75)

# Wrap a logistic-regression model for nltk and train it.
sk_classifier = SklearnClassifier(LogisticRegression())
sk_classifier.train(train_feats)

print("The associated accuracy for this classfier on the data is :")
print(accuracy(sk_classifier, test_feats))

# Interactive loop: classify whatever the user types until they answer 'q'.
control = ""
while control != "q":
    text = input("Enter your fake tweet use only words: \n")
    # bag_of_words (featx.py) turns the token list into a feature dict
    test = bag_of_words(word_tokenize(text))
    print("Sentiment:")
    print(sk_classifier.classify(test))
    control = input("press aney key to continue 'q' to quit:")
# Train the pipeline, persist it, then score it on the test data and log a
# per-item and summary report.
classif = SklearnClassifier(pipeline)
classif.train(zip(trainData, trainLabels))

model_path = "nb_classifier_" + str(gram) + "gram_" + str(size) + "_large"
if not USE_CHI_SQUARE:
    model_path += "_nochi"
# Binary mode is what pickle expects, and the context manager makes sure the
# file is flushed and closed (the handle used to be left open).
with open(model_path, "wb") as cf:
    pickle.dump(classif, cf)

matches = 0
mismatches = 0
# How often each label was predicted; labels must convert to an int from 1 to 5.
scores = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
for i, expected in enumerate(testLabels):
    label = classif.classify(testData[i])
    log("test data id: " + str(i), f)
    if label == expected:
        matches += 1
        log("matched: label: " + str(label), f)
    else:
        mismatches += 1
        log("mismatched: label: " + str(label) + " was supposed to be: " + str(expected), f)
    scores[int(label)] += 1

log("summary of results for: gram: " + str(gram) + " size: " + str(size), f)
log("matches = " + str(matches), f)
log("mismatches = " + str(mismatches), f)
log("guesses = " + repr(scores), f)
log("=" * 20, f)
log("=" * 20, f)
log("=" * 20, f)
# # post.get('class') is the label of the current post # featuresets.append((dialogue_act_features(post.text),cls_set.index(post.get('class')))) # print featuresets[0] def preprocess(sentence): tokens = nltk.word_tokenize(sentence) tokens = [w for w in tokens if w not in stopwords.words("english")] features = {} for token in tokens: features[token] = tokens.count(token) return features featureset = [] sentences = [ "hello there, how are you? Are you very happy??", "Yammering on all the time, what a loser" ] for sentence in sentences: features = preprocess(sentence) featureset.append(features) cls = SklearnClassifier(LinearSVC()) featuresets = [] featuresets.append((featureset[0], "first")) featuresets.append((featureset[1], "second")) cls.train(featuresets) print cls.classify(preprocess("hello there, friends"))
class RForests(text_classifier.TextClassifier):
    """Random-forest text classifier built on NLTK's SklearnClassifier.

    Feature sets come from self.getFeatures() and self.gene_features(), which
    are not defined in this class (presumably inherited from
    text_classifier.TextClassifier - verify).
    """

    def __init__(self, trainDir, labelFile, numTrees=10, numJobs=1):
        """Store the configuration and build an untrained classifier.

        Args:
            trainDir: Stored as self.trainingDir.
            labelFile: Stored as self.labelFile.
            numTrees: Number of trees (n_estimators) in the forest.
            numJobs: n_jobs value passed to the forest.
        """
        self.classifier = None
        self.labelFile = labelFile
        self.trainingDir = trainDir
        self.labels = None
        self.all_words = None
        self.numTrees = numTrees
        self.numJobs = numJobs
        self.classifier = self._newClassifier()
        #self.labels = training.setup(labelFile)
        #self.train()

    def _newClassifier(self):
        """Return a fresh, untrained forest wrapped for NLTK (dense features)."""
        return SklearnClassifier(
            RandomForestClassifier(n_estimators=self.numTrees,
                                   n_jobs=self.numJobs),
            sparse=False)

    def train(self):
        """Train self.classifier on every feature set from getFeatures()."""
        feature_sets = self.getFeatures()
        self.classifier.train(feature_sets)

    def trainingError(self):
        """Return the accuracy of self.classifier on the training data.

        Despite the name, the value is nltk.classify.accuracy (higher is
        better), not an error rate.
        """
        feature_sets = self.getFeatures()
        p = nltk.classify.accuracy(self.classifier, feature_sets)
        return p

    def kfoldCrossValidation(self, k):
        """Estimate accuracy with k-fold cross validation.

        Each of the k folds is held out once while a fresh classifier is
        trained on the remaining folds. When len(data) is not a multiple of
        k, the trailing remainder is never held out but is always trained on.

        Args:
            k: Number of folds (must be non-zero).

        Returns:
            The mean held-out accuracy over the k folds.

        Side effects:
            self.classifier is left as the classifier of the last fold.
        """
        feature_sets = self.getFeatures()
        n = len(feature_sets) // k  # fold size; integer on Python 2 and 3
        total = 0.0
        for i in range(k):
            self.classifier = self._newClassifier()
            test_set = feature_sets[n * i:n * (i + 1)]
            train_set = feature_sets[:n * i] + feature_sets[n * (i + 1):]
            # Train on the other folds only; training on all of feature_sets
            # would leak the held-out fold into the model.
            self.classifier.train(train_set)
            total += nltk.classify.accuracy(self.classifier, test_set)
        return total / k

    def leave1OutCrossValidation(self):
        """Estimate accuracy with leave-one-out cross validation.

        Returns:
            The mean accuracy over all single-item hold-outs.

        Side effects:
            self.classifier is left as the classifier of the last round.
        """
        total = 0.0
        feature_sets = self.getFeatures()
        N = len(feature_sets)
        for i in range(N):
            self.classifier = self._newClassifier()
            train_set = feature_sets[:i] + feature_sets[i + 1:]
            test_set = [feature_sets[i]]
            # The held-out item must not be part of the training data.
            self.classifier.train(train_set)
            total += nltk.classify.accuracy(self.classifier, test_set)
        return total / N

    def learningCurve(self, numTrials=4):
        """Construct a learning curve to see if there is overfitting.

        For every training size k from 1 to len(data) - 2, trains numTrials
        classifiers on a random k-item subset and tests on the rest.

        Args:
            numTrials: Number of random splits averaged per training size.

        Returns:
            A list with the mean accuracy for each training size.

        Side effects:
            Shuffles the list returned by getFeatures() in place and prints
            one "train-size test-size accuracy" line per trial.
        """
        accuracies = []
        feature_sets = self.getFeatures()
        for k in range(1, len(feature_sets) - 1):
            total = 0.0
            for _ in range(numTrials):
                self.classifier = self._newClassifier()
                random.shuffle(feature_sets)
                train_set, test_set = feature_sets[:k], feature_sets[k:]
                self.classifier.train(train_set)
                p = nltk.classify.accuracy(self.classifier, test_set)
                # Single formatted string: same output on Python 2 and 3.
                print("%d %d %s" % (len(train_set), len(test_set), p))
                total += p
            accuracies.append(total / numTrials)
        return accuracies

    def testClassify(self, k):
        """Hold out k random items, train on the rest, classify the hold-out.

        Args:
            k: Number of items to hold out.

        Returns:
            (reference labels, predicted labels) for the k held-out items.
        """
        feature_sets = self.getFeatures()
        random.shuffle(feature_sets)
        self.classifier = self._newClassifier()
        self.classifier.train(feature_sets[k:])
        features, ref_labels = zip(*feature_sets[:k])
        # NOTE(review): batch_classify is the pre-3.0 NLTK name (NLTK 3 calls
        # it classify_many) - confirm the NLTK version in use.
        pred_labels = self.classifier.batch_classify(features)
        return ref_labels, pred_labels

    def confusionMatrix(self, ref, test):
        """Build an nltk ConfusionMatrix from two lists of (id, label) pairs.

        Both lists are sorted in place by their first element so that the
        labels line up before being compared.
        """
        ref.sort(key=lambda x: x[0])
        test.sort(key=lambda x: x[0])
        _, ref_labels = zip(*ref)
        _, test_labels = zip(*test)
        cm = ConfusionMatrix(ref_labels, test_labels)
        return cm

    def prob_classify(self, db, fastain):
        """Classify each protein of a FASTA file, with label distributions.

        The protein ID is the sixth '|'-separated field of each record id;
        its text is fetched with genbank.proteinQuery.

        Returns:
            (protein IDs, labels, probability distributions). Records with
            no text get the label 'na' and a None distribution.
        """
        proIDs, pds, labels = [], [], []
        prevFeatureset = ''
        prevText = ''
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            proteinID = toks[5]
            query_rows = genbank.proteinQuery(proteinID, db)
            ids, text = zip(*query_rows)
            text = ''.join(map(str, text))
            if text == '':
                label = ['na']
                pd = None
            else:
                text = word_reg.findall(text)
                featureset = self.gene_features(text)
                # Consecutive records are expected to differ.
                assert text != prevText
                assert featureset != prevFeatureset
                prevFeatureset = featureset
                prevText = text
                # NOTE(review): batch_classify receives a single featureset
                # here while prob_classify receives a list - one of the two
                # calls looks inverted; confirm against the NLTK version.
                label = self.classifier.batch_classify(featureset)
                pd = self.classifier.prob_classify([featureset])[0]
            proIDs.append(proteinID)
            pds.append(pd)
            labels += label
        return proIDs, labels, pds

    def classifyPickle(self, pickle, fastain):
        """Classify each locus of a FASTA file using a pickled GenBank table.

        Args:
            pickle: Path handed to genbank.GenBankTable.load.
            fastain: FASTA input; the locus tag is the sixth '|'-separated
                field of each record id.

        Returns:
            zip of (locus tag, label); loci without text are labelled 'na'.
        """
        proIDs, labels = [], []
        prevFeatureset = ''
        prevText = ''
        gbkTable = genbank.GenBankTable()
        gbkTable.load(pickle)
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            locus_tag = toks[5]
            text = gbkTable.getLocusText(locus_tag)
            if text == '':
                label = 'na'
            else:
                text = word_reg.findall(text)
                featureset = self.gene_features(text)
                #assert text!=prevText
                #assert featureset!=prevFeatureset
                prevFeatureset = featureset
                prevText = text
                label = self.classifier.classify(featureset)
                #print label,text
            proIDs.append(locus_tag)
            labels.append(label)
        return zip(proIDs, labels)

    def classifyDB(self, db, fastain):
        """Classify proteins based on their text from the sqlite3 database.

        For each FASTA record the locus tag (sixth '|'-separated field) is
        mapped to protein IDs with genbank_sqlite3.locusQuery, and the text
        of all those proteins is concatenated and classified.

        Returns:
            zip of (locus tag, label); loci with no rows or no text are
            labelled 'na'.
        """
        proIDs, labels = [], []
        prevFeatureset = ''
        prevText = ''
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            locus_tag = toks[5]
            locus_rows = genbank_sqlite3.locusQuery(locus_tag, db)
            protein_rows = []
            for row in locus_rows:
                locus, proteinID = row
                query_rows = genbank_sqlite3.proteinQuery(proteinID, db)
                protein_rows += query_rows
            #print len(protein_rows),locus_tag
            if len(protein_rows) == 0:
                label = 'na'
            else:
                ids, text = zip(*protein_rows)
                text = ''.join(map(str, text))
                if text == '':
                    label = 'na'
                else:
                    text = word_reg.findall(text)
                    featureset = self.gene_features(text)
                    #assert text!=prevText
                    #assert featureset!=prevFeatureset
                    prevFeatureset = featureset
                    prevText = text
                    label = self.classifier.classify(featureset)
                    #print label,text
            proIDs.append(locus_tag)
            labels.append(label)
        return zip(proIDs, labels)

    def classify(self, dbin, fastain, type='sqlite3'):
        """Dispatch to classifyDB (type 'sqlite3') or classifyPickle (other)."""
        if type == 'sqlite3':
            return self.classifyDB(dbin, fastain)
        else:
            return self.classifyPickle(dbin, fastain)
# 'nAnswer', 'Greet', 'Statement', 'Reject', 'Bye', 'Other']
# featuresets = [] # list of tuples of the form (post, features)
# for post in posts: # applying the feature extractor to each post
# # post.get('class') is the label of the current post
# featuresets.append((dialogue_act_features(post.text),cls_set.index(post.get('class'))))
# print featuresets[0]


def preprocess(sentence):
    """Turn a sentence into a {token: count} feature dict.

    The sentence is tokenized with nltk.word_tokenize and tokens found in the
    English stop-word list are dropped before counting.

    Args:
        sentence: The raw sentence text.

    Returns:
        A dict mapping each remaining token to its number of occurrences.
    """
    from collections import Counter

    # Load the stop-word list once per call (not once per token) and use a
    # set so each membership test is O(1).
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in nltk.word_tokenize(sentence) if w not in stop_words]
    return dict(Counter(tokens))


featureset = []
sentences = [
    "hello there, how are you? Are you very happy??",
    "Yammering on all the time, what a loser"
]
for sentence in sentences:
    features = preprocess(sentence)
    featureset.append(features)

# Train a linear SVM on the two toy sentences, one label per sentence.
cls = SklearnClassifier(LinearSVC())
featuresets = []
featuresets.append((featureset[0], "first"))
featuresets.append((featureset[1], "second"))
cls.train(featuresets)
# Single-argument call form prints the same on Python 2 and Python 3.
print(cls.classify(preprocess("hello there, friends")))
elif(x==1): str='Bernoulli Naive Baeyes' elif(x==2): str='Logistic Regression' else: str='Support Vector' print(str,'classifier accuracy :',accuracy_score(ground_truth,predictions)) print(str,'f1 score :',f1_score(ground_truth,predictions)) """So we clearly see that Logistic regression classifier and Support vector perfectly classifies our dataset But since SVM has more f1-score so we will make predictions based on SVM """ predictions=[] for r in testing_set: predictions.append(SVC_clf.classify(r[0])) print(predictions) positive=0 negative=0 for i in range(0,len(predictions)): if predictions[i]==1: positive=positive+1 else: negative=negative+1 print(positive,negative) """Sentimental Analysis""" positive_percent=positive/(positive+negative)
# Classify every comment of every car and store the per-category counts.
car_counter = 0
print("Started classification of youtube comments", datetime.utcnow())
results = {}

comments_dir = os.path.join("..", "youtube-comments", "carwow-comments")
with open(os.path.join(comments_dir, "all-comments.json"), "r",
          encoding="UTF-8") as f:
    cars = json.load(f)

# `cars` maps each car to its list of comment objects; results[car] maps each
# predicted category to the number of comments that received it.
for car_counter, (car, comments) in enumerate(cars.items(), start=1):
    tally = {}
    results[car] = tally
    for comment in comments:
        category = classifier.classify(
            comment_to_feature_set(comment["text"]))
        tally[category] = tally.get(category, 0) + 1
    print("(#" + str(car_counter) + ")", "Classification of comments for",
          car, "done")

# with open(os.path.join("..", "youtube-comments", "carwow-comments", "3-category-classification.json"), "w", encoding="UTF-8") as f:
with open(os.path.join(comments_dir, "5-category-classification.json"), "w",
          encoding="UTF-8") as f:
    json.dump(results, f, indent=2, sort_keys=True)

print("Classification of youtube comments complete", datetime.utcnow())
def main():
    """Classify every test problem and append the answers to output files.

    For each file matching "<testset>/*data", the problems are extracted,
    training data for the target word is loaded, an L2-regularised logistic
    regression is trained, and for every problem the single best answer and
    the top-five answers are appended to "<outputdir>/<word>.es.best" and
    "<outputdir>/<word>.es.oof" respectively.
    """
    args = get_argparser().parse_args()
    util.DPRINT = args.dprint
    featureset_name = os.path.basename(args.featurefn).split('.')[0]
    features.load_featurefile(args.featurefn)

    ## default is 1e-4.
    THETOL = 1e-3
    mfs = learn.MFSClassifier()
    classifier = SklearnClassifier(LogisticRegression(C=1,
                                                      penalty='l2',
                                                      tol=THETOL))
    classifier_pairs = [("MFS", mfs), ("maxent-l2-c1", classifier)]
    stamp = util.timestamp()

    def bare_word(tagged):
        """Drop the trailing ".n" marker from a target word."""
        assert tagged.endswith(".n")
        return tagged[:-2]

    def output_paths(word):
        """Return the (best, oof) output file names for a target word."""
        return (args.outputdir + "/{0}.{1}.best".format(word, "es"),
                args.outputdir + "/{0}.{1}.oof".format(word, "es"))

    for fn in glob(args.testset + "/*data"):
        problems = semeval_testset.extract_wsd_problems(fn)
        w = bare_word(problems[0][0])
        load_training_for_word(w, args.bitextfn, args.alignfn,
                               args.annotatedfn)

        # Start from clean output files; answers are appended below.
        for stale in output_paths(w):
            if os.path.exists(stale):
                os.remove(stale)

        training = None
        for problem in problems:
            w = bare_word(problem[0])
            print(problem)

            # Fetch the training data and fit the classifier once per file.
            if training is None:
                training = trainingdata.trainingdata_for(w, nonnull=True)
                print("got {0} instances for {1}".format(len(training), w))
                labels = set(label for (feat, label) in training)
                if len(training) == 0:
                    print("no samples for", w)
                    break
                if len(labels) < 2:
                    print("there's only one sense for", w, " and it is ",
                          labels)
                    break
                classifier.train(training)

            # Turn the raw problem text into features for the head word.
            rawtext = problem[2]
            surface, index = semeval_testset.head_surface_and_index(rawtext)
            replaced = re.sub(r"<head>(.*)</head>", " \\1 ", rawtext)
            annotated = preprocessing.preprocess(replaced, "en")
            sentence = [token.lemma for token in annotated]
            focus_index = find_head_token_index(annotated, surface, index)
            feats = features.extract_untagged(sentence, annotated,
                                              focus_index)

            bestoutfn, oofoutfn = output_paths(w)
            with open(bestoutfn, "a") as bestoutfile, \
                 open(oofoutfn, "a") as oofoutfile:
                answer = classifier.classify(feats)
                print(answer)
                dist = classifier.prob_classify(feats)
                oof_answers = topfive(dist)
                print(output_one_best(problem, "es", answer),
                      file=bestoutfile)
                print(output_five_best(problem, "es", oof_answers),
                      file=oofoutfile)
def CrossValidation(train_group, n=5):
    """Do n-fold cross validation of several classifiers and print metrics.

    The classifiers compared are nltk's NaiveBayes and the scikit-learn
    BernoulliNB, LogisticRegression, LinearSVC and NuSVC wrapped in
    SklearnClassifier. For each of them the accuracy, precision, recall and
    F1 (for the '+1' class) are averaged over the folds. The classifier of
    the last fold is pickled to './<name>.pkl'.

    Args:
        train_group: The original training set contains all the news related
            with the stock and its label. For example:
            ([[title1],[content1],[title2],[content2],...],'+1')
            The list is shuffled in place.
        n: How many folds you want. Default: 5.

    Returns:
        It doesn't return things, instead it prints the result.
        For each classifier, for example:
        ---------------------------------------
        N-FOLD CROSS VALIDATION RESULT (NaiveBayes)
        ---------------------------------------
        accuracy: 0.6479463537300922
        precision 0.6505853139411139
        recall 0.965771458662454
        f-measure 0.7774480712166171
    """

    def ratio(numerator, denominator):
        """Divide, returning 0.0 instead of raising on a zero denominator."""
        return float(numerator) / denominator if denominator else 0.0

    print('Preparing...')
    random.shuffle(train_group)

    classifier_list = [
        'NaiveBayes', 'BernoulliNB', 'LogisticRegression', 'SVC', 'LinearSVC',
        'NuSVC'
    ]  # 'Maximum Entropy', 'DecisionTree']

    # scikit-learn estimator behind each name; names not listed use NuSVC.
    # NOTE(review): 'SVC' is backed by LinearSVC, exactly as in the original
    # code - confirm whether sklearn.svm.SVC was intended.
    sklearn_estimators = {
        'BernoulliNB': BernoulliNB,
        'LogisticRegression': LogisticRegression,
        'SVC': LinearSVC,
        'LinearSVC': LinearSVC,
    }

    for cl in classifier_list:
        subset_size = int(math.floor(len(train_group) / n))
        accuracy = []
        precision = []
        recall = []
        F1 = []
        classifier = SklearnClassifier(NuSVC())

        for i in range(n):
            testing_this_round = train_group[i * subset_size:][:subset_size]
            training_this_round = (train_group[:i * subset_size] +
                                   train_group[(i + 1) * subset_size:])
            train_set, test_set = PrepareSets(training_this_round,
                                              testing_this_round)

            print('Training ' + cl + ' ' + str(i) + ' fold')
            if cl == 'NaiveBayes':
                classifier = nltk.NaiveBayesClassifier.train(train_set)
            else:
                estimator = sklearn_estimators.get(cl, NuSVC)
                classifier = SklearnClassifier(estimator())
                classifier.train(train_set)

            print('Testing...')
            TP = 0
            FN = 0
            FP = 0
            TN = 0
            for feats, label in test_set:
                observed = classifier.classify(feats)
                if label == '+1' and observed == '+1':
                    TP += 1
                elif label == '-1' and observed == '+1':
                    FP += 1
                elif label == '+1' and observed == '-1':
                    FN += 1
                elif label == '-1' and observed == '-1':
                    TN += 1

            fold_precision = ratio(TP, TP + FP)
            fold_recall = ratio(TP, TP + FN)
            accuracy.append(ratio(TP + TN, len(test_set)))
            recall.append(fold_recall)
            precision.append(fold_precision)
            # Harmonic mean 2PR / (P + R); the denominator must be
            # parenthesised, otherwise the expression collapses to 3 * R.
            F1.append(ratio(2 * fold_precision * fold_recall,
                            fold_precision + fold_recall))

        # Keep the classifier of the last fold; the context manager closes
        # the file handle.
        with open('./' + cl + '.pkl', 'wb') as model_file:
            pickle.dump(classifier, model_file)

        print('')
        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + cl + ')')
        print('---------------------------------------')
        print('accuracy:', np.mean(accuracy))
        print('precision', np.mean(precision))
        print('recall', np.mean(recall))
        print('f-measure', np.mean(F1))
        print('\n')
columns = [['predicted', 'predicted'], ['not mortality', 'mortality']])) df_confusion = pd.crosstab(labels, prediction) df_norm = df_confusion.values / df_confusion.sum(axis=1)[:,None] ax = sn.heatmap(df_norm, annot=True, annot_kws={"size": 20}, cmap="YlGnBu") plt.xlabel('Predicted label', fontsize=20) plt.ylabel('True label', fontsize=20) plt.title('Confusion Matrix, w/Normalization', fontsize=20) plt.show() test_true, test_pred = [], [] for i, (features, label) in enumerate(testing): test_true.append(label) observed = nltk_ensemble.classify(features) test_pred.append(observed) # need to use precision and recall instead to see false-positive rates average_precision = metrics.average_precision_score(test_true, test_pred) precision, recall, thresholds = metrics.precision_recall_curve(test_true, test_pred) f1 = metrics.f1_score(test_true, test_pred) auc = metrics.auc(recall, precision) print('Average precision-recall score: {0:0.2f}'.format(average_precision)) print('F1 score: {0:0.2f}'.format(f1)) print('AUC: {0:0.2f}'.format(auc)) plt.plot([0,1], [0.5,0.5], linestyle='--') plt.plot(recall, precision, marker='.') plt.xlabel('Recall')
'plot': (float(plot) / len(words)), 'theme': (float(theme) / len(words)) } maximumNB = max(statsNB.items(), key=operator.itemgetter(1))[0] print(maximumNB, statsNB.pop(maximumNB)) print( '--------------------------------------------------------------------------------' ) print('LogisticRegression Classifier') actor = 0 plot = 0 theme = 0 for word in new_words: LRResult = LR_classifier.classify(word_feats(word)) # print(word,classResultSK) if LRResult == 'actor': actor = actor + 1 if LRResult == 'plot': plot = plot + 1 if LRResult == 'theme': theme = theme + 1 statsLR = { 'actor': (float(actor) / len(words)), 'plot': (float(plot) / len(words)), 'theme': (float(theme) / len(words)) } maximumLR = max(statsLR.items(), key=operator.itemgetter(1))[0] print(maximumLR, statsLR.pop(maximumLR))
#%% Now we can see how we are doing via the various metrics
classifier = SklearnClassifier(knn(n_neighbors=17))
classifier.train(train)

# Indices of the test instances per label: referenceSets holds the true
# labels, testSets the predicted ones.
referenceSets = {'pos': set(), 'neg': set()}
testSets = {'pos': set(), 'neg': set()}

shuffle(test)
for i, (features, label) in enumerate(test):
    referenceSets[label].add(i)
    predicted = classifier.classify(features)
    testSets[predicted].add(i)

# Each print takes one pre-formatted string, so the output is the same on
# Python 2 and Python 3.
print('After training on %d samples, start to test on %d instances:' % (
    len(train), len(test)))
print('accuracy: %.2f' % nltk.classify.util.accuracy(classifier, test))
print('pos precision: %.2f' % nltk.precision(referenceSets['pos'],
                                             testSets['pos']))
print('neg precision: %.2f' % nltk.precision(referenceSets['neg'],
                                             testSets['neg']))
print('pos recall: %.2f' % nltk.recall(referenceSets['pos'],
                                       testSets['pos']))
print('neg recall: %.2f' % nltk.recall(referenceSets['neg'],
                                       testSets['neg']))

# Reset the predictions so the cell can be re-run with another classifier.
testSets['neg'].clear()
testSets['pos'].clear()
correct, total = 0., 0. no_class_pos = 0. no_class_neg = 0. no_class_neu = 0. no_result_pos = 0. no_result_neg = 0. no_result_neu = 0. true_pos = 0. true_neg = 0. true_neu = 0. correct_class_pos = 0. correct_class_neg = 0. correct_class_neu = 0. for (tweet, class_type, original_tweet_string) in test_tweet_list: result = classifier.classify(get_feature_mapping(tweet)) total += 1 if class_type == result: correct += 1 if class_type == 'positive': no_class_pos += 1 if result == 'positive': correct_class_pos += 1; if result == 'positive': no_result_pos += 1 if class_type == 'positive': true_pos += 1 if class_type == 'negative': no_class_neg += 1 if result == 'negative': correct_class_neg += 1; if result == 'negative':
class DocumentClassifier():
    '''
    Train a classifier with labeled documents and classify new documents
    into one of the labeled classes.

    We call 'dev docs' the document set provided for training the
    classifier. These 'dev docs' are split into two subsets: 'train docs'
    and 'test docs', used to train and test the machine learning model
    respectively.

    Parameters
    ----------
    train_p : float, 0.8 by default
        The proportion of the 'dev docs' used as 'train docs'.
        Use values greater than 0 and lower than 1.
        The remaining docs will be used as 'test docs'.

    eq_label_num : boolean, True by default
        If true, 'train docs' will have an equal number of documents for
        each class. This number will be the lowest label count.

    complete_p : boolean, True by default
        Used when eq_label_num is True, but the lowest label count is not
        enough for getting the train_p proportion of 'train docs'. If this
        attribute is True, more documents from 'test docs' will be moved
        to 'train docs' until we get train_p.

    n_folds : integer, 10 by default
        Number of folds to be used in the k-fold cross validation technique
        for choosing different sets as 'train docs'.

    vocab_size : integer, 250 by default
        This is the size of the vocabulary set that will be used for
        extracting features out of the docs.

    t_classifier : string, 'NB' by default
        This is the type of classifier model used. Available types are 'NB'
        (Naive Bayes), 'DT' (decision tree), 'RF' (Random Forest), and 'SVM'
        (Support Vector Machine).

    language: string, 'english' by default
        Language in which documents are written.

    stem: boolean, False by default
        If True, stemming is applied to feature extraction.

    train_method: string, 'all_class_train' by default
        Choose the method to train the classifier. There are two options:
        'all_class_train' and 'cross_validation'.
    '''

    # Accepted values of t_classifier and train_method.
    _CLASSIFIER_TYPES = ("NB", "DT", "RF", "SVM")
    _TRAIN_METHODS = ("cross_validation", "all_class_train")

    def __init__(self, train_p=0.8, eq_label_num=True,
                 complete_p=True, n_folds=10,
                 vocab_size=250,
                 t_classifier="NB", language="english",
                 stem=False, train_method="all_class_train"):
        self.train_p = train_p
        self.eq_label_num = eq_label_num
        self.complete_p = complete_p
        self.n_folds = n_folds
        self.vocab_size = vocab_size
        self.t_classifier = t_classifier
        self.language = language
        self.stem = stem
        self.train_method = train_method
        self._vocab = []
        self._classified_docs = []
        self._classifier = None
        self._accuracy = 0
        self._precision = {}
        self._recall = {}
        self._f_measure = {}
        self._train_docs = []
        self._test_docs = []
        # Defined up front so category_count and eval_classifier never hit
        # an AttributeError before classify_docs/count_categories has run.
        self._categories = []
        self._final_cat_count = {}

    def _make_classifier(self, train_set):
        '''
        Build and train the model selected by t_classifier.

        Parameters
        ----------
        train_set: the labeled feature sets to train on

        Returns
        -------
        classifier: the trained classifier

        Raises
        ------
        ValueError: if t_classifier is not 'NB', 'DT', 'RF' or 'SVM'
        '''
        if self.t_classifier == "NB":
            return NaiveBayesClassifier.train(train_set)
        if self.t_classifier == "DT":
            return DecisionTreeClassifier.train(train_set)
        if self.t_classifier == "RF":
            return SklearnClassifier(RandomForestClassifier())\
                .train(train_set)
        if self.t_classifier == "SVM":
            return SklearnClassifier(LinearSVC(), sparse=False)\
                .train(train_set)
        raise ValueError(
            "unknown t_classifier {!r}; expected one of {}".format(
                self.t_classifier, ", ".join(self._CLASSIFIER_TYPES)))

    def split_train_and_test(self, docs):
        '''
        Split the 'dev docs' set into the 'train docs' and 'test docs'
        subsets, stored in self._train_docs and self._test_docs.

        Parameters
        ----------
        docs: list
            A list of (text, category) pairs
        '''
        if not docs:
            # Nothing to split; avoids min() of an empty sequence and a
            # division by zero below.
            self._train_docs = []
            self._test_docs = []
            return

        categories_count = self.count_categories(docs)
        label_limit = min(categories_count.values())
        labeled_docs = {}
        train_docs = []
        test_docs = []
        # Split docs by label
        for cat in categories_count:
            labeled_docs[cat] = shuffled([t for (t, k) in docs if k == cat])
        if self.eq_label_num:
            # Select the same number of docs for all labels
            for cat, cat_docs in labeled_docs.items():
                train_docs += [(doc, cat) for doc in cat_docs[:label_limit]]
                test_docs += [(doc, cat) for doc in cat_docs[label_limit:]]
            l_train = len(train_docs)
            l_docs = len(docs)
            l_test = len(test_docs)
            # If the training proportion is not reached, top it up with
            # randomly chosen test docs.
            if self.complete_p and float(l_train) / l_docs < self.train_p:
                shuffled_extra = shuffled(test_docs)
                extra_i = 0
                # Move the smallest number of docs that reaches train_p
                # (testing before incrementing avoids moving one too many).
                while (extra_i < l_test and
                       float(l_train + extra_i) / l_docs < self.train_p):
                    extra_i += 1
                train_docs += shuffled_extra[:extra_i]
                test_docs = shuffled_extra[extra_i:]
        else:
            label_limit = int(self.train_p * len(docs))
            shuffled_docs = shuffled(docs)
            train_docs = shuffled_docs[:label_limit]
            test_docs = shuffled_docs[label_limit:]
        self._train_docs = train_docs
        self._test_docs = test_docs

    def cross_validation_train(self, dev_docs):
        '''
        Applies the k-fold cross validation technique to split the docs
        into different pairs of training and testing sets. For each pair,
        it trains and evaluates a classifier, keeping the one with the best
        accuracy (together with its train/test split).

        Parameters
        ----------
        dev_docs: list
            A list of (text, category) pairs
        '''
        dev_docs = shuffled(dev_docs)
        # Start below any real accuracy so the first fold is always kept,
        # even when every fold scores 0.
        best_accuracy = -1.0
        subset_size = int(len(dev_docs) / self.n_folds)

        for i in range(self.n_folds):
            train_docs = (dev_docs[(i + 1) * subset_size:] +
                          dev_docs[:i * subset_size])
            test_docs = dev_docs[i * subset_size:(i + 1) * subset_size]
            train_set = apply_features(self.get_doc_features, train_docs)
            classifier = self._make_classifier(train_set)

            test_set = apply_features(self.get_doc_features, test_docs, True)
            fold_accuracy = accuracy(classifier, test_set) * 100

            if fold_accuracy > best_accuracy:
                best_accuracy = fold_accuracy
                self._classifier = classifier
                self._train_docs = train_docs
                self._test_docs = test_docs

    def all_class_train(self, dev_docs):
        '''
        Train the classifier with the train_p proportion of every class.
        The remaining docs of each class are used for testing.

        Parameters
        ----------
        dev_docs: list
            A list of (text, category) pairs
        '''
        categories_count = self.count_categories(dev_docs)

        labeled_docs = {}
        for cat in categories_count:
            labeled_docs[cat] = shuffled(
                [t for (t, k) in dev_docs if k == cat])

        train_docs = []
        test_docs = []

        for cat, cat_docs in labeled_docs.items():
            cat_limit = int(self.train_p * len(cat_docs))
            train_docs += [(t, cat) for t in cat_docs[:cat_limit]]
            test_docs += [(t, cat) for t in cat_docs[cat_limit:]]

        self._train_docs = train_docs
        self._test_docs = test_docs

        # create and train the classification model according to t_classifier
        train_set = apply_features(self.get_doc_features, self._train_docs)
        self._classifier = self._make_classifier(train_set)

    def count_categories(self, docs):
        '''
        Count how many documents of each class are in the 'dev docs' set.
        Also stores the sorted category names in self._categories.

        Parameters
        ----------
        docs: iterable
            An iterable of (text, category) pairs

        Returns
        -------
        counters: dictionary
            A dictionary where each item is the number of docs for a class
        '''
        counters = dict(collections.Counter(cat for (text, cat) in docs))
        self._categories = sorted(counters)
        return counters

    def get_doc_features(self, doc):
        '''
        Extract features of a document, checking the presence of the words
        in the vocabulary.

        Parameters
        ----------
        doc: string
            The doc from which features will be extracted

        Returns
        -------
        features: dictionary
            A dictionary where each item indicates the presence of a
            word from the vocabulary in the input doc
        '''
        features = {}
        for word in self._vocab:
            features['contains({})'.format(word)] = (word in doc)
        return features

    def train_classifier(self, dev_docs):
        '''
        Create the features vocabulary from 'dev docs',
        split 'dev docs', and train the classifier with 'train docs'.

        Parameters
        ----------
        dev_docs: list
            A list of (text, category) pairs

        Raises
        ------
        ValueError: if train_method or t_classifier is not a known value
        '''
        # Fail early on a bad configuration instead of silently leaving a
        # stale (or None) classifier behind.
        if self.train_method not in self._TRAIN_METHODS:
            raise ValueError(
                "unknown train_method {!r}; expected one of {}".format(
                    self.train_method, ", ".join(self._TRAIN_METHODS)))
        if self.t_classifier not in self._CLASSIFIER_TYPES:
            raise ValueError(
                "unknown t_classifier {!r}; expected one of {}".format(
                    self.t_classifier, ", ".join(self._CLASSIFIER_TYPES)))

        # create vocabulary for feature extraction
        ce = ConceptExtractor(num_concepts=self.vocab_size,
                              language=self.language,
                              pos_vec=['NN', 'NNP', 'NNS', 'NNPS'])
        ce.extract_concepts([t for (t, c) in dev_docs])
        self._vocab = sorted([c for (c, f) in ce.common_concepts],
                             key=str.lower)
        if self.stem:
            self._vocab = [tokenize_and_stem(w, language=self.language)[0]
                           for w in self._vocab]

        if self.train_method == "cross_validation":
            self.cross_validation_train(dev_docs)
        else:
            self.all_class_train(dev_docs)

    def eval_classifier(self):
        '''
        Test the model and calculate the metrics of accuracy, precision,
        recall and f-measure (the last three per category).
        '''
        test_set = apply_features(self.get_doc_features, self._test_docs, True)
        self._accuracy = accuracy(self._classifier, test_set)
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(test_set):
            refsets[label].add(i)
            observed = self._classifier.classify(feats)
            testsets[observed].add(i)

        # Called for its side effect: refreshes self._categories from the
        # training docs.
        self.count_categories(self._train_docs)
        for cat in self._categories:
            self._precision[cat] = precision(refsets[cat], testsets[cat])
            self._recall[cat] = recall(refsets[cat], testsets[cat])
            self._f_measure[cat] = f_measure(refsets[cat], testsets[cat])

    def classify_docs(self, docs):
        '''
        First train the classifier with the labeled data.
        Then classify the unlabeled data.

        Parameters
        ----------
        docs: iterable
            An iterable of (text, category) pairs; docs whose category is
            the empty string are treated as unlabeled
        '''
        dev_docs = [(t, c) for (t, c) in docs if c != ""]
        unlabeled_docs = [t for (t, c) in docs if c == ""]
        self.train_classifier(dev_docs)
        self.eval_classifier()
        results = []
        for doc in unlabeled_docs:
            doc_feats = self.get_doc_features(doc)
            result = self._classifier.classify(doc_feats)
            results.append((doc, result))
        self._classified_docs = results
        self._final_cat_count = self.count_categories(dev_docs + results)

    @property
    def classified_docs(self):
        return self._classified_docs

    @property
    def accuracy(self):
        return self._accuracy

    @property
    def precision(self):
        return self._precision

    @property
    def recall(self):
        return self._recall

    @property
    def f_measure(self):
        return self._f_measure

    @property
    def category_count(self):
        return self._final_cat_count
train_set = data[:slice]
test_set = data[slice:]

# train classification models
# Every print below takes one pre-formatted string so the output is the same
# on Python 2 and Python 3.
print('Training models on {0} data samples...'.format(len(train_set)))
nb = NaiveBayesClassifier.train(train_set)
lr = SklearnClassifier(LogisticRegression()).train(train_set)
kwfc = KeywordFrequencyClassifier()
kwfc.train(train_set)

# calculate and report model accuracy
print('\nKey Word Frequency Classifier accuracy based on {0} samples:'.format(
    len(test_set)))
print(kwfc.accuracy(test_set))
print('\nNaive Bayes accuracy based on {0} samples:'.format(len(test_set)))
print(nltk.classify.util.accuracy(nb, test_set))
print('\nLogistic Regression accuracy based on {0} samples:'.format(
    len(test_set)))
print(nltk.classify.util.accuracy(lr, test_set))

# an example
sample_post = 'How many numbers less than 70 are relatively prime to it?'
test = util.features(sample_post)

# attempt to classify the sample sentence
# (no space after the newline: the Python 2 print statement did not emit one)
print('\nAn Example:\n{0}'.format(sample_post))
print('Naive Bayes: {0}'.format(nb.classify(test)))
print('Keyword Classifier {0}'.format(kwfc.predict(test)))
print('Logistic Regression: {0}'.format(lr.classify(test)))
class BookClassifier:
    """Train Naive Bayes and linear-SVM classifiers that label books.

    Settings come from ``BookClassifier.config`` and the run mode (1 or 2)
    from ``sys.argv[1]``. Mode 2 additionally loads the second training
    file, whose rows are treated as table-of-contents text.

    The code targets Python 2 (``ConfigParser``, two-argument
    ``str.translate``, ``dict.iteritems``) and the NLTK 2 ``FreqDist.inc``
    API; those calls are kept as they are.
    """

    def __init__(self):
        """Read the configuration and set up empty working state.

        Raises:
            ValueError: if the mode in ``sys.argv[1]`` is neither 1 nor 2.
        """
        config = ConfigParser.ConfigParser()
        config.read("BookClassifier.config")
        cur_dir = os.getcwd()

        # Config parameters
        data_dir = config.get('GLOBAL', 'data_dir')
        op_dir = config.get('GLOBAL', 'output_dir')
        train_file = config.get('GLOBAL', 'train_file_name')
        train_file_2 = config.get('GLOBAL', 'train_file_name_2')
        self.bigram_threshold = int(config.get('GLOBAL', 'bigram_threshold'))
        self.k_fold = int(config.get('GLOBAL', 'k_fold'))
        self.unigram_threshold = int(config.get('GLOBAL', 'unigram_threshold'))
        self.data_dir = os.path.join(cur_dir, data_dir)
        self.output_dir = os.path.join(cur_dir, op_dir)
        self.train_file = os.path.join(self.data_dir, train_file)
        self.train_file_2 = os.path.join(self.data_dir, train_file_2)
        self.logger_file = os.path.join(self.output_dir, "BookClassifier.log")

        self.mode = int(sys.argv[1])
        if self.mode == 1:
            output_file = config.get('GLOBAL', 'output_file_1')
        elif self.mode == 2:
            output_file = config.get('GLOBAL', 'output_file_2')
        else:
            # Previously this fell through and failed later on an unbound
            # local; fail here with a message that names the problem.
            raise ValueError("mode must be 1 or 2, got %r" % self.mode)
        self.output_file = os.path.join(self.output_dir, output_file)

        # Data structures
        self.stopwords_set = set(stopwords.words('english'))
        self.toc_list = []
        self.training_feats = []
        self.test_cases = []
        self.book_instances = []
        self.selected_features = []
        self.book_category_set = set()
        self.bookid_to_toc_dict = {}  # toc - table of contents
        self.train_file_fd = None
        self.train_file_2_fd = None
        self.output_file_fd = None

        # classifiers
        self.nb_classifier = None
        self.svm_classifier = None

    def initialize_logger(self):
        """Send INFO-level logging to ``self.logger_file``."""
        logging.basicConfig(filename=self.logger_file, level=logging.INFO)
        logging.info("Initialized logger")
        self.logging = logging

    def run_main(self):
        """Run the whole pipeline, from loading data to closing files."""
        self.preprocessing()
        self.feature_selection()
        self.feature_extraction()
        self.classification()
        self.testing()
        self.cross_validation()
        self.close_files()

    def clean_book_title(self, title):
        """Strip punctuation from ``title`` and return its word tokens."""
        # Two-argument translate(None, chars) is the Python 2 ``str`` API.
        return nltk.word_tokenize(title.translate(None, string.punctuation))

    def clean_author_name(self, author):
        """Split a ``;``-separated author field into a list of names."""
        return author.split(";")

    def _unigram_feats(self, tokens):
        """Return ``(token.lower(), True)`` for every selected non-stopword."""
        feats = []
        for token in tokens:
            if not token:
                continue
            lowered = token.lower()
            if lowered in self.selected_features and lowered not in self.stopwords_set:
                feats.append((lowered, True))
        return feats

    def feature_extraction(self):
        """Turn the loaded rows into training features and test cases.

        Rows with four tab-separated fields become ``(feature dict, label)``
        entries in ``self.training_feats``; the first field is the label.
        Rows with three fields are stored untouched in ``self.test_cases``.
        Rows of any other shape are ignored.
        """
        for instance in self.book_instances:
            try:
                raw_data = instance and instance.strip() and instance.strip().split("\t")
                if not raw_data:
                    continue
                if len(raw_data) == 3:
                    # Unlabeled row: keep it for testing() and stop here.
                    # Falling through used to re-append the previous row's
                    # features to the training data.
                    self.test_cases.append(instance)
                    continue
                if len(raw_data) != 4:
                    continue

                label = raw_data[0]
                tokens = []
                tokens.extend(self.clean_book_title(raw_data[2]))
                tokens.extend(self.clean_author_name(raw_data[3]))
                tokens.extend(self.bookid_to_toc_dict.get(raw_data[1], []))

                train_feats_list = self._unigram_feats(tokens)
                train_feats_list.extend(
                    self.get_bigram([word for word, _ in train_feats_list]))
                self.training_feats.append((dict(train_feats_list), label))
            except Exception:
                self.logging.info(
                    "Exception while running this instance %s\n" % instance,
                    exc_info=True)

    def get_bigram(self, features_list):
        """Return the best chi-square bigrams of ``features_list``.

        At most ``self.bigram_threshold`` bigrams are selected; each one is
        returned as a ``(bigram, True)`` pair.
        """
        score = BigramAssocMeasures.chi_sq
        all_bigrams = BigramCollocationFinder.from_words(features_list)
        best_bigrams = all_bigrams.nbest(score, self.bigram_threshold)
        return [(bigram, True) for bigram in best_bigrams]

    def classification(self):
        """Train both classifiers on all of ``self.training_feats``."""
        # Training NB classifier
        self.nb_classifier = NaiveBayesClassifier.train(self.training_feats)
        # Training SVM classifier
        self.svm_classifier = SklearnClassifier(LinearSVC())
        self.svm_classifier.train(self.training_feats)

    def testing(self):
        """Label every stored test case with the SVM and write the result.

        Each output line is ``<first field>\\t<predicted label>``. A row that
        raises is logged and skipped.
        """
        for instance in self.test_cases:
            try:
                raw_data = instance.strip() and instance.strip().split("\t")
                if not raw_data:
                    continue
                tokens = []
                tokens.extend(self.clean_book_title(raw_data[1]))
                tokens.extend(self.clean_author_name(raw_data[2]))

                test_feats_list = self._unigram_feats(tokens)
                test_feats_list.extend(
                    self.get_bigram([word for word, _ in test_feats_list]))
                label = self.svm_classifier.classify(dict(test_feats_list))
                self.output_file_fd.write("%s\t%s\n" % (raw_data[0], label))
            except Exception:
                self.logging.info(
                    "Exception while running this instance %s\n" % instance,
                    exc_info=True)

    def cross_validation(self):
        """Run ``self.k_fold``-fold cross validation and log the averages.

        Folds are contiguous slices of ``self.training_feats``. Both
        classifiers are retrained per fold, so ``self.nb_classifier`` and
        ``self.svm_classifier`` end up holding the last fold's models.
        Nothing is evaluated when ``k_fold`` is not positive or there are
        fewer samples than folds.
        """
        train_feats_count = len(self.training_feats)
        if self.k_fold <= 0:
            self.logging.info("Skipping cross validation: k_fold is %s\n" % self.k_fold)
            return
        fold_size = train_feats_count // self.k_fold
        if fold_size == 0:
            # Empty test folds would make the accuracy computation divide
            # by zero.
            self.logging.info(
                "Skipping cross validation: %s samples for %s folds\n"
                % (train_feats_count, self.k_fold))
            return

        nb_accuracy_list = []
        svm_accuracy_list = []
        nb_f_val_list = []
        svm_f_val_list = []
        for a in range(self.k_fold):
            start_index = a * fold_size
            end_index = start_index + fold_size
            train_features = self.training_feats[:start_index] + self.training_feats[end_index:]
            test_features = self.training_feats[start_index:end_index]

            self.nb_classifier = NaiveBayesClassifier.train(train_features)
            nb_accuracy_list.append(
                nltk.classify.util.accuracy(self.nb_classifier, test_features))

            self.svm_classifier = SklearnClassifier(LinearSVC())
            self.svm_classifier.train(train_features)
            svm_accuracy_list.append(
                nltk.classify.util.accuracy(self.svm_classifier, test_features))

            # Find F-Measure
            nb_f_val_list.append(
                self.compute_measures(test_features, self.nb_classifier))
            svm_f_val_list.append(
                self.compute_measures(test_features, self.svm_classifier))

        self.logging.info('Average accuracy of Naive Bayes Classifier %s\n' % (float(sum(nb_accuracy_list)/len(nb_accuracy_list))))
        self.logging.info('Average accuracy of SVM Classifier %s\n' % (float(sum(svm_accuracy_list)/len(svm_accuracy_list))))
        self.logging.info('Average F measure of Naive Bayes Classifier %s\n' % (float(sum(nb_f_val_list)/len(nb_f_val_list))))
        self.logging.info('Average F measure of SVM Classifier %s\n' % (float(sum(svm_f_val_list)/len(svm_f_val_list))))

    def compute_measures(self, test_features, classifier):
        """Return the F measure of ``classifier`` on ``test_features``.

        Precision and recall are averaged over categories first and then
        combined into one F value.
        """
        actual_labels, predicted_labels = self.get_actual_and_predicted_labels(test_features, classifier)
        precision = self.find_precision(actual_labels, predicted_labels)
        recall = self.find_recall(actual_labels, predicted_labels)
        return self.find_f_measure(precision, recall)

    def find_precision(self, actual_labels, predicted_labels):
        """Average precision over categories that were predicted at least once.

        Returns 0 when there is nothing to average.
        """
        if not actual_labels and not predicted_labels:
            return 0
        precision_list = []
        for category in self.book_category_set:
            if not predicted_labels.get(category):
                continue
            precision_list.append(
                nltk.metrics.precision(actual_labels.get(category, set()),
                                       predicted_labels.get(category, set())))
        if not precision_list:
            return 0
        return float(sum(precision_list)/len(precision_list))

    def find_recall(self, actual_labels, predicted_labels):
        """Average recall over categories that occur in the gold labels.

        Returns 0 when there is nothing to average.
        """
        if not actual_labels and not predicted_labels:
            return 0
        recall_list = []
        for category in self.book_category_set:
            if not actual_labels.get(category):
                continue
            recall_list.append(
                nltk.metrics.recall(actual_labels.get(category, set()),
                                    predicted_labels.get(category, set())))
        if not recall_list:
            return 0
        return float(sum(recall_list)/len(recall_list))

    def find_f_measure(self, precision, recall):
        """Return the harmonic mean of precision and recall (0 if both are 0)."""
        if precision == 0 and recall == 0:
            return 0
        return 2 * (precision * recall) / float(precision + recall)

    def get_actual_and_predicted_labels(self, test_features, classifier):
        """Group sample indices by gold label and by predicted label.

        Returns:
            ``(actual, predicted)``: two dicts mapping a label to the set of
            indices into ``test_features`` carrying that label.
        """
        actual_labels = {}
        predicted_labels = {}
        for i, (features, label) in enumerate(test_features):
            actual_labels.setdefault(label, set()).add(i)
            predicted = classifier.classify(features)
            predicted_labels.setdefault(predicted, set()).add(i)
        return (actual_labels, predicted_labels)

    def preprocessing(self):
        """Set up logging, open the files and load the raw rows."""
        self.initialize_logger()
        self.open_files()
        self.load_data()

    def feature_selection(self):
        """Structure the TOC rows, then pick the best unigram features."""
        self.clean_and_structure_toc_data()
        self.clean_train_data_and_find_best_features()

    def clean_train_data_and_find_best_features(self):
        """Score every token by chi-square against the labels and keep the best.

        Only four-field rows take part. The first field is the label and is
        collected into ``self.book_category_set``. Stopwords get no score.
        The final cut is made by ``select_top_n_best_features``.
        """
        freq_dist_obj = FreqDist()
        cond_freq_dist_obj = ConditionalFreqDist()
        self.book_category_set = set()
        for instance in self.book_instances:
            try:
                raw_data = instance and instance.strip() and instance.strip().split("\t")
                if not raw_data or len(raw_data) != 4:
                    continue
                bookid = raw_data[0]
                self.book_category_set.add(bookid)
                features = []
                features.extend(self.clean_book_title(raw_data[2]))
                features.extend(self.clean_author_name(raw_data[3]))
                features.extend(self.bookid_to_toc_dict.get(raw_data[1], []))
                for feat in features:
                    freq_dist_obj.inc(feat)
                    cond_freq_dist_obj[bookid].inc(feat)
            except Exception:
                self.logging.info(
                    "Exception while running this instance %s \n" % instance,
                    exc_info=True)

        total_word_count = 0
        for bookid in self.book_category_set:
            total_word_count += cond_freq_dist_obj[bookid].N()

        word_score_dict = {}
        for word, freq in freq_dist_obj.iteritems():
            if word and word.lower() in self.stopwords_set:
                continue
            # A word's score is the sum of its chi-square association with
            # every label.
            score = 0
            for bookid in self.book_category_set:
                score += BigramAssocMeasures.chi_sq(
                    cond_freq_dist_obj[bookid][word],
                    (freq, cond_freq_dist_obj[bookid].N()),
                    total_word_count)
            word_score_dict[word] = score
        self.select_top_n_best_features(word_score_dict)

    def select_top_n_best_features(self, word_score_dict):
        """Keep the top ``self.unigram_threshold`` percent of scored words.

        The result, a set of lower-cased words, is stored in
        ``self.selected_features``.
        """
        ranked = sorted(word_score_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
        total_select_count = int(len(ranked) * self.unigram_threshold/float(100))
        self.selected_features = set(
            [pair[0].lower() for pair in ranked[:total_select_count] if pair[0]])

    def clean_book_toc(self, toc):
        """Return the alphabetic words of ``toc``; everything else separates."""
        return [word for word in re.sub("[^a-zA-Z]", " ", toc).split(" ") if word]

    def clean_and_structure_toc_data(self):
        """Fill ``self.bookid_to_toc_dict`` from the extra training rows.

        The key is the text before the first tab. The value collects the
        cleaned words of the row except the first one.
        """
        for instance in self.toc_list:
            raw_data = instance and instance.strip() and instance.strip().replace("↵","")
            if not raw_data:
                continue
            bookid = raw_data.split("\t")[0]
            clean_data = self.clean_book_toc(raw_data)
            self.bookid_to_toc_dict.setdefault(bookid, []).extend(clean_data[1:])

    def open_files(self):
        """Open both training files for reading and the output for writing."""
        self.train_file_fd = open(self.train_file, 'r')
        self.train_file_2_fd = open(self.train_file_2, 'r')
        self.output_file_fd = open(self.output_file, 'w')

    def load_data(self):
        """Load the training rows, plus the extra rows when in mode 2."""
        self.load_train_data()
        if self.mode == 2:
            # Load more train data only when it run as problem 2.
            self.load_more_train_data()

    def load_train_data(self):
        """Read the first training file, dropping its first (header) line."""
        self.book_instances = self.train_file_fd.readlines()[1:]

    def load_more_train_data(self):
        """Append the second training file to ``self.toc_list``.

        The first element of the combined list is dropped afterwards.
        """
        self.toc_list.extend(self.train_file_2_fd.readlines())
        self.toc_list = self.toc_list[1:]

    def close_files(self):
        """Close every file opened by ``open_files``."""
        self.train_file_fd.close()
        self.train_file_2_fd.close()
        self.output_file_fd.close()
# save_featureList is opened earlier in the script; make sure it is closed
# even when pickling fails.
try:
    pickle.dump(featureList, save_featureList)
finally:
    save_featureList.close()

# Create featuresets ------------------------------------------------
count = 0
print("Extract feature vector for all tweets in one shoot")
training_set = nltk.classify.util.apply_features(extract_features_2, tweet_dicts)
print(training_set)

"""
# Train the classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

# Test the classifier
testTweet = 'Congrats @ravikiranj, i heard you wrote a new tech post on sentiment analysis'
pre_process_result = pre_process_tweet(testTweet)
processedTestTweet = process_tweet(pre_process_result)['featureVector']
#print( NBClassifier.classify(extract_features(processedTestTweet)))
#print(NBClassifier.show_most_informative_features(10))
"""

# Train the linear SVM through NLTK's scikit-learn wrapper and persist it.
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)

with open("pickled/LinearSVC_classifier_3_ways.pickled", "wb") as save_LinearSVC_classifier:
    pickle.dump(LinearSVC_classifier, save_LinearSVC_classifier)

# NOTE: this is measured on the same set the model was trained on, so it is
# training accuracy rather than a held-out estimate.
LSVC_accuracy = nltk.classify.accuracy(LinearSVC_classifier, training_set)
print(LSVC_accuracy)

print(LinearSVC_classifier.classify(extract_features_2(process_tweet(Tweet7))))
file_path_test = os.path.join( 'C:/Users/DComp2/Desktop/python learn/get_data/training_data', 'test_data.txt') text_file_test = open(file_path_test, "rU") raw_data_test = text_file_test.readlines() text_file_test.close() question_test, coarse_label_test, fine_label_test = get_labels(raw_data_test) labeled_data_test = zip(question_test, coarse_label_test) test_data = question_test print("test_data_length", len(test_data)) predict_labels = [] for que in test_data: test_features = combined_features(que) predicted = SVC_classifier.classify(test_features) predict_labels.append(predicted) import numpy as np true_label = np.array(coarse_label_test) predicted_label = np.array(predict_labels) from sklearn.metrics import precision_recall_fscore_support precision, recall, fscore, support = precision_recall_fscore_support( true_label, predicted_label) from collections import Counter counts = Counter(coarse_label_test) all_set = zip(precision, recall, fscore, support) print for precision, recall, fscore, support in all_set: print 'Precision:', round(precision, 2) print 'Recall:', round(recall, 2)
# print(wordlist.most_common(10))
# print(classifier.show_most_informative_features(32))
# print(extract_features())
# tweet = "'Love-cheat' Daniel Radcliffe splits with girlfriend Rosie Coker: London, Oct 19: Daniel Radcliffe has split wit... http://tinyurl.com/8oxx2ns "
# print(classifier.classify(extract_features(tweet.split())))

# Copy each row of the test file to the evaluation file, with the third
# column replaced by the classifier's prediction for the text in the
# fourth column.
test_data_path = "/Users/Jaaksi/Documents/Github/learnpython/harkkatyo/test_data.tsv"
eval_file_path = "/Users/Jaaksi/Documents/Github/learnpython/harkkatyo/evalfile.tsv"

with open(test_data_path, "r") as testfile, open(eval_file_path, "w") as evalfile:
    tsvreader = csv.reader(testfile, dialect='excel-tab', delimiter="\t")
    evalwriter = csv.writer(evalfile, dialect='excel-tab', delimiter='\t')
    for line in tsvreader:
        tweet = line[3]
        result = classifier.classify(extract_features(tweet.split()))
        evalwriter.writerow([line[0], line[1], result, tweet])

# Placed after the with-block so the evaluation file is flushed and closed
# before it is read. NOTE(review): the original nesting is not recoverable
# from the collapsed source -- confirm.
evaluator.evaluate(test_data_path, eval_file_path)

# print(classifier.show_most_informative_features(15))
def evaluate_features(feature_select):
    """Cross-validate three classifiers on the pos/neg corpora and print accuracy.

    Every line of the module-level strings ``short_pos`` and ``short_neg``
    is tokenized, converted with ``feature_select`` and labeled "pos" or
    "neg". The labeled data is split into 6 shuffled folds. On each fold a
    Naive Bayes, a LogisticRegression and a LinearSVC model are trained,
    scored and pickled. The average accuracy of each model, in percent, is
    printed at the end.

    Args:
        feature_select: callable that takes the token list of one line and
            returns the feature set for that line.
    """
    # process positive dataset "processed_pro_GMO.txt"
    posFeatures = [[feature_select(word_tokenize(line)), "pos"]
                   for line in short_pos.split('\n')]
    # process negative dataset "processed_anti_GMO.txt"
    negFeatures = [[feature_select(word_tokenize(line)), "neg"]
                   for line in short_neg.split('\n')]

    # get 6-Fold cross validation for Accuracy
    num_folds = 6
    training = posFeatures + negFeatures
    cv = cross_validation.KFold(len(training), n_folds=num_folds,
                                shuffle=True, random_state=None)

    Naive_Accu = 0
    SVC_Accu = 0
    Regression_Accu = 0

    for traincv, testcv in cv:
        # KFold yields arrays of sample indices. Pick exactly those samples;
        # slicing from the first to the last index would overlap the test
        # fold and drop the last sample.
        train_fold = [training[idx] for idx in traincv]
        test_fold = [training[idx] for idx in testcv]

        # BasedNaiveClassifier
        BasedNaiveClassifier = NaiveBayesClassifier.train(train_fold)
        Naive_Accu += nltk.classify.util.accuracy(BasedNaiveClassifier, test_fold) * 100
        BasedNaiveClassifier.show_most_informative_features(10)
        # Each fold overwrites the pickle, so the file ends up holding the
        # model of the last fold.
        with open("GMO_Hanzhe/BasedNaiveClassifier10k.pickle", "wb") as save_classifier:
            pickle.dump(BasedNaiveClassifier, save_classifier)

        # LogisticRegression
        LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
        LogisticRegression_classifier.train(train_fold)
        Regression_Accu += nltk.classify.util.accuracy(
            LogisticRegression_classifier, test_fold) * 100
        with open("GMO_Hanzhe/LogisticRegression_classifier10k.pickle", "wb") as save_classifier:
            pickle.dump(LogisticRegression_classifier, save_classifier)

        # LinearSVC
        LinearSVC_classifier = SklearnClassifier(LinearSVC())
        LinearSVC_classifier.train(train_fold)
        SVC_Accu += nltk.classify.util.accuracy(LinearSVC_classifier, test_fold) * 100
        with open("GMO_Hanzhe/LinearSVC_classifier10k.pickle", "wb") as save_classifier:
            pickle.dump(LinearSVC_classifier, save_classifier)

        # Per-class precision/recall reporting was already commented out in
        # the original (dated 7/5/2015); its leftover reference/test set
        # bookkeeping produced no output and has been removed.

    # get Average score for Accuracy
    accu = Naive_Accu/num_folds
    print("Average Naive Bayes Accuracy is:", accu)

    Regression_Accu = Regression_Accu/num_folds
    print("LogisticRegression_classifier accuracy percent:", Regression_Accu)

    SVC_Accu = SVC_Accu/num_folds
    print("LinearSVC_classifier accuracy percent:", SVC_Accu)
# Build the bag-of-words validation set; each entry is indexed as
# (features, label) below.
sa = SentimentAnalyzer(validation_sample)
validation_set = sa.bow
ground_truth = [r[1] for r in validation_set]
validation_features = [r[0] for r in validation_set]

#print("Training MultinomialNB")
#MNB_clf = SklearnClassifier(MultinomialNB())
#MNB_clf.train(training_set)
#MNB_pred = [MNB_clf.classify(r[0]) for r in validation_set]
##for i in range(len(MNB_pred)):
##    if MNB_pred[i] == 5:
##        print(validation_sample.review_body[i])
##        print(validation_reviews[i],"\n+++++++++++++++++++++++++++++++")
#print("Got F1 score of", precision_score(ground_truth, MNB_pred, average='micro'))
#
#print("Training BernoulliNB")
#BNB_clf = SklearnClassifier(BernoulliNB())
#BNB_clf.train(training_set)
#BNB_pred = [BNB_clf.classify(r[0]) for r in validation_set]
#print("Got F1 score of", precision_score(ground_truth, BNB_pred, average='micro'))

print("Training LogisticRegression")
LogReg_clf = SklearnClassifier(LogisticRegression())
LogReg_clf.train(training_set)
LogReg_pred = [LogReg_clf.classify(feats) for feats in validation_features]
# The message says "F1" but the value is micro-averaged precision_score.
LogReg_score = precision_score(ground_truth, LogReg_pred, average='micro')
print("Got F1 score of", LogReg_score)

#print("Training SGD")
#SGD_clf = SklearnClassifier(SGDClassifier())
#SGD_clf.train(training_set)
#SGD_pred = [SGD_clf.classify(r[0]) for r in validation_set]
#print("Got F1 score of", precision_score(ground_truth, SGD_pred, average='micro'))#
print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)) classif = SklearnClassifier(LinearSVC()) classif.train(trainfeats) print classif.labels() test_skl = [] t_test_skl = [] for d in testfeats: test_skl.append(d[0]) t_test_skl.append(d[1]) print(set(t_test_skl)) result = [] for item in test_skl: p = classif.classify(item) result.append(p) print len(result) print len(t_test_skl) score = 0.0 for i in range(0,len(result)): if result[i] == t_test_skl[i]: score = score + 1.0 print score/len(result) from sklearn.metrics import classification_report # getting a full report print classification_report(t_test_skl, result, labels=list(set(t_test_skl)),target_names=cls_set)