def train_classifier():
    """ Train a MaxEnt classifier and return it """
    labeled_featuresets = [(MEMM_features(word, tag, previous_tag), tag)
                           for (word, tag, previous_tag) in labeled_features]
    maxent_classifier = MaxentClassifier.train(labeled_featuresets, max_iter=50)
    return maxent_classifier
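# A minimal, self-contained sketch (toy data, made-up feature names) of the
# MaxentClassifier.train()/classify() round trip that the snippets in this file rely on.
from nltk.classify import MaxentClassifier

toy_train = [
    ({"prev_tag": "<s>", "word": "the"}, "DET"),
    ({"prev_tag": "DET", "word": "dog"}, "NOUN"),
    ({"prev_tag": "NOUN", "word": "barks"}, "VERB"),
]
toy_classifier = MaxentClassifier.train(toy_train, algorithm="iis", trace=0, max_iter=5)
print(toy_classifier.classify({"prev_tag": "DET", "word": "cat"}))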
def evaluate_features(trainFeatures, testFeatures):
    """ Train and evaluate the classifier model. """
    classifier = MaxentClassifier.train(trainFeatures)

    # initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    print 'train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testFeatures)
    print '**************************************'
    print 'pos precision:', precision(referenceSets['pos'], testSets['pos'])
    print 'pos recall:', recall(referenceSets['pos'], testSets['pos'])
    print 'pos fmeasure:', f_measure(referenceSets['pos'], testSets['pos'])
    print '**************************************'
    print 'neg precision:', precision(referenceSets['neg'], testSets['neg'])
    print 'neg recall:', recall(referenceSets['neg'], testSets['neg'])
    print 'neg fmeasure:', f_measure(referenceSets['neg'], testSets['neg'])

    classifier.show_most_informative_features(50)
def IIS(num_folds, featuresets, label_list):
    subset_size = int(len(featuresets) / num_folds)
    # overall gold labels for each instance (reference) and predicted labels (test)
    reflist = []
    testlist = []
    accuracy_list = []
    print("IIS Classifier")
    # iterate over the folds
    for i in range(num_folds):
        print('Start Fold', i)
        test_this_round = featuresets[i * subset_size:][:subset_size]
        train_this_round = featuresets[:i * subset_size] + featuresets[(i + 1) * subset_size:]
        # train using train_this_round
        classifier = MaxentClassifier.train(train_this_round, 'IIS', max_iter=1)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print(i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
        # add the gold labels and predicted labels for this round to the overall lists
        for (features, label) in test_this_round:
            reflist.append(label)
            testlist.append(classifier.classify(features))

    print('Done with cross-validation')
    # call the evaluation measures function
    print('mean accuracy-', sum(accuracy_list) / num_folds)
    (precision_list, recall_list) = eval_measures(reflist, testlist, label_list)
    print_evaluation(precision_list, recall_list, label_list)
    print(" ")
def train(self, file_train):
    """
    :param file_train:
    :return:
    """
    self.states = set([])
    # me classifier
    labeled_featuresets = []  # list of (feature_dict, label)
    iter = PreprocessUtil.file_iter(file_train)
    sent = iter.__next__()
    while sent:
        prev_state = self.zero_state
        for tokconll in sent:
            obs, _, state = tokconll.strip().split("\t")
            self.states.add(state)
            feature_dict = {"prev_state": prev_state, "obs": obs}
            labeled_featuresets.append((feature_dict, state))
            prev_state = state
        sent = iter.__next__()
    self.classifier = MaxentClassifier.train(labeled_featuresets, max_iter=self.max_iter)
def treina_classificadores():
    posdados = []
    with open('./dadostreino/train_EPTC_POA_v3nbal_1.data', 'rb') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            posdados.append(val[0])
    negdados = []
    with open('./dadostreino/train_EPTC_POA_v3nbal_0.data', 'rb') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            negdados.append(val[0])
    neudados = []
    with open('./dadostreino/train_EPTC_POA_v3nbal_2.data', 'rb') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            neudados.append(val[0])

    negfeats = [(bag_of_words(f), 'neg') for f in divide(negdados)]
    posfeats = [(bag_of_words(f), 'pos') for f in divide(posdados)]
    neufeats = [(bag_of_words(f), 'neu') for f in divide(neudados)]
    treino = negfeats + posfeats + neufeats

    # Maximum Entropy
    classificadorME = MaxentClassifier.train(treino, 'GIS', trace=0, encoding=None,
                                             labels=None, gaussian_prior_sigma=0, max_iter=1)
    # SVM
    classificadorSVM = SklearnClassifier(LinearSVC(), sparse=False)
    classificadorSVM.train(treino)
    # Naive Bayes
    classificadorNB = NaiveBayesClassifier.train(treino)
    return ([classificadorME, classificadorSVM, classificadorNB])
def searchMaxentClassifier(title, train_departments):
    """
    :param title:
    :param train_departments:
    :return:
    """
    timeTraning = time.time()
    classifier = MaxentClassifier.train(train_departments, max_iter=5)
    timeTraning = time.time() - timeTraning

    test_sent_features = word_feats(title)

    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    probability = classifier.prob_classify(test_sent_features)
    print(probability.prob(found_department))

    return [
        found_department,
        probability.prob(found_department),
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraning,
    ]
def ddi(train_inputdir, devel_inputdir, outputfile):
    # ddi was changed: previous code was moved to "feature_extractor" to isolate the
    # process from the learner and the classifier. Vectors are a list of 2 values: the
    # string with the sentence_id and the ids of the entity pair, which are needed for
    # the evaluator, and a second value with the featuresets required by MaxentClassifier.
    training_vector = feature_extractor(train_inputdir, True)
    test_vector = feature_extractor(devel_inputdir, False)

    featuresets = []
    for featureset in training_vector:
        featuresets = featuresets + featureset[1]

    classifier = MaxentClassifier.train(featuresets, algorithm="iis", max_iter=50)

    file = open(outputfile, "w")
    # The classifier is called for each featureset, in order to assign a class
    for featureset in test_vector:
        result = classifier.classify(featureset[1])
        if result == "null":
            file.write(featureset[0] + '|0|' + result + '\n')
        else:
            file.write(featureset[0] + '|1|' + result + '\n')

    evaluate(DEVEL_INPUT_DIR, outputfile)
def __maxent_train(fs):
    return MaxentClassifier.train(fs,
                                  algorithm=algorithm,
                                  gaussian_prior_sigma=gaussian_prior_sigma,
                                  count_cutoff=count_cutoff,
                                  min_lldelta=min_lldelta,
                                  trace=trace)
def ME_gender(train_set, test_set):
    print('== NLTK MaxEnt ==')
    from nltk.classify import MaxentClassifier
    nltk_classifier = MaxentClassifier.train(
        train_set, nltk.classify.MaxentClassifier.ALGORITHMS[0])
    print(nltk_classifier.prob_classify(gender_features('mark'))._prob_dict)
    print(nltk.classify.accuracy(nltk_classifier, test_set))
def _train(self, algo='iis', trace=0, max_iter=10):
    '''
    Internal method to train and return a NLTK maxent classifier.
    '''
    data = [(p.text, p.quote) for p in train_query]
    train_set = [(get_features(n), g) for (n, g) in data]
    return MaxentClassifier.train(train_set, algorithm=algo, trace=trace, max_iter=max_iter)
def main_function():
    conn_analysis = MySQLdb.connect(host=DATABASES['default']['HOST'],
                                    user=DATABASES['default']['USER'],
                                    passwd=DATABASES['default']['PASSWORD'],
                                    db=DATABASES['default']['NAME'])

    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)

    config_megam('/opt/packages')
    classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    tweets = classify.get_tweets_to_classify(conn_analysis)

    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = classifier.classify(classify.process_tweet(text))
        update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    # For the tweets where polarity was determined manually, copy from
    # majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)

    print count_table
def main():
    # grab xml trees
    train_filename = sys.argv[1]
    test_filename = sys.argv[2]
    train_tree = et.parse(train_filename)
    test_tree = et.parse(test_filename)
    train_root = train_tree.getroot()
    test_root = test_tree.getroot()

    # labeled reviews
    train_labels = [get_label(review, 'train') for review in train_root]
    test_labels = [get_label(review, 'test') for review in test_root]

    top_adjs_and_advs = get_top_adjs_n_advs(train_labels, 2)

    # randomize
    random.shuffle(train_labels)
    random.shuffle(test_labels)

    # feature sets
    train_set = [(get_features(review, 'train', top_adjs_and_advs), label)
                 for review, label in train_labels]
    test_set = [(get_features(review, 'test', top_adjs_and_advs), review[ASIN].text)
                for review in test_labels]

    # train classifier
    classifier = MaxentClassifier.train(train_set, trace=0)

    # print results
    print_results(classifier, test_set)
def train(cls):
    train_set = cls.get_final_train_set()
    classifier = maxent.train(train_set, cls.MAXENT_ALGORITHM, trace=0, max_iter=1000)
    # save classifier
    f = open(cls.CLASSIFIER_FILE, 'wb')
    pickle.dump(classifier, f)
    f.close()
def train(self, d):
    """ Given a labeled set, train our classifier. """
    t = self.__tag_data_set(d)
    self.classifier = MaxentClassifier.train(t)
    logging.info("Training on %s records complete." % len(d))
def me_classifier(exclude_list):
    me_classifier = 0
    with open(train_data, 'r', encoding='utf-8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile)
        feature_set = [(feature_set_generator(text, length, label, exclude_list), label)
                       for text, length, label in reader]
    # print(feature_set)
    me_classifier = MaxentClassifier.train(feature_set, "megam")

    accuracy = 0.0
    with open(test_data, 'r', encoding='utf-8', errors='ignore') as testcsvfile:
        test_reader = csv.reader(testcsvfile)
        test_feature_set = [(feature_set_generator(text, length, label, exclude_list), label)
                            for text, length, label in test_reader]
        accuracy = classify.accuracy(me_classifier, test_feature_set)

    classified = collections.defaultdict(set)
    observed = collections.defaultdict(set)
    i = 1
    with open(test_data, 'r', encoding='utf-8', errors='ignore') as testcsvfile:
        test_reader = csv.reader(testcsvfile)
        for text, length, label in test_reader:
            observed[label].add(i)
            classified[me_classifier.classify(
                feature_set_generator(text, length, label, exclude_list))].add(i)
            i += 1

    return accuracy, \
        precision(observed['1'], classified['1']), recall(observed['1'], classified['1']), \
        f_measure(observed['1'], classified['1']), \
        precision(observed['0'], classified['0']), recall(observed['0'], classified['0']), \
        f_measure(observed['0'], classified['0'])
def main_function():
    conn_analysis = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                                    user=DATABASES['date_cutoff']['USER'],
                                    passwd=DATABASES['date_cutoff']['PASSWORD'],
                                    db=DATABASES['date_cutoff']['NAME'])

    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = process_tweets(training_tweets)

    config_megam('/opt/packages')
    classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

    error_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    count_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    guess_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    full_matrix = {'+': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   '-': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'I': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'O': {'+': 0, '-': 0, 'I': 0, 'O': 0}}

    test_tweets = classify.get_test_tweets(conn_analysis)
    test_feature_set = process_tweets(test_tweets)

    classifier.show_most_informative_features(10)
    classifier_accuracy = accuracy(classifier, test_feature_set)
    print "classifier accuracy: " + repr(classifier_accuracy)
def train(cls, aspect):
    print cls.get_features(aspect)
    print cls.get_classifier_name(aspect)
    train_set = cls.get_features(aspect)[:int(0.7 * cls.LABELED_NUM)]
    classifier = maxent.train(train_set, 'IIS', trace=0, max_iter=1000)
    # save classifier
    f = open(cls.get_classifier_name(aspect), 'wb')
    pickle.dump(classifier, f)
    f.close()
def evaluate_classifier(featureX):
    negIds = app_reviews.fileids('neg')
    posIds = app_reviews.fileids('pos')

    negFeatures = [(featureX(app_reviews.words(fileids=[f])), 'neg') for f in negIds]
    posFeatures = [(featureX(app_reviews.words(fileids=[f])), 'pos') for f in posIds]

    # selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = negFeatures[:negCutoff] + posFeatures[:posCutoff]
    testFeatures = negFeatures[negCutoff:] + posFeatures[posCutoff:]

    # trains a Naive Bayes Classifier
    NBclassifier = NaiveBayesClassifier.train(trainFeatures)
    # trains a Maximum Entropy or Logistic Regression Classifier
    MEclassifier = MaxentClassifier.train(trainFeatures, algorithm='gis', trace=0,
                                          max_iter=10, min_lldelta=0.5)
    # trains a DecisionTree Classifier
    DTclassifier = DecisionTreeClassifier.train(trainFeatures, binary=True, entropy_cutoff=0.5,
                                                depth_cutoff=70, support_cutoff=10)
    # Combining Classifiers with Voting
    classifier = MaxVoteClassifier(NBclassifier, MEclassifier, DTclassifier)

    # initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    # puts correctly labeled sentences in referenceSets and the predictively labeled version in testSets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        observed = classifier.classify(features)
        testSets[observed].add(i)

    # prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print 'Accuracy:', nltk.classify.util.accuracy(classifier, testFeatures)
    print 'pos Precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
    print 'pos Recall:', nltk.metrics.recall(referenceSets['pos'], testSets['pos'])
    print 'neg Precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
    print 'neg Recall:', nltk.metrics.recall(referenceSets['neg'], testSets['neg'])
def classify_maxent(X_train, Y_train, X_test):
    training_input = X_train
    training_output = Y_train
    training_data = []
    for i in range(len(training_input)):
        training_data.append((training_input[i], training_output[i]))
    clf = MaxentClassifier.train(training_data)
    pred_labels = clf.classify_many(X_test)
    return pred_labels
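# Hypothetical usage of classify_maxent() above: the feature dicts and labels here are
# made-up toy values, only to illustrate the expected input and output shapes.
toy_X_train = [{"contains(good)": True}, {"contains(awful)": True}]
toy_Y_train = ["pos", "neg"]
toy_X_test = [{"contains(good)": True}]
print(classify_maxent(toy_X_train, toy_Y_train, toy_X_test))  # e.g. ['pos']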
def maxenscore(trainset, testset):
    me_classifier = MaxentClassifier.train(trainset, algorithm='iis', trace=0,
                                           max_iter=1, min_lldelta=0.5)
    # (test, tag_test) = zip(*testset)
    # pred = me_classifier.classify(test)
    return nltk.classify.accuracy(me_classifier, testset)
def main():
    negfeats = []
    posfeats = []
    for i, f in enumerate(reviews[0]):
        print(f)
        if reviews[1][i] == 0:
            negfeats.append((word_feats(f.split()), "neg"))
        else:
            posfeats.append((word_feats(f.split()), "pos"))

    testNegfeats = []
    testPosfeats = []
    for i, f in enumerate(test[0]):
        if test[1][i] == 0:
            testNegfeats.append((word_feats(f.split()), "neg"))
        else:
            testPosfeats.append((word_feats(f.split()), "pos"))

    trainfeats = negfeats + posfeats
    testfeats = testNegfeats + testPosfeats
    print('train on %d instances, test on %d instances - Maximum Entropy' %
          (len(trainfeats), len(testfeats)))

    classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0, encoding=None,
                                        labels=None, gaussian_prior_sigma=0, max_iter=1)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    pos_precision = precision(refsets['pos'], testsets['pos'])
    pos_recall = recall(refsets['pos'], testsets['pos'])
    pos_fmeasure = f_measure(refsets['pos'], testsets['pos'])
    neg_precision = precision(refsets['neg'], testsets['neg'])
    neg_recall = recall(refsets['neg'], testsets['neg'])
    neg_fmeasure = f_measure(refsets['neg'], testsets['neg'])

    print(pos_recall)
    print(neg_recall)
    print()
    print('')
    print('---------------------------------------')
    print('           Maximum Entropy             ')
    print('---------------------------------------')
    print('accuracy:', accuracy)
    print('precision', (pos_precision + neg_precision) / 2)
    print('recall', (pos_recall + neg_recall) / 2)
    print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)
def maxent_train(self):
    self.classifier_all = MaxentClassifier.train(self.maxent_memes_all, trace=100, max_iter=5)
    # classifier_bottom = MaxentClassifier.train(maxent_memes_bottom, trace=100, max_iter=250)
    # classifier_all = MaxentClassifier.train(maxent_memes_all, trace=100, max_iter=250)
    weights = self.classifier_all.weights()
    f = open("lambdas.txt", "w")
    for weight in weights:
        f.write("weight = %f" % weight)
        f.write("\n")
def main_function():
    conn = MySQLdb.connect(host=DATABASES['default']['HOST'],
                           user=DATABASES['default']['USER'],
                           passwd=DATABASES['default']['PASSWORD'],
                           db=DATABASES['default']['NAME'])

    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)

    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    test_tweets = classify.get_test_tweets(conn)

    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn)
        count_table[guess] += 1

    print "Naive Bayes"
    print count_table

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_tweet_polarity_ensemble(tweet[0], guess, conn)
        count_table[guess] += 1

    print "Maximum Entropy"
    print count_table

    # generate the accuracy matrix
    full_matrix = {'+': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   '-': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'I': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'O': {'+': 0, '-': 0, 'I': 0, 'O': 0}}

    for tweet in test_tweets:
        result = classify.run_sql(conn, classify.Statements.CHECK_CONSENSUS % tweet[0])
        guess = result[0][0]

        actual_result = classify.run_sql(conn, classify.Statements.CHECK_MAJORITY % tweet[0])
        actual = actual_result[0][0]

        if guess is not None:
            if actual is not None:
                full_matrix[actual][guess] += 1

    print full_matrix
def maxent_train(feature_list):
    labeled_features = []
    for (word, tag, shape, label, prev_word, prev_tag, prev_shape, previous_label) in feature_list:
        labeled_features.append(
            (feature_template(word, tag, shape, prev_word, prev_tag, prev_shape, previous_label),
             label))
    f = open("../DataFiles/ner_labeler.pickle", "wb")
    maxent_classifier = MaxentClassifier.train(labeled_features, max_iter=40)
    pickle.dump(maxent_classifier, f)
    f.close()
def train_maxent_classifier(labelled_features):
    train_set = []
    for lf in labelled_features:
        train_set.append((Generate_MEMM_features(lf), lf[0]))
    print("\nTraining Maxent Classifier on train.txt.")
    maxent_classifier = MaxentClassifier.train(train_set, max_iter=15)
    return maxent_classifier
def train_classifier(train_data, labels):
    data = []
    for i in range(len(train_data)):
        data.append((train_data[i], labels[i]))
    print('starting')
    classifier = MaxentClassifier.train(data, algorithm='GIS', trace=0, max_iter=6)
    print('done')
    return classifier
def maxentClassifier(features_train, features_test):
    print 'train on %d instances, test on %d instances' % (len(features_train), len(features_test))
    classifier = MaxentClassifier.train(features_train, algorithm='gis')
    print 'accuracy:', nltk.classify.util.accuracy(classifier, features_test)
    precisions, recalls = precision_recall(classifier, features_test)
    print "precision: ", precisions, "recall: ", recalls


# def sklearnMultinomialNB(features_train, features_test):
#     print 'train on %d instances, test on %d instances' % (len(features_train), len(features_test))
#     classifier = SklearnClassifier(MultinomialNB())
#     classifier.train
#     print 'accuracy:', nltk.classify.util.accuracy(classifier, features_test)
def run(training):
    """
    To create and train a MaxentClassifier
    :return: a trained Classifier
    """
    print "Training ME Classifier..."
    # feats = label_feat_from_corps(movie_reviews)
    # training, testing = split_label_feats(feats)
    me_classifier = MaxentClassifier.train(training, algorithm='GIS', trace=0,
                                           max_iter=10, min_lldelta=0.5)
    print "ME Classifier trained..."
    return save_classifier(me_classifier)
def test_maxent(algorithm):
    print('%11s' % algorithm)
    try:
        classifier = MaxentClassifier.train(train, algorithm, trace=0, max_iter=1000)
    except Exception as e:
        print('Error: %r' % e)
        return
    for featureset in test:
        pdist = classifier.prob_classify(featureset)
        print('%8.15f %6.15f' % (pdist.prob('x'), pdist.prob('y')))
    print()
def trainMaxent(featuresets):
    # idx = 2*len(featuresets) / ratio
    # train_set, test_set = featuresets[idx:], featuresets[:idx]
    train_set = featuresets
    algo = MaxentClassifier.ALGORITHMS[1]
    # max_iter=20
    classifier = MaxentClassifier.train(train_set, algo, max_iter=3)
    # print accuracy(classifier, test_set)
    classifier.show_most_informative_features(100)
    # train_set, test_set = featuresets[idx:], featuresets[:idx]
    # classifier.train(train_set, algo, max_iter=20)
    # print accuracy(classifier, test_set)
    # classifier.show_most_informative_features(100)
    return classifier
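# Assumption about the installed NLTK version: MaxentClassifier.ALGORITHMS begins with
# ['GIS', 'IIS', ...], so ALGORITHMS[1] above selects IIS. This can be checked directly:
from nltk.classify import MaxentClassifier
print(MaxentClassifier.ALGORITHMS)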
def train(self):
    self.clf_one_step = MaxentClassifier.train(self.train_one_step, 'megam',
                                               trace=0, max_iter=1000)
    self.clfs_two_step = {
        2: MaxentClassifier.train(self.train_two_step[2], 'megam', trace=0, max_iter=1000),
        3: MaxentClassifier.train(self.train_two_step[3], 'megam', trace=0, max_iter=1000),
        4: MaxentClassifier.train(self.train_two_step[4], 'megam', trace=0, max_iter=1000),
    }
def train(
    cls,
    docs: Collection[Document],
    stopwords: Optional[Collection[Word]] = None,
    algorithm: str = 'iis',
    cutoff: int = 4,
    sigma: float = 0.,
    trim_length: int = 10,
) -> 'MaxentSummarizer':
    """Train the model on a collection of documents.

    Args:
        docs (Collection[Document]): The collection of documents to train on.
        stopwords (Collection[Word]): Collection of stopwords.
        algorithm (str): Optimization algorithm for training. Possible values are
            'iis', 'gis', or 'megam' (requires `megam`_ to be installed).
        cutoff (int): Features that occur fewer than this value in the training data
            will be discarded.
        sigma (float): Standard deviation for the Gaussian prior. Default is no prior.
        trim_length (int): Trim words to this length.

    Returns:
        MaxentSummarizer: The trained model.

    .. _megam: https://www.umiacs.umd.edu/~hal/megam/
    """
    if stopwords is None:
        stopwords = set()

    word_pairs = {
        pair
        for doc in docs
        for sent in doc.sentences
        for pair in cls._get_word_pairs(sent, stopwords, trim_len=trim_length)
    }

    train_data: list = []
    for doc in docs:
        featuresets = cls._extract_featuresets(doc, stopwords, word_pairs, trim_length)
        labels = [sent.label for sent in doc.sentences]
        train_data.extend(zip(featuresets, labels))

    encoding = BinaryMaxentFeatureEncoding.train(train_data, count_cutoff=cutoff,
                                                 alwayson_features=True)
    classifier = MaxentClassifier.train(train_data, algorithm=algorithm,
                                        encoding=encoding, gaussian_prior_sigma=sigma)
    return cls(classifier, stopwords=stopwords, word_pairs=word_pairs)
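# A minimal sketch (toy feature dicts, not the summarizer's real features) of the
# explicit-encoding pattern used above: build a BinaryMaxentFeatureEncoding with a count
# cutoff first, then pass it to MaxentClassifier.train().
from nltk.classify import MaxentClassifier
from nltk.classify.maxent import BinaryMaxentFeatureEncoding

toy = [({"w": "good", "len": "short"}, "keep"),
       ({"w": "bad", "len": "long"}, "drop"),
       ({"w": "good", "len": "long"}, "keep")]
enc = BinaryMaxentFeatureEncoding.train(toy, count_cutoff=0, alwayson_features=True)
clf = MaxentClassifier.train(toy, algorithm="iis", encoding=enc, trace=0, max_iter=10)
print(clf.classify({"w": "good", "len": "long"}))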
def trainMaxent10(featuresets):
    # idx = 2*len(featuresets) / ratio
    # train_set, test_set = featuresets[idx:], featuresets[:idx]
    train_set = featuresets
    algo = MaxentClassifier.ALGORITHMS[1]
    # max_iter=20
    classifier = MaxentClassifier.train(train_set, algo, max_iter=10)
    # print accuracy(classifier, test_set)
    classifier.show_most_informative_features(100)
    # train_set, test_set = featuresets[idx:], featuresets[:idx]
    # classifier.train(train_set, algo, max_iter=20)
    # print accuracy(classifier, test_set)
    # classifier.show_most_informative_features(100)
    return classifier
def __init__(self, ftrain, fdev, ftest):
    self.train = json.load(open(ftrain))
    self.dev = json.load(open(fdev))
    # self.test = json.load(open(ftest))

    step1_train_features, step2_train_features = self.extract_features(self.train)
    step1_dev_features, step2_dev_features = self.extract_features(self.dev)
    # step1_test_features, step2_test_features = self.extract_features(self.test)

    p.dump(step1_train_features, open('data/step1_train_features.cPickle', 'w'))
    p.dump(step2_train_features, open('data/step2_train_features.cPickle', 'w'))
    p.dump(step1_dev_features, open('data/step1_dev_features.cPickle', 'w'))
    p.dump(step2_dev_features, open('data/step2_dev_features.cPickle', 'w'))

    clf_step1 = MaxentClassifier.train(step1_train_features, 'megam', trace=0, max_iter=1000)
    # clf = nltk.NaiveBayesClassifier.train(trainset)
    p.dump(clf_step1, open('data/clf_step1.cPickle', 'w'))
    print 'Accuracy: ', accuracy(clf_step1, step1_dev_features)

    clf_step2 = MaxentClassifier.train(step2_train_features, 'megam', trace=0, max_iter=1000)
    # clf = nltk.NaiveBayesClassifier.train(trainset)
    p.dump(clf_step2, open('data/clf_step2.cPickle', 'w'))
    print 'Accuracy: ', accuracy(clf_step2, step2_dev_features)
def __init__(self):
    try:
        classifier = None
        if not os.path.exists(classifier_path):
            '''with open('nltk_sentiment_data/polarity_pos.txt', 'rb') as fp:
                pos_lines = fp.readlines()
            pos_feats = [(word_feats(tokenizer.tokenize(p_line)), '1') for p_line in pos_lines]
            with open('nltk_sentiment_data/polarity_neg.txt', 'rb') as fn:
                neg_lines = fn.readlines()
            neg_feats = [(word_feats(tokenizer.tokenize(n_line)), '0') for n_line in neg_lines]'''
            filename = os.path.dirname(os.path.abspath(__file__)) + \
                "/nltk_sentiment_data/sentiment_data_twitter.txt"
            with open(filename, 'rb') as fp:
                lines = fp.readlines()
            feats = [(word_feats(tokenizer.tokenize(line.split(' -> ')[1].strip().lower())),
                      line.split(' -> ')[0])
                     for line in lines if len(line.split(' -> ')) >= 2]
            print "Total : %s" % (len(feats),)
            cutoff = int(len(feats) * 0.1)
            trainfeats, testfeats = feats[cutoff:], feats[:cutoff]
            '''cutoff = int(len(pos_feats) * 0.1)
            trainfeats = pos_feats[cutoff:] + neg_feats[cutoff:]
            testfeats = pos_feats[:cutoff] + neg_feats[:cutoff]'''
            print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
            # classifier = NaiveBayesClassifier.train(trainfeats)
            classifier = MaxentClassifier.train(trainfeats, algorithm='iis', trace=0, max_iter=10)
            print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
            classifier.show_most_informative_features()
            with open(classifier_path, "w") as fh:
                cPickle.dump(classifier, fh, 1)
        else:
            with open(classifier_path, "r") as fh:
                classifier = cPickle.load(fh)
        self.classifier = classifier
        logger.info("Initialized SentimentClassifier instance..")
    except Exception, e:
        logger.exception(e)
        raise e
def train(cls, training_sequence, **kwargs):
    feature_detector = kwargs.get('feature_detector')
    gaussian_prior_sigma = kwargs.get('gaussian_prior_sigma', 10)
    count_cutoff = kwargs.get('count_cutoff', 1)
    stopping_condition = kwargs.get('stopping_condition', 1e-7)

    def __featurize(tagged_token):
        tag = tagged_token[-1]
        feats = feature_detector(tagged_token)
        return (feats, tag)

    labeled_featuresets = LazyMap(__featurize, training_sequence)
    classifier = MaxentClassifier.train(labeled_featuresets,
                                        algorithm='megam',
                                        gaussian_prior_sigma=gaussian_prior_sigma,
                                        count_cutoff=count_cutoff,
                                        min_lldelta=stopping_condition)
    return cls(classifier._encoding, classifier.weights())
def trainCorpus():
    if os.path.exists(classifier_fname):
        return LoadClassifier()
    else:
        c = getDealsCorpus()
        hiwords = corpus_high_info_words(c)
        featdet = lambda words: bag_of_words_in_set(words, hiwords)
        train_feats, test_feats = corpus_train_test_feats(c, featdet)
        trainf = lambda train_feats: MaxentClassifier.train(train_feats, algorithm='megam',
                                                            trace=0, max_iter=10)
        labelset = set(c.categories())
        classifiers = train_binary_classifiers(trainf, train_feats, labelset)
        multi_classifier = MultiBinaryClassifier(*classifiers.items())
        multi_p, multi_r, avg_md = multi_metrics(multi_classifier, test_feats)
        print multi_p['activitiesevents'], multi_r['activitiesevents'], avg_md
        SaveClassifier(multi_classifier)
        return multi_classifier
def train(self, featureset=None):
    """
    Trains the maximum entropy classifier and returns it. If a featureset is
    specified it trains on that, otherwise it trains on the model's featureset.
    Pass in a featureset during cross validation.

    Returns the classifier and the training time.
    """
    featureset = featureset or self.featureset()

    # Time how long it takes to train
    start = time.time()
    classifier = MaxentClassifier.train(featureset, algorithm='megam',
                                        trace=1, gaussian_prior_sigma=1)
    delta = time.time() - start

    return classifier, delta
def parse():
    tagger_classes = ([nltk.UnigramTagger, nltk.BigramTagger])
    trained_sents, tagged_sents = trainer("WSJ_02-21.pos-chunk", "WSJ_23.pos")
    # tagger = nltk.UnigramTagger(trained_sents)
    print len(trained_sents)
    tagger = ClassifierBasedPOSTagger(
        train=trained_sents[:10000],
        classifier_builder=lambda train_feats: MaxentClassifier.train(train_feats,
                                                                      trace=0, max_iter=10))
    f = open("WSJ_23.chunk", 'w')
    # print sents
    for sents in tagged_sents:
        (words, tags) = sents[0], sents[1]
        chunks = tagger.tag(tags)
        # print words, chunks
        wtc = zip(words, chunks)
        for tup in wtc:
            f.write("%s\t%s\n" % (tup[0], tup[1][1]))
        f.write("\n")
def main_function():
    conn = MySQLdb.connect(
        host=DATABASES["date_cutoff"]["HOST"],
        user=DATABASES["date_cutoff"]["USER"],
        passwd=DATABASES["date_cutoff"]["PASSWORD"],
        db=DATABASES["date_cutoff"]["NAME"],
    )
    training_tweets = get_test_tweets(conn)
    # training_feature_set = process_tweets(training_tweets)

    total_word_count = total_words(conn)

    training_feature_set = process_bigrams(conn, "+", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "-", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "I", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "O", total_word_count, best_words)

    print "configuring megam"
    config_megam("/opt/packages")

    print "starting training"
    classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    print "starting end training"

    classifier.show_most_informative_features(40)

    test_tweets = get_training_tweets(conn)
    test_feature_set = process_tweets(test_tweets)
    classifier_accuracy = accuracy(classifier, test_feature_set)

    # full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0},
    #                '-':{'+':0, '-':0, 'I':0, 'O':0},
    #                'I':{'+':0, '-':0, 'I':0, 'O':0},
    #                'O':{'+':0, '-':0, 'I':0, 'O':0}}
    # for f in test_tweets:
    #     guess = classifier.classify(process_tweet(f[1]))
    #     full_matrix[f[2]][guess] += 1
    # print full_matrix

    print "classifier accuracy: " + repr(classifier_accuracy)
def _train_mode_for_user(self, userid):
    if userid in self._user_classifier:
        print("Already exist!!!")
        self._user_classifier[userid] = None

    # Only positive and negative reviews are needed; neutral reviews are not used as a
    # reference, so an INNER JOIN is sufficient
    sql = """
    SELECT site_news.news_uuid, user_score.news_user_score as news_score
    FROM site_news INNER JOIN user_score
    ON site_news.news_uuid = user_score.newsid
    WHERE DATE(site_news.time) < CURRENT_DATE()
    AND DATE(site_news.time) > DATE_SUB(CURRENT_DATE(),INTERVAL 5 DAY)
    AND user_score.news_user_score != 1 AND user_score.userid=%d;
    """ % (userid)
    train_items = self.db_conn.query(sql)

    print("Building POS/NEG features")
    pos_feature = []
    neg_feature = []
    for item in train_items:
        news_vector = nlp_master.get_old_vect(item['news_uuid'])
        if item['news_score'] == 0:    # positive review
            pos_feature.append((self.best_word_features(news_vector, news_vector), 'pos'))
        elif item['news_score'] == 2:  # negative review
            neg_feature.append((self.best_word_features(news_vector, news_vector), 'neg'))

    print("POS:%d, NEG:%d" % (len(pos_feature), len(neg_feature)))
    if len(pos_feature) <= 3 or len(neg_feature) <= 3:
        print("Too few features, giving up...")
        self._user_classifier[userid] = None
        return

    trainSet = pos_feature + neg_feature
    self._user_classifier[userid] = MaxentClassifier.train(trainSet, max_iter=50)
    print("MaxEnt Classifier for %d build done!" % (userid))

    # Save the updated results
    today = datetime.date.today()
    self.dumpfile = "dumpdir/recmaxent_dump.%d_%d" % (today.month, today.day)
    with open(self.dumpfile, 'wb', -1) as fp:
        dump_data = []
        dump_data.append(self._user_classifier)
        pickle.dump(dump_data, fp, -1)
    return
def evaluate_classifier(featx, collocationFunc):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f]), collocationFunc), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f]), collocationFunc), 'pos') for f in posids]
    lenNegFeats = min(len(negfeats), 400)
    lenPosFeats = min(len(posfeats), 400)
    # lenNegFeats = len(negfeats)
    # lenPosFeats = len(posfeats)
    negcutoff = int(lenNegFeats * 3 / 4)
    poscutoff = int(lenPosFeats * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:lenNegFeats] + posfeats[poscutoff:lenPosFeats]

    classifier = MaxentClassifier.train(trainfeats, algorithm='IIS', max_iter=3)
    print(classifier)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    print(classifier)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    evaluationMetrics = {}
    classifier.show_most_informative_features()
    evaluationMetrics['model'] = classifier
    evaluationMetrics['trainingData'] = trainfeats
    evaluationMetrics['accuracy'] = nltk.classify.util.accuracy(classifier, testfeats)
    evaluationMetrics['posPrec'] = nltk.precision(refsets['pos'], testsets['pos'])
    evaluationMetrics['posRecall'] = nltk.recall(refsets['pos'], testsets['pos'])
    evaluationMetrics['posF_Score'] = nltk.f_measure(refsets['pos'], testsets['pos'])
    evaluationMetrics['negPrec'] = nltk.precision(refsets['neg'], testsets['neg'])
    evaluationMetrics['negRecall'] = nltk.recall(refsets['neg'], testsets['neg'])
    evaluationMetrics['negF_Score'] = nltk.f_measure(refsets['neg'], testsets['neg'])
    return evaluationMetrics
def main_function():
    conn_analysis = MySQLdb.connect(host="localhost", user="******",
                                    passwd="tanzania", db="twitter_heart")

    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)

    tweets = classify.get_tweets_to_classify(conn_analysis)

    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    print "Naive Bayes"
    print count_table

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_max_ent_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    # For the tweets where polarity was determined manually, copy from
    # majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)

    print "Maximum Entropy"
    print count_table
def getClassifier(tweetfile):
    print "Loading content & preparing text"
    content = prepText(loadFile(tweetfile))
    print "Categorizing contents"
    categorized = prepClassifications(content)
    print "Deriving NGrams"
    NGrammized = collectNGrams(categorized, degreesToUse)
    print "Compiling Results"
    readyToSend = []
    for category in NGrammized.keys():
        readyToSend += NGrammized[category]

    print "Attempting Classification"
    if classMode == 'naive bayes':
        from nltk.classify import NaiveBayesClassifier
        classifier = NaiveBayesClassifier.train(readyToSend)
    elif classMode == 'max ent':
        from nltk.classify import MaxentClassifier
        classifier = MaxentClassifier.train(readyToSend)

    print classifier.show_most_informative_features(n=200)
    classifier.show_most_informative_features()
    return classifier
    pos = [simplify_wsj_tag(tag) for word, tag in pos]
    words = [w.lower() for w in words]
    trigrams = nltk.trigrams(words)
    trigrams = ['%s/%s/%s' % (i[0], i[1], i[2]) for i in trigrams]
    features = words + pos + trigrams
    features = dict((f, True) for f in features)
    return features


def train():
    for k, v in training.items():
        for sentence in v:
            yield (prepare_input(sentence), k)


if __name__ == '__main__':
    import sys

    training_set = []
    for features in train():
        training_set.append(features)

    classifier = MaxentClassifier.train(training_set)

    layer_type, slug = sys.argv[1:3]
    r = requests.get('http://randomtaco.me/%s/%s/' % (layer_type, slug))
    recipe = r.json()['recipe']
    recipe_sentences = nltk.sent_tokenize(recipe)
    for sent in recipe_sentences:
        print classifier.classify(prepare_input(sent))
        print '*' * 80
puc = '-'.decode("utf-8") #some char is outof ASCII print (word) features['capitalization'] = word[0].isupper() features['start_of_sentence'] = word in wordStartList features['cap_start'] = word not in wordStartList and word[0].isupper() features['previous_NC'] = previous_BOI return features #*******************************************************************train the model labeled_featuresets = [(MEMM_features(word, tag, previous_BOI), boi )for (word, tag, boi, previous_BOI) in labeled_features] train_set = labeled_featuresets f = open("my_classifier.pickle", "wb") maxent_classifier = MaxentClassifier.train(train_set, max_iter=30) pickle.dump(maxent_classifier , f) f.close() #********************************************************************Viterbi def MEMM(wordList,tagList): BOI_list = ['B-NP', 'I-NP', 'O'] w1 = wordList[0] #the first word of the sentence t1 = tagList[0] tRange = len(BOI_list) wRange = len(wordList) viterbi = [[0 for x in range(300)] for x in range(300)] backpointer = [['' for x in range(300)] for x in range(300)] #intialization for t in range(tRange):#t = 0,1,2
def _classifier_builder(self, train):
    return MaxentClassifier.train(train, algorithm='megam',
                                  gaussian_prior_sigma=1, trace=2)
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = len(negfeats) * 3 / 4
    poscutoff = len(posfeats) * 3 / 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    # testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    print 'Reading Tweets\n'
    tweets_data_path = '20161019_202620.txt'
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except:
            continue

    tweets = pd.DataFrame()
    tweets['text'] = [tweet.get('text', '') for tweet in tweets_data]
    tdata = tweets['text']
    negfeats = [(featx(f), 'neg') for f in word_split(tdata)]
    testfeats = negfeats
    print np.shape(testfeats)
    # testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    # print np.shape(testfeats)

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0, encoding=None,
                                                labels=None, gaussian_prior_sigma=0, max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
            print testsets[observed]

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        # pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
        # pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
        # pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
        # neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
        # neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
        # neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])

        print ''
        print '---------------------------------------'
        print 'SINGLE FOLD RESULT ' + '(' + classifierName + ')'
        print '---------------------------------------'
        print 'accuracy:', accuracy
def evaluate_classifier(featx, number_of_features, remove_stopwords):
    print "Adding features..."

    # create labeled bags of words (dictionary)
    neg_reviews = reviews[1] + reviews[2]
    pos_reviews = reviews[4] + reviews[5]
    random.shuffle(neg_reviews)
    random.shuffle(pos_reviews)

    neg_feats = [(featx(f, number_of_features), 'neg') for f in neg_reviews]
    pos_feats = [(featx(f, number_of_features), 'pos') for f in pos_reviews]

    neg_cutoff = len(neg_feats) * 3 / 4
    pos_cutoff = len(pos_feats) * 3 / 4

    trainfeats = neg_feats[:neg_cutoff] + pos_feats[:pos_cutoff]
    testfeats = neg_feats[neg_cutoff:] + pos_feats[pos_cutoff:]

    neg_sent_reviews = reviews_sents[1] + reviews_sents[2]
    pos_sent_reviews = reviews_sents[4] + reviews_sents[5]
    random.shuffle(neg_sent_reviews)
    random.shuffle(pos_sent_reviews)

    neg_sent_feats = [(sentences, 'neg') for sentences in neg_sent_reviews]
    pos_sent_feats = [(sentences, 'pos') for sentences in pos_sent_reviews]

    test_sent_feats = neg_sent_feats[neg_cutoff:] + pos_sent_feats[pos_cutoff:]

    classifierName = "Maximum Entropy (Features: Words"
    if remove_stopwords:
        classifierName += ", Removed Stopwords"
    if number_of_features > 1:
        classifierName += ", Stemmed Words"
    if number_of_features > 2:
        classifierName += ", Lemmatized Words"
    if number_of_features > 3:
        classifierName += ", Bigrams"
    classifierName += ")"

    print "Training..."
    classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0, encoding=None, labels=None,
                                        sparse=True, gaussian_prior_sigma=0, max_iter=1)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    sent_refsets = collections.defaultdict(set)
    sent_testsets = collections.defaultdict(set)

    print "Testing..."
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print "Testing on Sentences..."
    accurate_count = 0
    for i, (sentences, label) in enumerate(test_sent_feats):
        sent_refsets[label].add(i)
        pos_prob_total = 0
        neg_prob_total = 0
        sentence_count = len(sentences)
        for sentence in sentences:
            pdist = classifier.prob_classify(featx(sentence, 4))
            pos_prob_total += pdist.prob('pos')
            neg_prob_total += pdist.prob('neg')
        if ((pos_prob_total / sentence_count) > (neg_prob_total / sentence_count)):
            results = 'pos'
        else:
            results = 'neg'
        if (results == label):
            accurate_count += 1
        sent_testsets[results].add(i)

    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    sent_accuracy = float(accurate_count) / len(test_sent_feats)

    neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
    neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
    neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
    pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
    pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
    pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])

    sent_neg_precision = nltk.metrics.precision(sent_refsets['neg'], sent_testsets['neg'])
    sent_neg_recall = nltk.metrics.recall(sent_refsets['neg'], sent_testsets['neg'])
    sent_neg_fmeasure = nltk.metrics.f_measure(sent_refsets['neg'], sent_testsets['neg'])
    sent_pos_precision = nltk.metrics.precision(sent_refsets['pos'], sent_testsets['pos'])
    sent_pos_recall = nltk.metrics.recall(sent_refsets['pos'], sent_testsets['pos'])
    sent_pos_fmeasure = nltk.metrics.f_measure(sent_refsets['pos'], sent_testsets['pos'])

    print ''
    print '---------------------------------------'
    print 'SINGLE FOLD RESULT ' + '(' + classifierName + ')'
    print '---------------------------------------'
    print 'accuracy:', accuracy
    print 'precision', (pos_precision + neg_precision) / 2
    print 'recall', (pos_recall + neg_recall) / 2
    print 'f-measure', (pos_fmeasure + neg_fmeasure) / 2
    print ''
    print ''
    print '---------------------------------------'
    print 'SENTENCES: SINGLE FOLD RESULT ' + '(' + classifierName + ')'
    print '---------------------------------------'
    print 'accuracy:', sent_accuracy
    print 'precision', (sent_pos_precision + sent_neg_precision) / 2
    print 'recall', (sent_pos_recall + sent_neg_recall) / 2
    print 'f-measure', (sent_pos_fmeasure + sent_neg_fmeasure) / 2
    print ''

    # CROSS VALIDATION
    trainfeats = neg_feats + pos_feats
    test_sent_feats = neg_sent_feats + pos_sent_feats

    # SHUFFLE TRAIN SET
    # As in cross validation, the test chunk might have only negative or only positive data
    random.shuffle(trainfeats)
    random.shuffle(test_sent_feats)
    n = 5  # 5-fold cross-validation

    subset_size = len(trainfeats) / n
    accuracy = []
    sent_accuracy = []
    neg_precision = []
    neg_recall = []
    pos_precision = []
    pos_recall = []
    neg_fmeasure = []
    pos_fmeasure = []
    sent_neg_precision = []
    sent_neg_recall = []
    sent_pos_precision = []
    sent_pos_recall = []
    sent_neg_fmeasure = []
    sent_pos_fmeasure = []
    cv_count = 1

    print 'Starting 5-fold cross validation...'
    for i in range(n):
        print "Fold " + str(i) + ":"
        testing_this_round = trainfeats[i * subset_size:][:subset_size]
        sent_testing_this_round = test_sent_feats[i * subset_size:][:subset_size]
        training_this_round = trainfeats[:i * subset_size] + trainfeats[(i + 1) * subset_size:]

        classifier = MaxentClassifier.train(training_this_round, 'GIS', trace=0, encoding=None,
                                            labels=None, sparse=True, gaussian_prior_sigma=0,
                                            max_iter=1)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        sent_refsets = collections.defaultdict(set)
        sent_testsets = collections.defaultdict(set)

        print "Testing..."
        for i, (feats, label) in enumerate(testing_this_round):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        print "Testing on Sentences..."
        accuracy_count = 0
        for i, (sentences, label) in enumerate(sent_testing_this_round):
            sent_refsets[label].add(i)
            pos_prob_total = 0
            neg_prob_total = 0
            sentence_count = len(sentences)
            for sentence in sentences:
                pdist = classifier.prob_classify(featx(sentence, 4))
                pos_prob_total += pdist.prob('pos')
                neg_prob_total += pdist.prob('neg')
            if ((pos_prob_total / sentence_count) > (neg_prob_total / sentence_count)):
                results = 'pos'
            else:
                results = 'neg'
            if (results == label):
                accuracy_count += 1
            sent_testsets[results].add(i)

        cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
        sent_cv_accuracy = float(accuracy_count) / len(sent_testing_this_round)

        cv_neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
        cv_neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
        cv_neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
        cv_pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
        cv_pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
        cv_pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])

        sent_cv_neg_precision = nltk.metrics.precision(sent_refsets['neg'], sent_testsets['neg'])
        sent_cv_neg_recall = nltk.metrics.recall(sent_refsets['neg'], sent_testsets['neg'])
        sent_cv_neg_fmeasure = nltk.metrics.f_measure(sent_refsets['neg'], sent_testsets['neg'])
        sent_cv_pos_precision = nltk.metrics.precision(sent_refsets['pos'], sent_testsets['pos'])
        sent_cv_pos_recall = nltk.metrics.recall(sent_refsets['pos'], sent_testsets['pos'])
        sent_cv_pos_fmeasure = nltk.metrics.f_measure(sent_refsets['pos'], sent_testsets['pos'])

        accuracy.append(cv_accuracy)
        sent_accuracy.append(sent_cv_accuracy)
        neg_precision.append(cv_neg_precision)
        neg_recall.append(cv_neg_recall)
        neg_fmeasure.append(cv_neg_fmeasure)
        pos_precision.append(cv_pos_precision)
        pos_recall.append(cv_pos_recall)
        pos_fmeasure.append(cv_pos_fmeasure)
        sent_neg_precision.append(sent_cv_neg_precision)
        sent_neg_recall.append(sent_cv_neg_recall)
        sent_neg_fmeasure.append(sent_cv_neg_fmeasure)
        sent_pos_precision.append(sent_cv_pos_precision)
        sent_pos_recall.append(sent_cv_pos_recall)
        sent_pos_fmeasure.append(sent_cv_pos_fmeasure)

        cv_count += 1

    print '---------------------------------------'
    print 'N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')'
    print '---------------------------------------'
    print 'accuracy:', sum(accuracy) / n
    print 'precision', (sum(neg_precision) / n + sum(pos_precision) / n) / 2
    print 'recall', (sum(neg_recall) / n + sum(pos_recall) / n) / 2
    print 'f-measure', (sum(neg_fmeasure) / n + sum(pos_fmeasure) / n) / 2
    print ''
    print '---------------------------------------'
    print 'SENTENCES: N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')'
    print '---------------------------------------'
    print 'accuracy:', sum(sent_accuracy) / n
    print 'precision', (sum(sent_neg_precision) / n + sum(sent_pos_precision) / n) / 2
    print 'recall', (sum(sent_neg_recall) / n + sum(sent_pos_recall) / n) / 2
    print 'f-measure', (sum(sent_neg_fmeasure) / n + sum(sent_pos_fmeasure) / n) / 2
    print ''
def uni_and_bi_validation(lines):
    """
    + plots the classification F1-measure using bigrams and unigrams
    + prints a table containing the max accuracy and F1-measure obtained and the number
      of features at which they were reached
    :param lines: list of tweets
    :return:
    """
    accuracy_list_nb = []
    f_measure_list_nb = []
    accuracy_list_svm = []
    f_measure_list_svm = []
    accuracy_list_maxent = []
    f_measure_list_maxent = []
    random.shuffle(lines)
    hashtag_list = PatternsFeatures().get_most_frequent_pattern(
        PatternsFeatures().pattern_classifier(lines, '#'))
    name_list = PatternsFeatures().get_most_frequent_pattern(
        PatternsFeatures().pattern_classifier(lines, '@'))
    train_set_rate = int(len(lines) * 0.75)
    train_set, test_set = lines[:train_set_rate], lines[train_set_rate:]
    all_tweets = " ".join([" ".join(line[1]) for line in train_set])
    ftr2 = FeatureExtraction(20)
    ftr2.most_frequent_bigrams(all_tweets)
    bigram_featuresets_test = [(ftr2.bigram_features(line[1]), line[0]) for line in test_set]
    bigram_featuresets_train = [(ftr2.bigram_features(line[1]), line[0]) for line in train_set]

    for i in range(10, 200, 20):
        ftr = FeatureExtraction(i)
        ftr.most_frequent_unigrams(all_tweets)
        for hashtag in hashtag_list:
            ftr.set_unigram_features_list(hashtag)
        for name in name_list:
            ftr.set_unigram_features_list(name)
        unigram_featuresets_test = [(ftr.unigram_features(line[1]), line[0]) for line in test_set]
        unigram_featuresets_train = [(ftr.unigram_features(line[1]), line[0]) for line in train_set]
        featuresets_test = bigram_featuresets_test + unigram_featuresets_test
        featuresets_train = bigram_featuresets_train + unigram_featuresets_train
        ##############################################################################
        classifier1 = NaiveBayesClassifier.train(featuresets_train)
        classifier2 = MaxentClassifier.train(featuresets_train)
        classifier3 = nltk.classify.SklearnClassifier(LinearSVC())
        classifier3.train(featuresets_train)
        refsets = collections.defaultdict(set)
        testsets1 = collections.defaultdict(set)
        testsets2 = collections.defaultdict(set)
        testsets3 = collections.defaultdict(set)
        for i, (feats, label) in enumerate(featuresets_test):
            refsets[label].add(i)
            observed1 = classifier1.classify(feats)
            observed2 = classifier2.classify(feats)
            observed3 = classifier3.classify(feats)
            testsets1[observed1].add(i)
            testsets2[observed2].add(i)
            testsets3[observed3].add(i)
        accuracy_list_nb.append(nltk.classify.accuracy(classifier1, featuresets_test))
        f_measure_list_nb.append(nltk.metrics.f_measure(refsets['not'], testsets1['not']))
        accuracy_list_svm.append(nltk.classify.accuracy(classifier3, featuresets_test))
        f_measure_list_svm.append(nltk.metrics.f_measure(refsets['not'], testsets3['not']))
        accuracy_list_maxent.append(nltk.classify.accuracy(classifier2, featuresets_test))
        f_measure_list_maxent.append(nltk.metrics.f_measure(refsets['not'], testsets2['not']))

    ################################################################################
    print "+-----------------------------------------------------------------+"
    print "\t\t\t\t\tbigram and unigram classification measurements"
    print "+-----------------------------------------------------------------+"
    print "\t\t\t\t\t\t\tmax accuracy \t number of features "
    print "Naive Bayes\t\t\t\t\t %f \t\t\t\t%d" % (
        max(accuracy_list_nb), (accuracy_list_nb.index(max(accuracy_list_nb)) * 20) + 10)
    print "Maximum entropy\t\t\t\t %f \t\t\t\t%d" % (
        max(accuracy_list_maxent), (accuracy_list_maxent.index(max(accuracy_list_maxent)) * 20) + 10)
    print "Support Vector Machine\t\t %f \t\t\t\t%d" % (
        max(accuracy_list_svm), (accuracy_list_svm.index(max(accuracy_list_svm)) * 20) + 10)
    print "+-----------------------------------------------------------------+"
    print "+-----------------------------------------------------------------+"
    print "\t\t\t\t\t\t\tmax f-measure \t number of features "
    print "Naive Bayes\t\t\t\t\t %f \t\t\t\t%d" % (
        max(f_measure_list_nb), (f_measure_list_nb.index(max(f_measure_list_nb)) * 20) + 10)
    print "Maximum entropy\t\t\t\t %f \t\t\t\t%d" % (
        max(f_measure_list_maxent), (f_measure_list_maxent.index(max(f_measure_list_maxent)) * 20) + 10)
    print "Support Vector Machine\t\t %f \t\t\t\t%d" % (
        max(f_measure_list_svm), (f_measure_list_svm.index(max(f_measure_list_svm)) + 1) * 20)
    print "+-----------------------------------------------------------------+"
    ################################################################################
    print " time taken for the classification process %f sec " % (time() - t0)
    #####################################################################################################
    x_axis = [i for i in range(10, 200, 20)]
    plt.figure(facecolor='white')
    fig1, = plt.plot(x_axis, accuracy_list_nb, 'r*-', label='Naive bayes accuracy')
    fig2, = plt.plot(x_axis, f_measure_list_nb, 'ro-', label='Naive bayes f-measure')
    fig3, = plt.plot(x_axis, accuracy_list_svm, 'g*-', label='SVM accuracy')
    fig4, = plt.plot(x_axis, f_measure_list_svm, 'go-', label='SVM f-measure')
    fig5, = plt.plot(x_axis, accuracy_list_maxent, '*-', label='max Entropy accuracy')
    fig6, = plt.plot(x_axis, f_measure_list_maxent, 'o-', label='max Entropy f-measure')
    plt.xlabel('Number of features')
    plt.ylabel('Results')
    plt.title('Results of the classification using unigrams and bigrams')
    plt.legend(handles=[fig1, fig2, fig3, fig4, fig5, fig6], loc=4)
    plt.show()
def word_features(words):
    return dict([(word, True) for word in words])


# Get all the movie reviews with positive data set and negative data set
posRev = movie_reviews.fileids('pos')
negRev = movie_reviews.fileids('neg')

# Mark the words in data set as positive and negative:
posWords = [(word_features(movie_reviews.words(fileids=[f])), 'pos') for f in posRev]
negWords = [(word_features(movie_reviews.words(fileids=[f])), 'neg') for f in negRev]

# Set cut off for separating the training data and the testing data:
posCutoff = len(posWords) * 50 / 100
negCutoff = len(negWords) * 50 / 100

# Fill the training data and the testing data with positive and negative data set:
TestRev = posWords[posCutoff:] + negWords[negCutoff:]
Test_set = array(TestRev)
TrainRev = posWords[:posCutoff] + negWords[:negCutoff]
Train_set = array(TrainRev)

print 'train on %d instances, test on %d instances' % (len(Train_set), len(Test_set))

# Call Maximum Entropy classifier to classify the training data:
algo = MaxentClassifier.ALGORITHMS[0]
classifier = MaxentClassifier.train(Train_set, algorithm=algo, max_iter=3)
classifier.show_most_informative_features(10)

# Print the algorithm accuracy
print 'Accuracy is', util.accuracy(classifier, Test_set)