Example #1
def train_classifier():
    """ Train a MaxEnt classifier and return it """
    labeled_featuresets = [(MEMM_features(word, tag, previous_tag), tag)
                           for (word, tag, previous_tag) in labeled_features]
    maxent_classifier = MaxentClassifier.train(labeled_featuresets,
                                               max_iter=50)
    return maxent_classifier
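# MEMM_features is defined elsewhere in the source project; a minimal sketch of
# what such a feature function might look like (the feature keys here are
# illustrative assumptions, not the original implementation). The gold tag is
# passed in but deliberately not used as a feature, since it is the label the
# classifier learns to predict.
def MEMM_features(word, tag, previous_tag):
    """Build the feature dict used as the first element of each
    (featureset, label) pair passed to MaxentClassifier.train."""
    return {
        "word": word.lower(),
        "suffix3": word[-3:],
        "is_capitalized": word[:1].isupper(),
        "prev_tag": previous_tag,  # the Markov dependency that makes this an MEMM
    }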
Example #2
def evaluate_features(trainFeatures, testFeatures):
    """
    Train and evaluate the classifier model.
    """
    classifier = MaxentClassifier.train(trainFeatures)

    #initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    print 'train on %d instances, test on %d instances' % (len(trainFeatures),
                                                           len(testFeatures))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testFeatures)
    print '**************************************'
    print 'pos precision:', precision(referenceSets['pos'], testSets['pos'])
    print 'pos recall:', recall(referenceSets['pos'], testSets['pos'])
    print 'pos fmeasure:', f_measure(referenceSets['pos'], testSets['pos'])
    print '**************************************'
    print 'neg precision:', precision(referenceSets['neg'], testSets['neg'])
    print 'neg recall:', recall(referenceSets['neg'], testSets['neg'])
    print 'neg fmeasure:', f_measure(referenceSets['neg'], testSets['neg'])
    classifier.show_most_informative_features(50)
def IIS(num_folds, featuresets, label_list):
    subset_size = int(len(featuresets) / num_folds)
    # overall gold labels for each instance (reference) and predicted labels (test)
    reflist = []
    testlist = []
    accuracy_list = []
    print("IIS Classifier")
    # iterate over the folds
    for i in range(num_folds):
        print('Start Fold', i)
        test_this_round = featuresets[i * subset_size:][:subset_size]
        train_this_round = featuresets[:i * subset_size] + featuresets[
            (i + 1) * subset_size:]
        # train using train_this_round
        classifier = MaxentClassifier.train(train_this_round,
                                            'IIS',
                                            max_iter=1)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier,
                                                     test_this_round)
        print(i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)

        # add the gold labels and predicted labels for this round to the overall lists
        for (features, label) in test_this_round:
            reflist.append(label)
            testlist.append(classifier.classify(features))

    print('Done with cross-validation')
    # call the evaluation measures function
    print('mean accuracy-', sum(accuracy_list) / num_folds)
    (precision_list, recall_list) = eval_measures(reflist, testlist,
                                                  label_list)
    print_evaluation(precision_list, recall_list, label_list)
    print(" ")
Example #4
    def train(self, file_train):
        """

        :param file_train:
        :return:
        """
        self.states = set([])

        # me classifier
        labeled_featuresets = []  # list of (feature_dict, label)

        iter = PreprocessUtil.file_iter(file_train)
        sent = iter.__next__()
        while sent:
            prev_state = self.zero_state
            for tokconll in sent:
                obs, _, state = tokconll.strip().split("\t")

                self.states.add(state)

                feature_dict = {"prev_state": prev_state, "obs": obs}
                labeled_featuresets.append((feature_dict, state))
                prev_state = state

            sent = iter.__next__()

        self.classifier = MaxentClassifier.train(labeled_featuresets,
                                                 max_iter=self.max_iter)
def treina_classificadores():
    posdados = []
    with open('./dadostreino/train_EPTC_POA_v3nbal_1.data', 'rb') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            posdados.append(val[0])
    negdados = []
    with open('./dadostreino/train_EPTC_POA_v3nbal_0.data', 'rb') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            negdados.append(val[0])
    neudados = []
    with open('./dadostreino/train_EPTC_POA_v3nbal_2.data', 'rb') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            neudados.append(val[0])
    negfeats = [(bag_of_words(f), 'neg') for f in divide(negdados)]
    posfeats = [(bag_of_words(f), 'pos') for f in divide(posdados)]
    neufeats = [(bag_of_words(f), 'neu') for f in divide(neudados)]
    treino = negfeats + posfeats + neufeats
    # Maximum Entropy
    classificadorME = MaxentClassifier.train(treino,
                                             'GIS',
                                             trace=0,
                                             encoding=None,
                                             labels=None,
                                             gaussian_prior_sigma=0,
                                             max_iter=1)
    #SVM
    classificadorSVM = SklearnClassifier(LinearSVC(), sparse=False)
    classificadorSVM.train(treino)
    # Naive Bayes
    classificadorNB = NaiveBayesClassifier.train(treino)
    return ([classificadorME, classificadorSVM, classificadorNB])
Example #6
def searchMaxentClassifier(title, train_departments):
    """

    :param title:
    :param train_departments:
    :return:
    """
    timeTraning = time.time()
    classifier = MaxentClassifier.train(train_departments, max_iter=5)
    timeTraning = time.time() - timeTraning

    test_sent_features = word_feats(title)

    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    probability = classifier.prob_classify(test_sent_features)
    print(probability.prob(found_department))

    return [
        found_department,
        probability.prob(found_department),
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraning,
    ]
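# word_feats is not shown above; it is assumed to be the usual NLTK-style
# bag-of-words feature function. A minimal sketch under that assumption:
def word_feats(words):
    """Map an iterable of tokens to the {token: True} dict format expected by
    NLTK classifiers such as MaxentClassifier."""
    return dict((word, True) for word in words)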
Example #7
def ddi(train_inputdir, devel_inputdir, outputfile):
    # ddi was changed: the previous code was moved into feature_extractor to isolate
    # that process from the learner and the classifier. Each vector is a list of two
    # values: a string with the sentence_id and the ids of the entity pair (needed by
    # the evaluator), and the featuresets required by MaxentClassifier.

    training_vector = feature_extractor(train_inputdir, True)
    test_vector = feature_extractor(devel_inputdir, False)

    featuresets = []
    for featureset in training_vector:
        featuresets = featuresets + featureset[1]
    classifier = MaxentClassifier.train(featuresets,
                                        algorithm="iis",
                                        max_iter=50)

    file = open(outputfile, "w")
    # The classifier is called for each featureset in order to assign an
    # interaction label to each entity pair.
    for featureset in test_vector:
        result = classifier.classify(featureset[1])
        if result == "null":
            file.write(featureset[0] + '|0|' + result + '\n')
        else:
            file.write(featureset[0] + '|1|' + result + '\n')

    evaluate(DEVEL_INPUT_DIR, outputfile)
Example #8
 def __maxent_train(fs):
     return MaxentClassifier.train(fs, 
         algorithm=algorithm,
         gaussian_prior_sigma=gaussian_prior_sigma,
         count_cutoff=count_cutoff,
         min_lldelta=min_lldelta,
         trace=trace)
def ME_gender(train_set, test_set):
    print('== NLTK MaxEnt ==')
    from nltk.classify import MaxentClassifier
    nltk_classifier = MaxentClassifier.train(
        train_set, nltk.classify.MaxentClassifier.ALGORITHMS[0])
    print(nltk_classifier.prob_classify(gender_features('mark'))._prob_dict)
    print(nltk.classify.accuracy(nltk_classifier, test_set))
Example #10
 def _train(self, algo='iis', trace=0, max_iter=10):
     '''
     Internal method to train and return a NLTK maxent classifier.
     ''' 
     data = [(p.text, p.quote) for p in train_query]
     train_set = [(get_features(n), g) for (n, g) in data]
     return MaxentClassifier.train(train_set, algorithm=algo, trace=trace, max_iter=max_iter)
def main_function():
	conn_analysis = MySQLdb.connect(host=DATABASES['default']['HOST'],
			user=DATABASES['default']['USER'],
			passwd=DATABASES['default']['PASSWORD'],
			db=DATABASES['default']['NAME'])

	training_tweets = classify.get_training_tweets(conn_analysis)
	training_feature_set = classify.process_tweets(training_tweets)

	config_megam('/opt/packages')
	classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	tweets = classify.get_tweets_to_classify(conn_analysis);

	for tweet in tweets:
		text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
		guess = classifier.classify(classify.process_tweet(text))
		update_tweet_polarity(tweet[0], guess, conn_analysis)
		count_table[guess] += 1

	#For the tweets where polarity was determined manually, copy from 
	#majority_vote to auto_vote
	fix_manual_tweets(conn_analysis)

	print count_table
Example #12
def main():
    # grab xml trees
    train_filename = sys.argv[1]
    test_filename = sys.argv[2]
    train_tree = et.parse(train_filename)
    test_tree = et.parse(test_filename)
    train_root = train_tree.getroot()
    test_root = test_tree.getroot()

    # labeled reviews
    train_labels = [get_label(review, 'train') for review in train_root]
    test_labels = [get_label(review, 'test') for review in test_root]

    top_adjs_and_advs = get_top_adjs_n_advs(train_labels, 2)

    # randomize
    random.shuffle(train_labels)
    random.shuffle(test_labels)

    # feature sets
    train_set = [(get_features(review, 'train', top_adjs_and_advs), label)
                 for review, label in train_labels]
    test_set = [(get_features(review, 'test',
                              top_adjs_and_advs), review[ASIN].text)
                for review in test_labels]

    # train classifier
    classifier = MaxentClassifier.train(train_set, trace=0)

    # print results
    print_results(classifier, test_set)
Example #13
 def _train(self, algo='iis', trace=0, max_iter=10):
     '''
     Internal method to train and return a NLTK maxent classifier.
     ''' 
     data = [(p.text, p.quote) for p in train_query]
     train_set = [(get_features(n), g) for (n, g) in data]
     return MaxentClassifier.train(train_set, algorithm=algo, trace=trace, max_iter=max_iter)
 def train(cls):
     train_set = cls.get_final_train_set()
     classifier = maxent.train(train_set, cls.MAXENT_ALGORITHM, trace=0, max_iter=1000)
     # save classifier
     f = open(cls.CLASSIFIER_FILE, 'wb')
     pickle.dump(classifier, f)
     f.close()
def main_function():
    conn_analysis = MySQLdb.connect(host=DATABASES['default']['HOST'],
                                    user=DATABASES['default']['USER'],
                                    passwd=DATABASES['default']['PASSWORD'],
                                    db=DATABASES['default']['NAME'])

    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)

    config_megam('/opt/packages')
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam",
                                        trace=0)

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    tweets = classify.get_tweets_to_classify(conn_analysis)

    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = classifier.classify(classify.process_tweet(text))
        update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    #For the tweets where polarity was determined manually, copy from
    #majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)

    print count_table
Example #16
 def train(self, d):
     """
     Given a labeled set, train our classifier.
     """
     t = self.__tag_data_set(d)
     self.classifier = MaxentClassifier.train(t)
     logging.info("Training on %s records complete." % len(d))
def me_classifier(exclude_list):
    me_classifier = 0

    with open(train_data, 'r', encoding='utf-8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile)
        feature_set = [(feature_set_generator(text, length, label,
                                              exclude_list), label)
                       for text, length, label in reader]
        #print(feature_set)
        me_classifier = MaxentClassifier.train(feature_set, "megam")

    accuracy = 0.0
    with open(test_data, 'r', encoding='utf-8',
              errors='ignore') as testcsvfile:
        test_reader = csv.reader(testcsvfile)
        test_feature_set = [(feature_set_generator(text, length, label,
                                                   exclude_list), label)
                            for text, length, label in test_reader]
        accuracy = classify.accuracy(me_classifier, test_feature_set)

    classified = collections.defaultdict(set)
    observed = collections.defaultdict(set)
    i = 1
    with open(test_data, 'r', encoding='utf-8',
              errors='ignore') as testcsvfile:
        test_reader = csv.reader(testcsvfile)
        for text, length, label in test_reader:
            observed[label].add(i)
            classified[me_classifier.classify(
                feature_set_generator(text, length, label,
                                      exclude_list))].add(i)
            i += 1

    return (accuracy,
            precision(observed['1'], classified['1']),
            recall(observed['1'], classified['1']),
            f_measure(observed['1'], classified['1']),
            precision(observed['0'], classified['0']),
            recall(observed['0'], classified['0']),
            f_measure(observed['0'], classified['0']))
Example #18
def main_function():
	conn_analysis = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
			user=DATABASES['date_cutoff']['USER'],
			passwd=DATABASES['date_cutoff']['PASSWORD'],
			db=DATABASES['date_cutoff']['NAME'])

	training_tweets = classify.get_training_tweets(conn_analysis)
	training_feature_set = process_tweets(training_tweets)

	config_megam('/opt/packages')
	classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	error_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	count_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	guess_dict = {'+':0, '-':0, 'I':0, 'O':0} 

	full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, 
				'-':{'+':0, '-':0, 'I':0, 'O':0}, 
				'I':{'+':0, '-':0, 'I':0, 'O':0}, 
				'O':{'+':0, '-':0, 'I':0, 'O':0}}


	test_tweets = classify.get_test_tweets(conn_analysis)
	test_feature_set = process_tweets(test_tweets)

	classifier.show_most_informative_features(10)
	classifier_accuracy = accuracy(classifier, test_feature_set)
	print "classifier accuracy: " + repr(classifier_accuracy)
Example #19
 def __maxent_train(fs):
     return MaxentClassifier.train(
         fs,
         algorithm=algorithm,
         gaussian_prior_sigma=gaussian_prior_sigma,
         count_cutoff=count_cutoff,
         min_lldelta=min_lldelta,
         trace=trace)
 def train(cls, aspect):
     print cls.get_features(aspect)
     print cls.get_classifier_name(aspect)
     train_set = cls.get_features(aspect)[:int(0.7*cls.LABELED_NUM)]
     classifier = maxent.train(train_set, 'IIS', trace=0, max_iter=1000)
     # save classifier
     f = open(cls.get_classifier_name(aspect), 'wb')
     pickle.dump(classifier, f)
     f.close()
Example #21
def evaluate_classifier(featureX):

    negIds = app_reviews.fileids('neg')
    posIds = app_reviews.fileids('pos')

    negFeatures = [(featureX(app_reviews.words(fileids=[f])), 'neg')
                   for f in negIds]
    posFeatures = [(featureX(app_reviews.words(fileids=[f])), 'pos')
                   for f in posIds]

    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))

    trainFeatures = negFeatures[:negCutoff] + posFeatures[:posCutoff]
    testFeatures = negFeatures[negCutoff:] + posFeatures[posCutoff:]

    #trains a Naive Bayes Classifier
    NBclassifier = NaiveBayesClassifier.train(trainFeatures)
    #trains a Maximum Entropy or Logistic Regression Classifier
    MEclassifier = MaxentClassifier.train(trainFeatures,
                                          algorithm='gis',
                                          trace=0,
                                          max_iter=10,
                                          min_lldelta=0.5)
    #trains a DecisionTree Classifier
    DTclassifier = DecisionTreeClassifier.train(trainFeatures,
                                                binary=True,
                                                entropy_cutoff=0.5,
                                                depth_cutoff=70,
                                                support_cutoff=10)

    #Combining Classifiers with Voting
    classifier = MaxVoteClassifier(NBclassifier, MEclassifier, DTclassifier)

    #initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        observed = classifier.classify(features)
        testSets[observed].add(i)

    #prints metrics to show how well the feature selection performed
    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(testFeatures)))
    print('Accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos Precision:', nltk.metrics.precision(referenceSets['pos'],
                                                   testSets['pos']))
    print('pos Recall:', nltk.metrics.recall(referenceSets['pos'],
                                             testSets['pos']))
    print('neg Precision:', nltk.metrics.precision(referenceSets['neg'],
                                                   testSets['neg']))
    print('neg Recall:', nltk.metrics.recall(referenceSets['neg'],
                                             testSets['neg']))
def classify_maxent(X_train, Y_train, X_test):
    training_input = X_train
    training_output = Y_train
    training_data = []
    for i in range(len(training_input)):
        training_data.append((training_input[i], training_output[i]))
    clf = MaxentClassifier.train(training_data)
    pred_labels = clf.classify_many(X_test)
    return pred_labels
Example #23
def maxenscore(trainset, testset):
    me_classifier = MaxentClassifier.train(trainset,
                                           algorithm='iis',
                                           trace=0,
                                           max_iter=1,
                                           min_lldelta=0.5)
    # (test, tag_test) = zip(*testset)
    # pred = me_classifier.classify(test)
    return nltk.classify.accuracy(me_classifier, testset)
def classify_maxent(X_train, Y_train, X_test):
    training_input = X_train
    training_output = Y_train
    training_data = []
    for i in range(len(training_input)):
        training_data.append((training_input[i], training_output[i]))
    clf = MaxentClassifier.train(training_data)
    pred_labels = clf.classify_many(X_test)
    return pred_labels
Example #25
def main():
    negfeats = []
    posfeats = []
    for i, f in enumerate(reviews[0]):
        print(f)
        if reviews[1][i] == 0:
            negfeats.append((word_feats(f.split()), "neg"))
        else:
            posfeats.append((word_feats(f.split()), "pos"))

    testNegfeats = []
    testPosfeats = []
    for i, f in enumerate(test[0]):
        if test[1][i] == 0:
            testNegfeats.append((word_feats(f.split()), "neg"))
        else:
            testPosfeats.append((word_feats(f.split()), "pos"))

    trainfeats = negfeats + posfeats
    testfeats = testNegfeats + testPosfeats

    print('train on %d instances, test on %d instances - Maximum Entropy' %
          (len(trainfeats), len(testfeats)))

    classifier = MaxentClassifier.train(trainfeats,
                                        'GIS',
                                        trace=0,
                                        encoding=None,
                                        labels=None,
                                        gaussian_prior_sigma=0,
                                        max_iter=1)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    pos_precision = precision(refsets['pos'], testsets['pos'])
    pos_recall = recall(refsets['pos'], testsets['pos'])
    pos_fmeasure = f_measure(refsets['pos'], testsets['pos'])
    neg_precision = precision(refsets['neg'], testsets['neg'])
    neg_recall = recall(refsets['neg'], testsets['neg'])
    neg_fmeasure = f_measure(refsets['neg'], testsets['neg'])
    print(pos_recall)
    print(neg_recall)
    print()
    print('---------------------------------------')
    print('          Maximum Entropy              ')
    print('---------------------------------------')
    print('accuracy:', accuracy)
    print('precision', (pos_precision + neg_precision) / 2)
    print('recall', (pos_recall + neg_recall) / 2)
    print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)
Example #26
 def train(cls, aspect):
     print cls.get_features(aspect)
     print cls.get_classifier_name(aspect)
     train_set = cls.get_features(aspect)[:int(0.7 * cls.LABELED_NUM)]
     classifier = maxent.train(train_set, 'IIS', trace=0, max_iter=1000)
     # save classifier
     f = open(cls.get_classifier_name(aspect), 'wb')
     pickle.dump(classifier, f)
     f.close()
Example #27
 def train(cls):
     train_set = cls.get_final_train_set()
     classifier = maxent.train(train_set,
                               cls.MAXENT_ALGORITHM,
                               trace=0,
                               max_iter=1000)
     # save classifier
     f = open(cls.CLASSIFIER_FILE, 'wb')
     pickle.dump(classifier, f)
     f.close()
Example #28
 def maxent_train(self):

     self.classifier_all = MaxentClassifier.train(self.maxent_memes_all, trace=100, max_iter=5)
     #classifier_bottom = MaxentClassifier.train(maxent_memes_bottom, trace=100, max_iter=250)
     #classifier_all = MaxentClassifier.train(maxent_memes_all, trace=100, max_iter=250)
     weights = self.classifier_all.weights()
     f = open("lambdas.txt", "w")
     for weight in weights:
         f.write("weight = %f" % weight)
         f.write("\n")
     f.close()
def main_function():
	conn = MySQLdb.connect(host=DATABASES['default']['HOST'], 
			user=DATABASES['default']['USER'], 
			passwd=DATABASES['default']['PASSWORD'], 
			db=DATABASES['default']['NAME'])

	training_tweets = classify.get_training_tweets(conn)
	training_feature_set = classify.process_tweets(training_tweets)

	bayes_classifier = NaiveBayesClassifier.train(training_feature_set)

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  

	test_tweets = classify.get_test_tweets(conn)

	for tweet in test_tweets:
		text = classify.get_tweet_text(conn, tweet[0])[0][0]
		guess = bayes_classifier.classify(classify.process_tweet(text))
		classify.update_tweet_polarity(tweet[0], guess, conn)
		count_table[guess] += 1

	print "Naive Bayes"
	print count_table

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	config_megam('/opt/packages')
	max_ent_classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	for tweet in test_tweets:
		text = classify.get_tweet_text(conn, tweet[0])[0][0]
		guess = max_ent_classifier.classify(classify.process_tweet(text))
		update_tweet_polarity_ensemble(tweet[0], guess, conn)
		count_table[guess] += 1

	print "Maximum Entropy"
	print count_table

	#generate the accuracy matrix
	full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, 
				'-':{'+':0, '-':0, 'I':0, 'O':0}, 
				'I':{'+':0, '-':0, 'I':0, 'O':0}, 
				'O':{'+':0, '-':0, 'I':0, 'O':0}}

	for tweet in test_tweets:
		result = classify.run_sql(conn, classify.Statements.CHECK_CONSENSUS % tweet[0])
		guess = result[0][0]

		actual_result = classify.run_sql(conn, classify.Statements.CHECK_MAJORITY % tweet[0])
		actual = actual_result[0][0]

		if guess is not None:
			if actual is not None:
				full_matrix[actual][guess] += 1

	print full_matrix
def maxent_train(feature_list):
    labeled_features = []
    for (word, tag, shape, label, prev_word, prev_tag, prev_shape,
         previous_label) in feature_list:
        labeled_features.append(
            (feature_template(word, tag, shape, prev_word, prev_tag,
                              prev_shape, previous_label), label))
    f = open("../DataFiles/ner_labeler.pickle", "wb")
    maxent_classifier = MaxentClassifier.train(labeled_features, max_iter=40)
    pickle.dump(maxent_classifier, f)
    f.close()
Example #31
def train_maxent_classifier(labelled_features):

    train_set = []
    for lf in labelled_features:

        train_set.append((Generate_MEMM_features(lf), lf[0]))

    print("\nTraining Maxent Classifier on train.txt.")

    maxent_classifier = MaxentClassifier.train(train_set, max_iter=15)
    return maxent_classifier
Example #32
def train_classifier(train_data, labels):
    data = []
    for i in range(len(train_data)):
        data.append((train_data[i], labels[i]))
    print('starting')
    classifier = MaxentClassifier.train(data,
                                        algorithm='GIS',
                                        trace=0,
                                        max_iter=6)
    print('done')
    return classifier
def axentClassifier(features_train, features_test):
	print 'train on %d instances, test on %d instances' % (len(features_train), len(features_test))
	classifier = MaxentClassifier.train(features_train,algorithm='gis')
	print 'accuracy:', nltk.classify.util.accuracy(classifier, features_test)
	precisions, recalls = precision_recall(classifier, features_test)
	print "accuracy: ", precisions, "fitness: ", recalls

# def sklearnMultinomialNB(features_train, features_test):
# 	print 'train on %d instances, test on %d instances' % (len(features_train), len(features_test))
# 	classifier = SklearnClassifier(MultinomialNB())
# 	classifier.train
# 	print 'accuracy:', nltk.classify.util.accuracy(classifier, features_test)
Example #34
def run(training):
    """
    To create and train a MaxentClassifier
    :return: a trained Classifier
    """
    print "Training ME Classifier..."
    # feats = label_feat_from_corps(movie_reviews)
    # training, testing = split_label_feats(feats)

    me_classifier = MaxentClassifier.train(training, algorithm='GIS', trace=0, max_iter=10, min_lldelta=0.5)
    print "ME Classifier trained..."
    return save_classifier(me_classifier)
Example #35
def test_maxent(algorithm):
    print('%11s' % algorithm)
    try:
        classifier = MaxentClassifier.train(
                         train, algorithm, trace=0, max_iter=1000)
    except Exception as e:
        print('Error: %r' % e)
        return

    for featureset in test:
        pdist = classifier.prob_classify(featureset)
        print('%8.15f %6.15f' % (pdist.prob('x'),  pdist.prob('y')))
    print()
Example #36
def trainMaxent(featuresets):
    #idx = 2*len(featuresets) / ratio
    #train_set, test_set = featuresets[idx:], featuresets[:idx]
    train_set = featuresets
    algo = MaxentClassifier.ALGORITHMS[1]
    #max_iter=20
    classifier = MaxentClassifier.train(train_set, algo, max_iter=3)
    #print accuracy(classifier, test_set)
    classifier.show_most_informative_features(100)
    #train_set, test_set = featuresets[idx:], featuresets[:idx]
    #classifier.train(train_set, algo, max_iter=20)
    #print accuracy(classifier, test_set)
    #classifier.show_most_informative_features(100)
    return classifier
Example #37
    def train(self):
        self.clf_one_step = MaxentClassifier.train(self.train_one_step,
                                                   'megam',
                                                   trace=0,
                                                   max_iter=1000)

        self.clfs_two_step = {
            2:
            MaxentClassifier.train(self.train_two_step[2],
                                   'megam',
                                   trace=0,
                                   max_iter=1000),
            3:
            MaxentClassifier.train(self.train_two_step[3],
                                   'megam',
                                   trace=0,
                                   max_iter=1000),
            4:
            MaxentClassifier.train(self.train_two_step[4],
                                   'megam',
                                   trace=0,
                                   max_iter=1000),
        }
Example #38
    def train(
        cls,
        docs: Collection[Document],
        stopwords: Optional[Collection[Word]] = None,
        algorithm: str = 'iis',
        cutoff: int = 4,
        sigma: float = 0.,
        trim_length: int = 10,
    ) -> 'MaxentSummarizer':
        """Train the model on a collection of documents.

        Args:
            docs (Collection[Document]): The collection of documents to train on.
            stopwords (Collection[Word]): Collection of stopwords.
            algorithm (str): Optimization algorithm for training. Possible values are 'iis',
                'gis', or 'megam' (requires `megam`_ to be installed).
            cutoff (int): Features that occur fewer than this value in the training data will
                be discarded.
            sigma (float): Standard deviation for the Gaussian prior. Default is no prior.
            trim_length (int): Trim words to this length.

        Returns:
            MaxentSummarizer: The trained model.

        .. _megam: https://www.umiacs.umd.edu/~hal/megam/
        """
        if stopwords is None:
            stopwords = set()

        word_pairs = {
            pair
            for doc in docs for sent in doc.sentences for pair in
            cls._get_word_pairs(sent, stopwords, trim_len=trim_length)
        }

        train_data: list = []
        for doc in docs:
            featuresets = cls._extract_featuresets(doc, stopwords, word_pairs,
                                                   trim_length)
            labels = [sent.label for sent in doc.sentences]
            train_data.extend(zip(featuresets, labels))

        encoding = BinaryMaxentFeatureEncoding.train(train_data,
                                                     count_cutoff=cutoff,
                                                     alwayson_features=True)
        classifier = MaxentClassifier.train(train_data,
                                            algorithm=algorithm,
                                            encoding=encoding,
                                            gaussian_prior_sigma=sigma)
        return cls(classifier, stopwords=stopwords, word_pairs=word_pairs)
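# The encoding/cutoff interplay described in the docstring can be seen in
# isolation with toy data. A minimal, self-contained sketch (not part of the
# summarizer itself; the feature names are made up):
from nltk.classify import MaxentClassifier
from nltk.classify.maxent import BinaryMaxentFeatureEncoding

toy_train = [
    ({'contains_money': True, 'long_sentence': False}, 'summary'),
    ({'contains_money': True, 'long_sentence': True}, 'summary'),
    ({'contains_money': False, 'long_sentence': True}, 'not_summary'),
    ({'contains_money': False, 'long_sentence': False}, 'not_summary'),
]
# Features occurring fewer than count_cutoff times are dropped by the encoding.
toy_encoding = BinaryMaxentFeatureEncoding.train(toy_train, count_cutoff=1,
                                                 alwayson_features=True)
toy_classifier = MaxentClassifier.train(toy_train, algorithm='iis',
                                        encoding=toy_encoding, trace=0,
                                        max_iter=10)
print(toy_classifier.classify({'contains_money': True, 'long_sentence': False}))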
Example #39
def trainMaxent10(featuresets):
    #idx = 2*len(featuresets) / ratio
    #train_set, test_set = featuresets[idx:], featuresets[:idx]
    train_set = featuresets
    algo = MaxentClassifier.ALGORITHMS[1]
    #max_iter=20
    classifier = MaxentClassifier.train(train_set, algo, max_iter=10)
    #print accuracy(classifier, test_set)
    classifier.show_most_informative_features(100)
    #train_set, test_set = featuresets[idx:], featuresets[:idx]
    #classifier.train(train_set, algo, max_iter=20)
    #print accuracy(classifier, test_set)
    #classifier.show_most_informative_features(100)
    return classifier
Example #40
    def __init__(self, ftrain, fdev, ftest):
        self.train = json.load(open(ftrain))
        self.dev = json.load(open(fdev))
        # self.test = json.load(open(ftest))

        step1_train_features, step2_train_features = self.extract_features(
            self.train)
        step1_dev_features, step2_dev_features = self.extract_features(
            self.dev)
        # step1_test_features, step2_test_features = self.extract_features(self.test)

        p.dump(step1_train_features,
               open('data/step1_train_features.cPickle', 'w'))
        p.dump(step2_train_features,
               open('data/step2_train_features.cPickle', 'w'))

        p.dump(step1_dev_features, open('data/step1_dev_features.cPickle',
                                        'w'))
        p.dump(step2_dev_features, open('data/step2_dev_features.cPickle',
                                        'w'))

        clf_step1 = MaxentClassifier.train(step1_train_features,
                                           'megam',
                                           trace=0,
                                           max_iter=1000)
        # clf = nltk.NaiveBayesClassifier.train(trainset)
        p.dump(clf_step1, open('data/clf_step1.cPickle', 'w'))
        print 'Accuracy: ', accuracy(clf_step1, step1_dev_features)

        clf_step2 = MaxentClassifier.train(step2_train_features,
                                           'megam',
                                           trace=0,
                                           max_iter=1000)
        # clf = nltk.NaiveBayesClassifier.train(trainset)
        p.dump(clf_step2, open('data/clf_step2.cPickle', 'w'))
        print 'Accuracy: ', accuracy(clf_step2, step2_dev_features)
    def __init__(self):
        try:
            classifier = None
            if not os.path.exists(classifier_path):
                '''with open('nltk_sentiment_data/polarity_pos.txt', 'rb') as fp:
                    pos_lines = fp.readlines()
                    pos_feats = [(word_feats(tokenizer.tokenize(p_line)), '1') for p_line in pos_lines]
                with open ('nltk_sentiment_data/polarity_neg.txt', 'rb') as fn:
                    neg_lines = fn.readlines()
                    neg_feats = [(word_feats(tokenizer.tokenize(n_line)), '0') for n_line in neg_lines]'''

                filename = os.path.dirname(
                    os.path.abspath(__file__)
                ) + "/nltk_sentiment_data/sentiment_data_twitter.txt"
                with open(filename, 'rb') as fp:
                    lines = fp.readlines()
                    feats = [(word_feats(
                        tokenizer.tokenize(
                            line.split(' -> ')[1].strip().lower())),
                              line.split(' -> ')[0]) for line in lines
                             if len(line.split(' -> ')) >= 2]
                print "Total : %s" % (len(feats), )
                cutoff = int(len(feats) * 0.1)
                trainfeats, testfeats = feats[cutoff:], feats[:cutoff]
                '''cutoff = int(len(pos_feats) * 0.1)
                trainfeats = pos_feats[cutoff:] + neg_feats[cutoff:]
                testfeats = pos_feats[:cutoff] + neg_feats[:cutoff]'''
                print 'train on %d instances, test on %d instances' % (
                    len(trainfeats), len(testfeats))

                #classifier = NaiveBayesClassifier.train(trainfeats)
                classifier = MaxentClassifier.train(trainfeats,
                                                    algorithm='iis',
                                                    trace=0,
                                                    max_iter=10)
                print 'accuracy:', nltk.classify.util.accuracy(
                    classifier, testfeats)
                classifier.show_most_informative_features()
                with open(classifier_path, "w") as fh:
                    cPickle.dump(classifier, fh, 1)
            else:
                with open(classifier_path, "r") as fh:
                    classifier = cPickle.load(fh)
            self.classifier = classifier
            logger.info("Initialized SentimentClassifier instance..")
        except Exception, e:
            logger.exception(e)
            raise e
Example #42
 def train(cls, training_sequence, **kwargs):
     feature_detector = kwargs.get('feature_detector')
     gaussian_prior_sigma = kwargs.get('gaussian_prior_sigma', 10)
     count_cutoff = kwargs.get('count_cutoff', 1)
     stopping_condition = kwargs.get('stopping_condition', 1e-7)
     def __featurize(tagged_token):
         tag = tagged_token[-1]
         feats = feature_detector(tagged_token)
         return (feats, tag)
     labeled_featuresets = LazyMap(__featurize, training_sequence)
     classifier = MaxentClassifier.train(labeled_featuresets,
                             algorithm='megam',
                             gaussian_prior_sigma=gaussian_prior_sigma,
                             count_cutoff=count_cutoff,
                             min_lldelta=stopping_condition)
     return cls(classifier._encoding, classifier.weights())
Example #43
def trainCorpus():
	if os.path.exists(classifier_fname):
		return LoadClassifier()
	else:
		c = getDealsCorpus()
		hiwords = corpus_high_info_words(c)
		featdet = lambda words: bag_of_words_in_set(words, hiwords)
		train_feats, test_feats = corpus_train_test_feats(c, featdet)
		trainf = lambda train_feats: MaxentClassifier.train(train_feats, algorithm='megam', trace=0, max_iter=10)
		labelset = set(c.categories())
		classifiers = train_binary_classifiers(trainf, train_feats, labelset)
		multi_classifier = MultiBinaryClassifier(*classifiers.items())
		multi_p, multi_r, avg_md = multi_metrics(multi_classifier, test_feats)
		print multi_p['activitiesevents'], multi_r['activitiesevents'], avg_md
		SaveClassifier(multi_classifier)
		return multi_classifier
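# train_binary_classifiers, MultiBinaryClassifier and multi_metrics come from
# the surrounding project (they follow the common one-vs-rest multi-label
# recipe used with NLTK). A rough sketch of the first helper, under the
# assumption that each training item pairs a featureset with the set of labels
# that apply to it, and that one binary classifier is trained per label:
def train_binary_classifiers(trainf, labelled_featuresets, labelset):
    classifiers = {}
    for label in labelset:
        # Relabel every instance as label / not-label, then train a binary
        # classifier for this label with the supplied training function.
        binary_feats = [(feats, label in labels)
                        for feats, labels in labelled_featuresets]
        classifiers[label] = trainf(binary_feats)
    return classifiers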
Example #44
    def train(self, featureset=None):
        """
        Trains the maximum entropy classifier and returns it. If a
        featureset is specified it trains on that, otherwise it trains on
        the model's featureset.

        Pass in a featureset during cross validation.
        Returns the training time and the classifier.
        """
        featureset = featureset or self.featureset()

        # Time how long it takes to train
        start = time.time()

        classifier = MaxentClassifier.train(featureset,
                        algorithm='megam', trace=1, gaussian_prior_sigma=1)

        delta = time.time() - start
        return classifier, delta
Example #45
def parse():
    tagger_classes=([nltk.UnigramTagger, nltk.BigramTagger])
    trained_sents, tagged_sents =  trainer("WSJ_02-21.pos-chunk","WSJ_23.pos")
    #tagger = nltk.UnigramTagger(trained_sents)
    print len(trained_sents)
    tagger = ClassifierBasedPOSTagger(
        train=trained_sents[:10000],
        classifier_builder=lambda train_feats: MaxentClassifier.train(
            train_feats, trace=0, max_iter=10))
    f = open("WSJ_23.chunk", 'w')
    #print sents
    for sents in tagged_sents:
        (words, tags) = sents[0], sents[1]
        chunks = tagger.tag(tags)
        #print words, chunks
        wtc = zip(words, chunks)

        for tup in wtc:
            f.write("%s\t%s\n" % (tup[0], tup[1][1]))

        f.write("\n")
def main_function():
    conn = MySQLdb.connect(
        host=DATABASES["date_cutoff"]["HOST"],
        user=DATABASES["date_cutoff"]["USER"],
        passwd=DATABASES["date_cutoff"]["PASSWORD"],
        db=DATABASES["date_cutoff"]["NAME"],
    )

    training_tweets = get_test_tweets(conn)
    # training_feature_set = process_tweets(training_tweets)

    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, "+", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "-", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "I", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "O", total_word_count, best_words)

    print "configuring megam"
    config_megam("/opt/packages")
    print "starting training"
    classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    print "starting end training"
    classifier.show_most_informative_features(40)

    test_tweets = get_training_tweets(conn)
    test_feature_set = process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    # full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0},
    # 			'-':{'+':0, '-':0, 'I':0, 'O':0},
    # 			'I':{'+':0, '-':0, 'I':0, 'O':0},
    # 			'O':{'+':0, '-':0, 'I':0, 'O':0}}

    # for f in test_tweets:
    # 	guess = classifier.classify(process_tweet(f[1]))
    # 	full_matrix[f[2]][guess] += 1

    # print full_matrix
    print "classifier accuracy: " + repr(classifier_accuracy)
    def __init__(self):
        try:
            classifier = None
            if not os.path.exists(classifier_path):

                '''with open('nltk_sentiment_data/polarity_pos.txt', 'rb') as fp:
                    pos_lines = fp.readlines()
                    pos_feats = [(word_feats(tokenizer.tokenize(p_line)), '1') for p_line in pos_lines]
                with open ('nltk_sentiment_data/polarity_neg.txt', 'rb') as fn:
                    neg_lines = fn.readlines()
                    neg_feats = [(word_feats(tokenizer.tokenize(n_line)), '0') for n_line in neg_lines]'''

                filename = os.path.dirname(os.path.abspath(__file__)) + "/nltk_sentiment_data/sentiment_data_twitter.txt"
                with open(filename, 'rb') as fp:                 
                    lines = fp.readlines()
                    feats =[(word_feats(tokenizer.tokenize(line.split(' -> ')[1].strip().lower())), line.split(' -> ')[0]) for line in lines if len(line.split(' -> ')) >=2]
                print "Total : %s" %(len(feats),)
                cutoff = int(len(feats)*0.1)
                trainfeats, testfeats = feats[cutoff:], feats[:cutoff]

                '''cutoff = int(len(pos_feats) * 0.1)
                trainfeats = pos_feats[cutoff:] + neg_feats[cutoff:]
                testfeats = pos_feats[:cutoff] + neg_feats[:cutoff]'''
                print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
 
                #classifier = NaiveBayesClassifier.train(trainfeats)
                classifier = MaxentClassifier.train(trainfeats, algorithm='iis', trace=0, max_iter=10)
                print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
                classifier.show_most_informative_features()
                with open(classifier_path, "w") as fh:
                    cPickle.dump(classifier, fh, 1)
            else:
                with open(classifier_path, "r") as fh:
                    classifier = cPickle.load(fh)
            self.classifier = classifier
            logger.info("Initialized SentimentClassifier instance..")
        except Exception, e:
            logger.exception(e)
            raise e
Example #48
    def _train_mode_for_user(self, userid):
        if userid in self._user_classifier:
            print("Already exist!!!")
            self._user_classifier[userid] = None
        # Only positive and negative ratings are needed; neutral reviews are not used, so an INNER JOIN is enough
        sql = """ SELECT site_news.news_uuid, user_score.news_user_score as news_score FROM site_news 
                        INNER JOIN user_score ON site_news.news_uuid = user_score.newsid 
                  WHERE DATE(site_news.time) < CURRENT_DATE() AND DATE(site_news.time) > DATE_SUB(CURRENT_DATE(),INTERVAL 5 DAY) 
                        AND user_score.news_user_score != 1 AND user_score.userid=%d; """ %(userid)
        train_items = self.db_conn.query(sql);

        print("建立POS/NEG特征")
        pos_feature = []
        neg_feature = []        
        for item in train_items:
            news_vector = nlp_master.get_old_vect(item['news_uuid']);
            if item['news_score'] == 0:  # positive rating
                pos_feature.append((self.best_word_features(news_vector, news_vector),'pos'))
            elif item['news_score'] == 2:  # negative rating
                neg_feature.append((self.best_word_features(news_vector, news_vector),'neg'))
        print("POS:%d, NEG:%d" %(len(pos_feature),len(neg_feature)))
        
        if len(pos_feature) <= 3 or len(neg_feature) <=3:
            print("特征太少,放弃。。。")
            self._user_classifier[userid] = None
            return
        
        trainSet = pos_feature + neg_feature
        self._user_classifier[userid] = MaxentClassifier.train(trainSet, max_iter=50)
        print("MaxEnt Classifier for %d build done!"%(userid))

        # Save the updated results
        today = datetime.date.today()
        self.dumpfile = "dumpdir/recmaxent_dump.%d_%d" %(today.month, today.day)
        with open(self.dumpfile,'wb', -1) as fp:
            dump_data = []
            dump_data.append(self._user_classifier)
            pickle.dump(dump_data, fp, -1)              
        return
def evaluate_classifier(featx,collocationFunc):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
 
    negfeats = [(featx(movie_reviews.words(fileids=[f]),collocationFunc), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f]),collocationFunc), 'pos') for f in posids]

    lenNegFeats=min(len(negfeats),400)
    lenPosFeats=min(len(posfeats),400)
#    lenNegFeats=len(negfeats)
#    lenPosFeats=len(posfeats)
    negcutoff = int(lenNegFeats*3/4)
    poscutoff = int(lenPosFeats*3/4)
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:lenNegFeats] + posfeats[poscutoff:lenPosFeats]
 
    classifier = MaxentClassifier.train(trainfeats,algorithm='IIS',max_iter=3)
    print(classifier)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    print(classifier)
    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
    evaluationMetrics={}
    classifier.show_most_informative_features()
    evaluationMetrics['model']=classifier
    evaluationMetrics['trainingData']=trainfeats
    evaluationMetrics['accuracy']=nltk.classify.util.accuracy(classifier, testfeats)
    evaluationMetrics['posPrec']=nltk.precision(refsets['pos'], testsets['pos'])
    evaluationMetrics['posRecall']=nltk.recall(refsets['pos'], testsets['pos'])
    evaluationMetrics['posF_Score']=nltk.f_measure(refsets['pos'], testsets['pos'])
    evaluationMetrics['negPrec']=nltk.precision(refsets['neg'], testsets['neg'])
    evaluationMetrics['negRecall']=nltk.recall(refsets['neg'], testsets['neg'])
    evaluationMetrics['negF_Score']=nltk.f_measure(refsets['neg'], testsets['neg'])
    return evaluationMetrics
def main_function():
	conn_analysis = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter_heart")

	training_tweets = classify.get_training_tweets(conn_analysis)
	training_feature_set = classify.process_tweets(training_tweets)

	tweets = classify.get_tweets_to_classify(conn_analysis);

	bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
	count_table = {'+':0, '-':0, 'I':0, 'O':0}  

	for tweet in tweets:
		text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
		guess = bayes_classifier.classify(classify.process_tweet(text))
		classify.update_tweet_polarity(tweet[0], guess, conn_analysis)
		count_table[guess] += 1

	print "Naive Bayes"
	print count_table

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	config_megam('/opt/packages')
	max_ent_classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	for tweet in tweets:
		text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
		guess = max_ent_classifier.classify(classify.process_tweet(text))
		update_max_ent_polarity(tweet[0], guess, conn_analysis)
		count_table[guess] += 1

	#For the tweets where polarity was determined manually, copy from 
	#majority_vote to auto_vote
	fix_manual_tweets(conn_analysis)

	print "Maximum Entropy"
	print count_table
Example #51
def getClassifier(tweetfile):
    print "Loading content & preparing text"
    content = prepText(loadFile(tweetfile))
    print "Categorizing contents"
    categorized = prepClassifications(content)
    print "Deriving NGrams"
    NGrammized = collectNGrams(categorized,degreesToUse)
    print "Compiling Results"
    readyToSend = []
    for category in NGrammized.keys():
        readyToSend += NGrammized[category]
        
    print "Attempting Classification"
    if classMode == 'naive bayes':
        from nltk.classify import NaiveBayesClassifier
        classifier = NaiveBayesClassifier.train(readyToSend)
    elif classMode == 'max ent':
        from nltk.classify import MaxentClassifier
        classifier = MaxentClassifier.train(readyToSend)
        
    print
    classifier.show_most_informative_features(n=200)
    classifier.show_most_informative_features()
    return classifier
Example #52
    pos = [simplify_wsj_tag(tag) for word, tag in pos]
    words = [w.lower() for w in words]
    trigrams = nltk.trigrams(words)
    trigrams = ['%s/%s/%s' % (i[0], i[1], i[2]) for i in trigrams]
    features = words + pos + trigrams
    features = dict((f, True) for f in features)
    return features

def train():
    for k,v in training.items():
        for sentence in v:
            yield (prepare_input(sentence), k)

if __name__ == '__main__':
    import sys
    training_set = []
    for features in train():
        training_set.append(features)
    classifier = MaxentClassifier.train(training_set)
    layer_type, slug = sys.argv[1:3]
    r = requests.get('http://randomtaco.me/%s/%s/' % (layer_type, slug))
    recipe = r.json()['recipe']
    recipe_sentences = nltk.sent_tokenize(recipe)
    for sent in recipe_sentences:
        print classifier.classify(prepare_input(sent))
        print '*'*80




Example #53
	puc = '-'.decode("utf-8")
	 # some chars are outside the ASCII range
	print (word)
	features['capitalization'] = word[0].isupper()
	features['start_of_sentence'] = word in wordStartList
	features['cap_start'] = word not in wordStartList and word[0].isupper()
	features['previous_NC'] = previous_BOI

	return features
#*******************************************************************train the model
labeled_featuresets = [(MEMM_features(word, tag, previous_BOI), boi)
                       for (word, tag, boi, previous_BOI) in labeled_features]
train_set = labeled_featuresets

f = open("my_classifier.pickle", "wb")

maxent_classifier = MaxentClassifier.train(train_set, max_iter=30)
pickle.dump(maxent_classifier , f)

f.close() 
#********************************************************************Viterbi
def MEMM(wordList,tagList):
	BOI_list = ['B-NP', 'I-NP', 'O']
	w1 = wordList[0] #the first word of the sentence
	t1 = tagList[0]
	tRange = len(BOI_list)
	wRange = len(wordList)

	viterbi = [[0 for x in range(300)] for x in range(300)] 
	backpointer = [['' for x in range(300)] for x in range(300)] 
	#intialization
	for t in range(tRange):#t = 0,1,2
 def _classifier_builder(self, train):
     return MaxentClassifier.train(train, algorithm='megam',
                                        gaussian_prior_sigma=1,
                                        trace=2)
Example #55
def evaluate_classifier(featx, number_of_features, remove_stopwords):
    print "Adding features..."

    # create labeled bags of words (dictionary)
    neg_reviews = reviews[1] + reviews[2]
    pos_reviews = reviews[4] + reviews[5]
    random.shuffle(neg_reviews)
    random.shuffle(pos_reviews)
    neg_feats = [(featx(f, number_of_features), 'neg') for f in neg_reviews]
    pos_feats = [(featx(f, number_of_features), 'pos') for f in pos_reviews]

    neg_cutoff = len(neg_feats) * 3 / 4
    pos_cutoff = len(pos_feats) * 3 / 4

    trainfeats = neg_feats[:neg_cutoff] + pos_feats[:pos_cutoff]
    testfeats = neg_feats[neg_cutoff:] + pos_feats[pos_cutoff:]

    neg_sent_reviews = reviews_sents[1] + reviews_sents[2]
    pos_sent_reviews = reviews_sents[4] + reviews_sents[5]
    random.shuffle(neg_sent_reviews)
    random.shuffle(pos_sent_reviews)
    neg_sent_feats = [(sentences, 'neg') for sentences in neg_sent_reviews]
    pos_sent_feats = [(sentences, 'pos') for sentences in pos_sent_reviews]
    test_sent_feats = neg_sent_feats[neg_cutoff:] + pos_sent_feats[pos_cutoff:]

    classifierName = "Maximum Entropy (Features: Words"
    if remove_stopwords:
        classifierName += ", Removed Stopwords"
    if number_of_features > 1:
        classifierName += ", Stemmed Words"
    if number_of_features > 2:
        classifierName += ", Lemmatized Words"
    if number_of_features > 3:
        classifierName += ", Bigrams"

    classifierName += ")"

    print "Training..."
    classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0, encoding=None, labels=None, sparse=True,
                                        gaussian_prior_sigma=0, max_iter=1)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    sent_refsets = collections.defaultdict(set)
    sent_testsets = collections.defaultdict(set)

    print "Testing..."
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print "Testing on Sentences..."

    accurate_count = 0
    for i, (sentences, label) in enumerate(test_sent_feats):
        sent_refsets[label].add(i)
        pos_prob_total = 0
        neg_prob_total = 0
        sentence_count = len(sentences)
        for sentence in sentences:
            pdist = classifier.prob_classify(featx(sentence, 4))
            pos_prob_total += pdist.prob('pos')
            neg_prob_total += pdist.prob('neg')
        if ((pos_prob_total / sentence_count) > (neg_prob_total / sentence_count)):
            results = 'pos'
        else:
            results = 'neg'
        if (results == label):
            accurate_count += 1
        sent_testsets[results].add(i)
    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    sent_accuracy = float(accurate_count) / len(test_sent_feats)

    neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
    neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
    neg_fmeasure =  nltk.metrics.f_measure(refsets['neg'], testsets['neg'])

    pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
    pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
    pos_fmeasure =  nltk.metrics.f_measure(refsets['pos'], testsets['pos'])

    sent_neg_precision = nltk.metrics.precision(sent_refsets['neg'], sent_testsets['neg'])
    sent_neg_recall = nltk.metrics.recall(sent_refsets['neg'], sent_testsets['neg'])
    sent_neg_fmeasure = nltk.metrics.f_measure(sent_refsets['neg'], sent_testsets['neg'])

    sent_pos_precision = nltk.metrics.precision(sent_refsets['pos'], sent_testsets['pos'])
    sent_pos_recall = nltk.metrics.recall(sent_refsets['pos'], sent_testsets['pos'])
    sent_pos_fmeasure = nltk.metrics.f_measure(sent_refsets['pos'], sent_testsets['pos'])

    print ''
    print '---------------------------------------'
    print 'SINGLE FOLD RESULT ' + '(' + classifierName + ')'
    print '---------------------------------------'
    print 'accuracy:', accuracy
    print 'precision', (pos_precision + neg_precision) / 2
    print 'recall', (pos_recall + neg_recall) / 2
    print 'f-measure', (pos_fmeasure + neg_fmeasure) / 2
    print ''

    print ''
    print '---------------------------------------'
    print 'SENTENCES: SINGLE FOLD RESULT ' + '(' + classifierName + ')'
    print '---------------------------------------'
    print 'accuracy:', sent_accuracy
    print 'precision', (sent_pos_precision + sent_neg_precision) / 2
    print 'recall', (sent_pos_recall + sent_neg_recall) / 2
    print 'f-measure', (sent_pos_fmeasure + sent_neg_fmeasure) / 2
    print ''
 
    # CROSS VALIDATION

    trainfeats = neg_feats + pos_feats
    test_sent_feats = neg_sent_feats + pos_sent_feats
    # SHUFFLE TRAIN SET
    # As in cross validation, the test chunk might have only negative or only positive data
    random.shuffle(trainfeats)
    random.shuffle(test_sent_feats)
    n = 5  # 5-fold cross-validation

    subset_size = len(trainfeats) // n  # size of one fold (integer division)
    accuracy = []
    sent_accuracy = []
    neg_precision = []
    neg_recall = []
    pos_precision = []
    pos_recall = []
    neg_fmeasure = []
    pos_fmeasure = []
    sent_neg_precision = []
    sent_neg_recall = []
    sent_pos_precision = []
    sent_pos_recall = []
    sent_neg_fmeasure = []
    sent_pos_fmeasure = []    
    cv_count = 1

    print 'Starting 5-fold cross validation...'
    for i in range(n):
        print "Fold " + str(i) + ":"
        # fold i is held out as the test chunk; the slices before and after it
        # form the training data for this round
        testing_this_round = trainfeats[i * subset_size:][:subset_size]
        sent_testing_this_round = test_sent_feats[i * subset_size:][:subset_size]
        training_this_round = trainfeats[:i * subset_size] + trainfeats[(i + 1) * subset_size:]

        classifier = MaxentClassifier.train(training_this_round, 'GIS', trace=0, encoding=None, labels=None,
                                            sparse=True, gaussian_prior_sigma=0, max_iter=1)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        sent_refsets = collections.defaultdict(set)
        sent_testsets = collections.defaultdict(set)

        print "Testing..."

        # use a separate index so the outer fold counter `i` is not clobbered
        for j, (feats, label) in enumerate(testing_this_round):
            refsets[label].add(j)
            observed = classifier.classify(feats)
            testsets[observed].add(j)

        print "Testing on Sentences..."

        accuracy_count = 0
        # again use `j` so the fold counter stays intact
        for j, (sentences, label) in enumerate(sent_testing_this_round):
            sent_refsets[label].add(j)
            pos_prob_total = 0
            neg_prob_total = 0
            sentence_count = len(sentences)
            for sentence in sentences:
                pdist = classifier.prob_classify(featx(sentence, 4))
                pos_prob_total += pdist.prob('pos')
                neg_prob_total += pdist.prob('neg')
            # average the per-sentence probabilities and pick the larger side
            if (pos_prob_total / sentence_count) > (neg_prob_total / sentence_count):
                results = 'pos'
            else:
                results = 'neg'
            if results == label:
                accuracy_count += 1
            sent_testsets[results].add(j)

        cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
        sent_cv_accuracy = float(accuracy_count) / len(sent_testing_this_round)

        cv_neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
        cv_neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
        cv_neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
        cv_pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
        cv_pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
        cv_pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])

        sent_cv_neg_precision = nltk.metrics.precision(sent_refsets['neg'], sent_testsets['neg'])
        sent_cv_neg_recall = nltk.metrics.recall(sent_refsets['neg'], sent_testsets['neg'])
        sent_cv_neg_fmeasure = nltk.metrics.f_measure(sent_refsets['neg'], sent_testsets['neg'])
        sent_cv_pos_precision = nltk.metrics.precision(sent_refsets['pos'], sent_testsets['pos'])
        sent_cv_pos_recall = nltk.metrics.recall(sent_refsets['pos'], sent_testsets['pos'])
        sent_cv_pos_fmeasure = nltk.metrics.f_measure(sent_refsets['pos'], sent_testsets['pos'])

        accuracy.append(cv_accuracy)
        sent_accuracy.append(sent_cv_accuracy)

        neg_precision.append(cv_neg_precision)
        neg_recall.append(cv_neg_recall)
        neg_fmeasure.append(cv_neg_fmeasure)
        pos_precision.append(cv_pos_precision)
        pos_recall.append(cv_pos_recall)
        pos_fmeasure.append(cv_pos_fmeasure)

        sent_neg_precision.append(sent_cv_neg_precision)
        sent_neg_recall.append(sent_cv_neg_recall)
        sent_neg_fmeasure.append(sent_cv_neg_fmeasure)
        sent_pos_precision.append(sent_cv_pos_precision)
        sent_pos_recall.append(sent_cv_pos_recall)
        sent_pos_fmeasure.append(sent_cv_pos_fmeasure)

        cv_count += 1

    print '---------------------------------------'
    print 'N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')'
    print '---------------------------------------'
    print 'accuracy:', sum(accuracy) / n
    print 'precision', (sum(neg_precision) / n + sum(pos_precision) / n) / 2
    print 'recall', (sum(neg_recall) / n + sum(pos_recall) / n) / 2
    print 'f-measure', (sum(neg_fmeasure) / n + sum(pos_fmeasure) / n) / 2
    print ''

    print '---------------------------------------'
    print 'SENTENCES: N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')'
    print '---------------------------------------'
    print 'accuracy:', sum(sent_accuracy) / n
    print 'precision', (sum(sent_neg_precision) / n + sum(sent_pos_precision) / n) / 2
    print 'recall', (sum(sent_neg_recall) / n + sum(sent_pos_recall) / n) / 2
    print 'f-measure', (sum(sent_neg_fmeasure) / n + sum(sent_pos_fmeasure) / n) / 2
    print ''
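
The sentence-level scoring above can be pulled out into a small helper. Below is a minimal sketch, assuming a trained NLTK MaxentClassifier with 'pos'/'neg' labels and a list of per-sentence feature dicts built with the same feature extractor used for training; the helper name is not part of the original code.

def average_sentence_label(classifier, sentence_featuresets):
    """Label a document by averaging per-sentence 'pos'/'neg' probabilities."""
    if not sentence_featuresets:
        return 'neg'  # arbitrary fallback for an empty document
    pos_total = 0.0
    neg_total = 0.0
    for feats in sentence_featuresets:
        pdist = classifier.prob_classify(feats)
        pos_total += pdist.prob('pos')
        neg_total += pdist.prob('neg')
    count = len(sentence_featuresets)
    # comparing the averages is equivalent to comparing the raw totals,
    # since both sides are divided by the same sentence count
    return 'pos' if pos_total / count > neg_total / count else 'neg'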
Example #56
0
def uni_and_bi_validation(lines):
    """
    + plots the classification F1-measure using bigrams and unigrams
    + prints a table containing the max accuracy and F1-measure obtained and the number of feature reached at
    :param lines: list of tweets
    :return:
    """
    accuracy_list_nb = []
    f_measure_list_nb = []
    accuracy_list_svm = []
    f_measure_list_svm = []
    accuracy_list_maxent = []
    f_measure_list_maxent = []


    random.shuffle(lines)

    hashtag_list = PatternsFeatures().get_most_frequent_pattern(PatternsFeatures().pattern_classifier(lines, '#'))
    name_list = PatternsFeatures().get_most_frequent_pattern(PatternsFeatures().pattern_classifier(lines, '@'))

    train_set_rate = int(len(lines)*0.75)
    train_set, test_set = lines[:train_set_rate], lines[train_set_rate:]
    all_tweets = " ".join([" ".join(line[1]) for line in train_set])

    ftr2 = FeatureExtraction(20)
    ftr2.most_frequent_bigrams(all_tweets)

    bigram_featuresets_test = [(ftr2.bigram_features(line[1]), line[0]) for line in test_set]
    bigram_featuresets_train = [(ftr2.bigram_features(line[1]), line[0]) for line in train_set]

    for i in range(10, 200, 20):
        ftr = FeatureExtraction(i)

        ftr.most_frequent_unigrams(all_tweets)

        for hashtag in hashtag_list:
            ftr.set_unigram_features_list(hashtag)
        for name in name_list:
            ftr.set_unigram_features_list(name)


        unigram_featuresets_test = [(ftr.unigram_features(line[1]), line[0]) for line in test_set]
        unigram_featuresets_train = [(ftr.unigram_features(line[1]), line[0]) for line in train_set]

        featuresets_test = bigram_featuresets_test + unigram_featuresets_test
        featuresets_train = bigram_featuresets_train + unigram_featuresets_train

##############################################################################

        classifier1 = NaiveBayesClassifier.train(featuresets_train)
        classifier2 = MaxentClassifier.train(featuresets_train)
        classifier3 = nltk.classify.SklearnClassifier(LinearSVC())
        classifier3.train(featuresets_train)


        refsets = collections.defaultdict(set)
        testsets1 = collections.defaultdict(set)
        testsets2 = collections.defaultdict(set)
        testsets3 = collections.defaultdict(set)

        # use a separate index so the outer feature-count loop variable `i` is not clobbered
        for j, (feats, label) in enumerate(featuresets_test):
            refsets[label].add(j)
            observed1 = classifier1.classify(feats)
            observed2 = classifier2.classify(feats)
            observed3 = classifier3.classify(feats)
            testsets1[observed1].add(j)
            testsets2[observed2].add(j)
            testsets3[observed3].add(j)

        accuracy_list_nb.append(nltk.classify.accuracy(classifier1, featuresets_test))
        f_measure_list_nb.append(nltk.metrics.f_measure(refsets['not'], testsets1['not']))
        accuracy_list_svm.append(nltk.classify.accuracy(classifier3, featuresets_test))
        f_measure_list_svm.append(nltk.metrics.f_measure(refsets['not'], testsets3['not']))
        accuracy_list_maxent.append(nltk.classify.accuracy(classifier2, featuresets_test))
        f_measure_list_maxent.append(nltk.metrics.f_measure(refsets['not'], testsets2['not']))

################################################################################

    print "+-----------------------------------------------------------------+"
    print "\t\t\t\t\tbigram and unigram classification measurements"
    print "+-----------------------------------------------------------------+"
    print "\t\t\t\t\t\t\tmax accuracy \t number of features "
    print "Naive Bayes\t\t\t\t\t %f \t\t\t\t%d" % (max(accuracy_list_nb), (accuracy_list_nb.index(max(accuracy_list_nb))*20)+10)
    print "Maximum entropy\t\t\t\t %f \t\t\t\t%d" % (max(accuracy_list_maxent), (accuracy_list_maxent.index(max(accuracy_list_maxent))*20)+10)
    print "Support Vector Machine\t\t %f \t\t\t\t%d" % (max(accuracy_list_svm), (accuracy_list_svm.index(max(accuracy_list_svm))*20)+10)
    print "+-----------------------------------------------------------------+"
    print "+-----------------------------------------------------------------+"
    print "\t\t\t\t\t\t\tmax f-measure \t number of features "
    print "Naive Bayes\t\t\t\t\t %f \t\t\t\t%d" % (max(f_measure_list_nb), (f_measure_list_nb.index(max(f_measure_list_nb))*20)+10)
    print "Maximum entropy\t\t\t\t %f \t\t\t\t%d" % (max(f_measure_list_maxent), (f_measure_list_maxent.index(max(f_measure_list_maxent))*20)+10)
    print "Support Vector Machine\t\t %f \t\t\t\t%d" % (max(f_measure_list_svm), (f_measure_list_svm.index(max(f_measure_list_svm))+1)*20)
    print "+-----------------------------------------------------------------+"
################################################################################

    print " time taken for the classification process %f sec " % (time() - t0)
#####################################################################################################
    x_axis = [i for i in range(10, 200, 20)]
    plt.figure(facecolor='white')
    fig1, = plt.plot(x_axis, accuracy_list_nb, 'r*-', label='Naive bayes accuracy')
    fig2, = plt.plot(x_axis, f_measure_list_nb, 'ro-', label='Naive bayes f-measure')
    fig3, = plt.plot(x_axis, accuracy_list_svm, 'g*-', label='SVM accuracy')
    fig4, = plt.plot(x_axis, f_measure_list_svm, 'go-', label='SVM f-measure')
    fig5, = plt.plot(x_axis, accuracy_list_maxent, '*-', label='max Entropy accuracy')
    fig6, = plt.plot(x_axis, f_measure_list_maxent, 'o-', label='max Entropy f-measure')

    plt.xlabel('Number of features')
    plt.ylabel('Results')
    plt.title('Results of the classification using unigrams and bigrams')
    plt.legend(handles=[fig1, fig2, fig3, fig4, fig5, fig6], loc=4)
    plt.show()
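
FeatureExtraction and PatternsFeatures are project-specific helpers that are not shown in this example. The sketch below is one rough guess at how the unigram/bigram side of FeatureExtraction might work, using only nltk and collections; the function names and the binary presence encoding are assumptions rather than the original implementation.

import collections
import nltk

def most_frequent_unigrams(text, n):
    # n most common tokens in the whitespace-joined training corpus
    counts = collections.Counter(text.split())
    return [word for word, _ in counts.most_common(n)]

def most_frequent_bigrams(text, n):
    # n most common adjacent word pairs in the corpus
    counts = collections.Counter(nltk.bigrams(text.split()))
    return [bigram for bigram, _ in counts.most_common(n)]

def unigram_features(tweet_tokens, vocabulary):
    # binary presence feature for each selected unigram
    tokens = set(tweet_tokens)
    return dict(('has(%s)' % word, word in tokens) for word in vocabulary)

def bigram_features(tweet_tokens, bigram_vocabulary):
    # binary presence feature for each selected bigram
    present = set(nltk.bigrams(tweet_tokens))
    return dict(('has(%s %s)' % bigram, bigram in present) for bigram in bigram_vocabulary)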
Example #57
0
def maxent_classify(self, sentence):
    # method on a (subclass of) MaxentClassifier instance: build the feature
    # dict for one sentence and return the predicted label
    features = extract_features(sentence)
    label = MaxentClassifier.classify(self, features)
    print label
    return label
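
The method above looks like it was lifted out of a larger class (it references self, but the snippet shows no surrounding class). One plausible home for it, sketched here as a thin wrapper that holds a trained classifier; the class name, constructor arguments, and the injected feature extractor are assumptions.

from nltk.classify import MaxentClassifier

class SentimentModel(object):
    def __init__(self, labeled_featuresets, feature_extractor, max_iter=10):
        # labeled_featuresets: list of (feature_dict, label) pairs
        self.extract_features = feature_extractor
        self.classifier = MaxentClassifier.train(labeled_featuresets,
                                                 max_iter=max_iter)

    def maxent_classify(self, sentence):
        features = self.extract_features(sentence)
        return self.classifier.classify(features)
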
def word_features(words):
    return dict([(word, True) for word in words])


# Get all the movie reviews with positive data set and negative data set
posRev = movie_reviews.fileids('pos')
negRev = movie_reviews.fileids('neg')

# Mark the words in data set as positive and negative:
posWords = [(word_features(movie_reviews.words(fileids=[f])), 'pos') for f in posRev]
negWords = [(word_features(movie_reviews.words(fileids=[f])), 'neg') for f in negRev]

# Set cut off for separating the training data and the testing data:
posCutoff = len(posWords) * 50 / 100
negCutoff = len(negWords) * 50 / 100

# Fill the training data and the testing data with positive and negative data set:
# NLTK's MaxentClassifier works on a plain list of (featureset, label) pairs,
# so there is no need to wrap the data in a numpy array.
Test_set = posWords[posCutoff:] + negWords[negCutoff:]
Train_set = posWords[:posCutoff] + negWords[:negCutoff]
print 'train on %d instances, test on %d instances' % (len(Train_set), len(Test_set))

# Call Maximum Entropy classifier to classify the training data:
algo = MaxentClassifier.ALGORITHMS[0]  # first available algorithm ('GIS')
classifier = MaxentClassifier.train(Train_set, algorithm=algo, max_iter=3)
classifier.show_most_informative_features(10)

# Print the algorithm accuracy
print 'Accuracy is', util.accuracy(classifier, Test_set)
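
The script above reports only overall accuracy. Per-class precision, recall, and F-measure can be added with the same refsets/testsets bookkeeping used in the earlier examples; a short sketch, reusing the classifier and Test_set defined above:

import collections
import nltk.metrics

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(Test_set):
    refsets[label].add(i)
    testsets[classifier.classify(feats)].add(i)

for cls in ('pos', 'neg'):
    print cls, 'precision:', nltk.metrics.precision(refsets[cls], testsets[cls])
    print cls, 'recall:', nltk.metrics.recall(refsets[cls], testsets[cls])
    print cls, 'f-measure:', nltk.metrics.f_measure(refsets[cls], testsets[cls])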