Example #1
 def __init__(self, train_sents=None, pickle_name=None, save=False):
     '''Trains new tagger model or loads existing one.'''
     self._pickle_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), 'pickles', pickle_name)
     if train_sents is None:
         # Load existing model
         with open(self._pickle_path, 'rb') as f:
             self.classifier = pickle.load(f)
     else:
         train_set = []
         for tagged_sent in train_sents:
             untagged_sent = nltk.tag.untag(tagged_sent)
             history = []
             for i, (word, t) in enumerate(tagged_sent):
                 featureset = self.npchunk_features(untagged_sent, i,
                                                    history)
                 train_set.append((featureset, t))
                 history.append(t)
         nltk.config_megam(r'c:\progs\megam\megam.exe')
         self.classifier = nltk.MaxentClassifier.train(train_set,
                                                       algorithm='megam',
                                                       trace=0)
         if save:
             # save newly trained model with name specified
             with open(self._pickle_path, 'wb') as f:
                 pickle.dump(self.classifier, f)
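Example #1 shows only the constructor. As a point of reference, a tag() method in the style of the NLTK book's classifier-based chunk tagger (a sketch, assuming the same npchunk_features helper used during training) classifies tokens left to right and feeds each guess back in as history:

 def tag(self, sentence):
     '''Sketch: tag one POS-tagged sentence with the trained classifier.'''
     history = []
     for i, word in enumerate(sentence):
         featureset = self.npchunk_features(sentence, i, history)
         tag = self.classifier.classify(featureset)
         history.append(tag)
     return zip(sentence, history)
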
def main_function():
    conn_analysis = MySQLdb.connect(host=DATABASES['default']['HOST'],
                                    user=DATABASES['default']['USER'],
                                    passwd=DATABASES['default']['PASSWORD'],
                                    db=DATABASES['default']['NAME'])

    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)

    config_megam('/opt/packages')
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam",
                                        trace=0)

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    tweets = classify.get_tweets_to_classify(conn_analysis)

    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = classifier.classify(classify.process_tweet(text))
        update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    #For the tweets where polarity was determined manually, copy from
    #majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)

    print count_table
Example #3
def main_function():
	conn_analysis = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
			user=DATABASES['date_cutoff']['USER'], 
			passwd=DATABASES['date_cutoff']['PASSWORD'], 
			db=DATABASES['date_cutoff']['NAME'])

	training_tweets = classify.get_training_tweets(conn_analysis)
	training_feature_set = process_tweets(training_tweets)

	config_megam('/opt/packages')
	classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	error_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	count_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	guess_dict = {'+':0, '-':0, 'I':0, 'O':0} 

	full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, 
				'-':{'+':0, '-':0, 'I':0, 'O':0}, 
				'I':{'+':0, '-':0, 'I':0, 'O':0}, 
				'O':{'+':0, '-':0, 'I':0, 'O':0}}


	test_tweets = classify.get_test_tweets(conn_analysis)
	test_feature_set = process_tweets(test_tweets)

	classifier.show_most_informative_features(10)
	classifier_accuracy = accuracy(classifier, test_feature_set)
	print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
    conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                           user=DATABASES['date_cutoff']['USER'],
                           passwd=DATABASES['date_cutoff']['PASSWORD'],
                           db=DATABASES['date_cutoff']['NAME'])

    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, '+', total_word_count,
                                           best_words)
    training_feature_set += process_bigrams(conn, '-', total_word_count,
                                            best_words)
    training_feature_set += process_bigrams(conn, 'I', total_word_count,
                                            best_words)
    training_feature_set += process_bigrams(conn, 'O', total_word_count,
                                            best_words)

    config_megam('/opt/packages')
    #classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    classifier = NaiveBayesClassifier.train(training_feature_set)
    classifier.show_most_informative_features(10)

    test_tweets = classify.get_test_tweets(conn)
    test_feature_set = process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
	conn_analysis = MySQLdb.connect(host=DATABASES['default']['HOST'],
			user=DATABASES['default']['USER'], 
			passwd=DATABASES['default']['PASSWORD'], 
			db=DATABASES['default']['NAME'])

	training_tweets = classify.get_training_tweets(conn_analysis)
	training_feature_set = classify.process_tweets(training_tweets)

	config_megam('/opt/packages')
	classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	tweets = classify.get_tweets_to_classify(conn_analysis);

	for tweet in tweets:
		text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
		guess = classifier.classify(classify.process_tweet(text))
		update_tweet_polarity(tweet[0], guess, conn_analysis)
		count_table[guess] += 1

	#For the tweets where polarity was determined manually, copy from 
	#majority_vote to auto_vote
	fix_manual_tweets(conn_analysis)

	print count_table
def main_function():
    conn = MySQLdb.connect(host=DATABASES['default']['HOST'],
                           user=DATABASES['default']['USER'],
                           passwd=DATABASES['default']['PASSWORD'],
                           db=DATABASES['default']['NAME'])

    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)

    config_megam('/opt/packages')
    #classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    classifier = NaiveBayesClassifier.train(training_feature_set)
    #classifier.show_most_informative_features(50, show='pos')
    #classifier.show_most_informative_features(50, show='neg')

    #classifier.explain(training_feature_set[0][0])
    #print training_feature_set[0]

    error_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    count_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    guess_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    full_matrix = {
        '+': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        '-': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'I': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'O': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        }
    }

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    test_tweets = classify.get_test_tweets(conn)
    test_feature_set = classify.process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    #print count_table
    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
	conn = MySQLdb.connect(host=DATABASES['default']['HOST'], 
			user=DATABASES['default']['USER'], 
			passwd=DATABASES['default']['PASSWORD'], 
			db=DATABASES['default']['NAME'])

	training_tweets = classify.get_training_tweets(conn)
	training_feature_set = classify.process_tweets(training_tweets)

	bayes_classifier = NaiveBayesClassifier.train(training_feature_set)

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  

	test_tweets = classify.get_test_tweets(conn)

	for tweet in test_tweets:
		text = classify.get_tweet_text(conn, tweet[0])[0][0]
		guess = bayes_classifier.classify(classify.process_tweet(text))
		classify.update_tweet_polarity(tweet[0], guess, conn)
		count_table[guess] += 1

	print "Naive Bayes"
	print count_table

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	config_megam('/opt/packages')
	max_ent_classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	for tweet in test_tweets:
		text = classify.get_tweet_text(conn, tweet[0])[0][0]
		guess = max_ent_classifier.classify(classify.process_tweet(text))
		update_tweet_polarity_ensemble(tweet[0], guess, conn)
		count_table[guess] += 1

	print "Maximum Entropy"
	print count_table

	#generate the accuracy matrix
	full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, 
				'-':{'+':0, '-':0, 'I':0, 'O':0}, 
				'I':{'+':0, '-':0, 'I':0, 'O':0}, 
				'O':{'+':0, '-':0, 'I':0, 'O':0}}

	for tweet in test_tweets:
		result = classify.run_sql(conn, classify.Statements.CHECK_CONSENSUS % tweet[0])
		guess = result[0][0]

		actual_result = classify.run_sql(conn, classify.Statements.CHECK_MAJORITY % tweet[0])
		actual = actual_result[0][0]

		if guess is not None:
			if actual is not None:
				full_matrix[actual][guess] += 1

	print full_matrix
def demo():            
    import nltk
    try:
        nltk.config_megam('/usr/local/bin/megam')
        trainer = lambda x: nltk.MaxentClassifier.train(x, 'megam')
    except ValueError:
        try:
            trainer = lambda x: nltk.MaxentClassifier.train(x, 'BFGS')
        except ValueError:
            trainer = nltk.MaxentClassifier.train
    nltk.classify.rte_classifier(trainer)
Example #9
def demo():
    import nltk
    try:
        nltk.config_megam('/usr/local/bin/megam')
        trainer = lambda x: nltk.MaxentClassifier.train(x, 'megam')
    except ValueError:
        try:
            trainer = lambda x: nltk.MaxentClassifier.train(x, 'BFGS')
        except ValueError:
            trainer = nltk.MaxentClassifier.train
    nltk.classify.rte_classifier(trainer)
Example #10
 def __init__(self, tagger, chunked_sents):
     tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)] for sent in chunked_sents]
     train_set = []
     for tagged_sent in tagged_sents:
         untagged_sent = nltk.tag.untag(tagged_sent)
         history = []
         for i, (word, tag) in enumerate(tagged_sent):
             featureset = npchunk_features(untagged_sent, i, history)
             train_set.append((featureset, tag))
             history.append(tag)
     labels = set(label for (tok, label) in train_set)
     nltk.config_megam("./megam_i686.opt")
     self.classifier = nltk.MaxentClassifier.train(train_set, algorithm="megam")
     self.tagger = tagger
Example #11
    def __init__(self, train_sentences):
        nltk.config_megam('{}/../algorithms/megam-64.opt'.format(os.path.dirname(os.path.abspath(__file__))))

        train_set = []

        # train_sentences as: [[(('Confidence', 'NN'), 'B-NP'), (('in', 'IN'), 'O')..], ..]
        for tagged_sent in train_sentences:

            # untagged_sent: [(u'Experience', u'NN')]
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []

            for i, (word, tag) in enumerate(tagged_sent):
                feature_set = self.chunk_features(untagged_sent, i, history)
                train_set.append((feature_set, tag))
                history.append(tag)

        self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam', trace=0)
def main_function():
    conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                           user=DATABASES['date_cutoff']['USER'],
                           passwd=DATABASES['date_cutoff']['PASSWORD'],
                           db=DATABASES['date_cutoff']['NAME'])

    training_tweets = get_test_tweets(conn)
    #training_feature_set = process_tweets(training_tweets)

    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, '+', total_word_count,
                                           best_words)
    training_feature_set += process_bigrams(conn, '-', total_word_count,
                                            best_words)
    training_feature_set += process_bigrams(conn, 'I', total_word_count,
                                            best_words)
    training_feature_set += process_bigrams(conn, 'O', total_word_count,
                                            best_words)

    print "configuring megam"
    config_megam('/opt/packages')
    print "starting training"
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam",
                                        trace=0)
    print "starting end training"
    classifier.show_most_informative_features(40)

    test_tweets = get_training_tweets(conn)
    test_feature_set = process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    #full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0},
    #			'-':{'+':0, '-':0, 'I':0, 'O':0},
    #			'I':{'+':0, '-':0, 'I':0, 'O':0},
    #			'O':{'+':0, '-':0, 'I':0, 'O':0}}

    #for f in test_tweets:
    #	guess = classifier.classify(process_tweet(f[1]))
    #	full_matrix[f[2]][guess] += 1

    #print full_matrix
    print "classifier accuracy: " + repr(classifier_accuracy)
Example #13
def main_function():
    conn_analysis = MySQLdb.connect(host="localhost",
                                    user="******",
                                    passwd="tanzania",
                                    db="twitter_heart")

    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)

    tweets = classify.get_tweets_to_classify(conn_analysis)

    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    print "Naive Bayes"
    print count_table

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam",
                                                trace=0)

    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_max_ent_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    #For the tweets where polarity was determined manually, copy from
    #majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)

    print "Maximum Entropy"
    print count_table
Example #14
    def __init__(self, train_sentences):
        nltk.config_megam('{}/../algorithms/megam-64.opt'.format(
            os.path.dirname(os.path.abspath(__file__))))

        train_set = []

        # train_sentences as: [[(('Confidence', 'NN'), 'B-NP'), (('in', 'IN'), 'O')..], ..]
        for tagged_sent in train_sentences:

            # untagged_sent: [(u'Experience', u'NN')]
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []

            for i, (word, tag) in enumerate(tagged_sent):
                feature_set = self.chunk_features(untagged_sent, i, history)
                train_set.append((feature_set, tag))
                history.append(tag)

        self.classifier = nltk.MaxentClassifier.train(train_set,
                                                      algorithm='megam',
                                                      trace=0)
def main_function():
    conn = MySQLdb.connect(
        host=DATABASES["date_cutoff"]["HOST"],
        user=DATABASES["date_cutoff"]["USER"],
        passwd=DATABASES["date_cutoff"]["PASSWORD"],
        db=DATABASES["date_cutoff"]["NAME"],
    )

    training_tweets = get_test_tweets(conn)
    # training_feature_set = process_tweets(training_tweets)

    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, "+", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "-", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "I", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "O", total_word_count, best_words)

    print "configuring megam"
    config_megam("/opt/packages")
    print "starting training"
    classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    print "starting end training"
    classifier.show_most_informative_features(40)

    test_tweets = get_training_tweets(conn)
    test_feature_set = process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    # full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0},
    # 			'-':{'+':0, '-':0, 'I':0, 'O':0},
    # 			'I':{'+':0, '-':0, 'I':0, 'O':0},
    # 			'O':{'+':0, '-':0, 'I':0, 'O':0}}

    # for f in test_tweets:
    # 	guess = classifier.classify(process_tweet(f[1]))
    # 	full_matrix[f[2]][guess] += 1

    # print full_matrix
    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
	conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'], 
			user=DATABASES['date_cutoff']['USER'], 
			passwd=DATABASES['date_cutoff']['PASSWORD'], 
			db=DATABASES['date_cutoff']['NAME'])

	total_word_count = total_words(conn)
	training_feature_set = process_bigrams(conn, '+', total_word_count, best_words)
	training_feature_set += process_bigrams(conn, '-', total_word_count, best_words)
	training_feature_set += process_bigrams(conn, 'I', total_word_count, best_words)
	training_feature_set += process_bigrams(conn, 'O', total_word_count, best_words)

	config_megam('/opt/packages')
	#classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
	classifier = NaiveBayesClassifier.train(training_feature_set)
	classifier.show_most_informative_features(10)

	test_tweets = classify.get_test_tweets(conn)
	test_feature_set = process_tweets(test_tweets)

	classifier_accuracy = accuracy(classifier, test_feature_set)

	print "classifier accuracy: " + repr(classifier_accuracy)
def nltk_maxent_pos_tagger(input_dict):
    name = 'MaxentPosTagger'
    if not input_dict['training_corpus']:
        maxent_tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
        name += '-pretrained'
    else:
        nltk.config_megam(settings.MEGAM_EXECUTABLE_PATH)

        maxent_tagger = MaxentPosTagger()
        chunk = input_dict['training_corpus']['chunk']
        corpus = input_dict['training_corpus']['corpus']
        training_corpus=corpus_reader(corpus, chunk)
        if training_corpus:
            maxent_tagger.train(training_corpus)
        else:
            raise AttributeError

    return {'pos_tagger': {
                'function':'tag_sents',
                'object': maxent_tagger,
                'name': name
            }
    }
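As a hedged usage sketch (not part of the original widget code): calling the function above without a training corpus takes the pretrained-Treebank branch, and the returned dictionary exposes the tagger object along with the advertised tag_sents entry point:

result = nltk_maxent_pos_tagger({'training_corpus': None})
tagger = result['pos_tagger']['object']
# tag_sents expects a list of token lists
print(tagger.tag_sents([['Megam', 'speeds', 'up', 'maxent', 'training']]))
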
def main_function():
	conn = MySQLdb.connect(host=DATABASES['default']['HOST'], 
			user=DATABASES['default']['USER'], 
			passwd=DATABASES['default']['PASSWORD'], 
			db=DATABASES['default']['NAME'])

	training_tweets = classify.get_training_tweets(conn)
	training_feature_set = classify.process_tweets(training_tweets)

	config_megam('/opt/packages')
	#classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
	classifier = NaiveBayesClassifier.train(training_feature_set)
	#classifier.show_most_informative_features(50, show='pos')
	#classifier.show_most_informative_features(50, show='neg')

	#classifier.explain(training_feature_set[0][0])
	#print training_feature_set[0]

	error_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	count_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	guess_dict = {'+':0, '-':0, 'I':0, 'O':0} 

	full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, 
				'-':{'+':0, '-':0, 'I':0, 'O':0}, 
				'I':{'+':0, '-':0, 'I':0, 'O':0}, 
				'O':{'+':0, '-':0, 'I':0, 'O':0}}

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  

	test_tweets = classify.get_test_tweets(conn)
	test_feature_set = classify.process_tweets(test_tweets)

	classifier_accuracy = accuracy(classifier, test_feature_set)

	#print count_table
	print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
	conn_analysis = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter_heart")

	training_tweets = classify.get_training_tweets(conn_analysis)
	training_feature_set = classify.process_tweets(training_tweets)

	tweets = classify.get_tweets_to_classify(conn_analysis);

	bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
	count_table = {'+':0, '-':0, 'I':0, 'O':0}  

	for tweet in tweets:
		text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
		guess = bayes_classifier.classify(classify.process_tweet(text))
		classify.update_tweet_polarity(tweet[0], guess, conn_analysis)
		count_table[guess] += 1

	print "Naive Bayes"
	print count_table

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	config_megam('/opt/packages')
	max_ent_classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	for tweet in tweets:
		text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
		guess = max_ent_classifier.classify(classify.process_tweet(text))
		update_max_ent_polarity(tweet[0], guess, conn_analysis)
		count_table[guess] += 1

	#For the tweets where polarity was determined manually, copy from 
	#majority_vote to auto_vote
	fix_manual_tweets(conn_analysis)

	print "Maximum Entropy"
	print count_table
Example #20
# an implementation of the probabilistic chunker from NLTK
import nltk
import cPickle
nltk.config_megam('/home/chris/programs/megam_0.92/megam')


def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))


def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
    if i == len(sentence) - 1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i + 1]
    return {
        "pos": pos,
        "word": word,
        "prevpos": prevpos,
        "nextpos": nextpos,
most_suggestive_words = [w.rstrip() for w in words]

f = open('../../data_files/yelp_sent_pos_text.txt')
lines = f.readlines()
f.close()
tips_pos = []
for line in lines:
	tips_pos.append( eval(line) )

print 'Building feature set...', datetime.datetime.now()
count = 0
featuresets = []
for tip, tag in tips_pos:
	features = doc_word_presence(tip)
	featuresets.append( (features, tag) )
	count += 1
	if count == int(len(tips_pos)):
		break

size = int(len(featuresets)/2)
train_set, test_set = featuresets[:size], featuresets[size:]
print 'Training classifier...', datetime.datetime.now()
#classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.config_megam('/Users/admin/Downloads/megam_0.92/megam')
classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam')
print 'Finished training classifier', datetime.datetime.now()
print nltk.classify.accuracy(classifier, test_set)

show_confusion_matrix(classifier, test_set)
write_probdist_for(classifier, '../../results/prob_dist/top_word_presence.txt')
# Natural Language Toolkit: code_classifier_chunker
import nltk

nltk.config_megam('/Users/nishantagarwal/Documents/Projects/NLP')
def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    if i == len(sentence)-1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i+1]
    return {"pos": pos,
            "word": word,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}
Example #23
#!/usr/bin/python
# -*- coding: utf-8 -*-

import MySQLdb as mdb
import sys
import re
from nltk import config_megam
from nltk import MaxentClassifier
from nltk import classify
from numpy import array_split
import os.path

config_megam('/var/www/test/tagging/tools/db/tags/classifier/MEGAM/megam-64.opt')

algorithm = 'MEGAM'



def getClassesDict():
    f = open('hindiclasses.sorted.txt')
    classesDict = dict()
    for line in f:
        fields = line.split()
        classesDict[fields[0]] = fields[1]
    f.close()
    return classesDict

classesDict = getClassesDict();


def extractWordFeature(wordDict, sentence, i):
most_suggestive_words = most_freq_pos_words.union(most_freq_neg_words).union(most_freq_unk_words)

f = open("../../data_files/yelp_sent_pos_text.txt")
lines = f.readlines()
f.close()
tips_pos = []
for line in lines:
    tips_pos.append(eval(line))

print "Building feature set...", datetime.datetime.now()
count = 0
featuresets = []
for tip, tag in tips_pos:
    features = doc_word_presence(tip)
    featuresets.append((features, tag))
    count += 1
    if count == int(len(tips_pos)):
        break

size = int(len(featuresets) / 2)
train_set, test_set = featuresets[:size], featuresets[size:]
print "Training classifier...", datetime.datetime.now()
# classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.config_megam("/Users/admin/Downloads/megam_0.92/megam")
classifier = nltk.MaxentClassifier.train(train_set, algorithm="megam")
print "Finished training classifier", datetime.datetime.now()
print nltk.classify.accuracy(classifier, test_set)

show_confusion_matrix(classifier, test_set)
write_probdist_for(classifier, "../../results/prob_dist/top_word_presence.txt")
Example #25
#!/usr/bin/python

import nltk
#nltk.config_megam('/home/aritter/local/bin/megam')
nltk.config_megam('/usr/local/bin/megam')

import sys
import math
import random

MAX_NGRAM=6

class StyleMetric:
    source_vector = {}
    target_vector = {}

    training = []

    def __init__(self, corpus_source, corpus_target):
        #Read in Source Corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1,MAX_NGRAM):
                for i in range(len(words)-gram+1):
                    ngram = " ".join(words[i:i+gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
Example #26
    Training: ClassifierTraining
    Prediction: Classifier
"""
import sys
sys.path.append('../')
import argparse
import cPickle as p
import operator
import os
import utils

from ERG import AMR

import nltk
from nltk.classify import MaxentClassifier, accuracy
nltk.config_megam("/usr/local/bin/megam.opt")
from scipy.stats import rankdata


class ClassifierTraining(object):
    def __init__(self, ftrain, fdev, ftest, wdir, delexicalized=True):
        self.train_amrs, self.dev_amrs, self.test_amrs = [], [], []
        self.delexicalized = delexicalized

        print 'PARSING...'
        for f in os.listdir(ftrain):
            self.train_amrs.extend(self.parse(os.path.join(ftrain, f)))

        for f in os.listdir(fdev):
            self.dev_amrs.extend(self.parse(os.path.join(fdev, f)))
Example #27
#!/usr/bin/python

import nltk
nltk.config_megam('/home/aritter/local/bin/megam')

import sys
import math
import random

MAX_NGRAM = 6


class StyleMetric:
    source_vector = {}
    target_vector = {}

    training = []

    def __init__(self, corpus_source, corpus_target):
        #Read in Source Corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1, MAX_NGRAM):
                for i in range(len(words) - gram + 1):
                    ngram = " ".join(words[i:i + gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
Example #28
import nltk
import os
from cPickle import load, dump

from nltk.corpus import conll2000

from settings import ROOT

chunker_path = ROOT + 'vendor/parsers/consecutive_np_chunker.pk1'
megam_path = ROOT + 'vendor/megam_i686.opt'
nltk.config_megam(ROOT + 'vendor/megam_i686.opt')

# Natural Language Toolkit: code_classifier_chunker
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {'pos': pos, 'word': word, 'prevpos': prevpos}


class ConsecutiveNPChunkTagger(nltk.TaggerI): # [_consec-chunk-tagger]

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history) # [_consec-use-fe]
Example #29
# Natural Language Toolkit: code_classifier_chunker
import nltk

nltk.config_megam('/Users/nishantagarwal/Documents/Projects/NLP')


def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))


def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
    if i == len(sentence) - 1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i + 1]
    return {
        "pos": pos,
        "word": word,
        "prevpos": prevpos,
        "nextpos": nextpos,
Example #30
 def test_rte_classification_with_megam(self):
     try:
         config_megam()
     except (LookupError, AttributeError) as e:
         pytest.skip("Skipping tests with dependencies on MEGAM")
     clf = rte_classifier("megam", sample_N=100)
import time
import re
from collections import defaultdict

from nltk import TaggerI, FreqDist, untag, config_megam
from nltk.classify.maxent import MaxentClassifier

PATH_TO_MEGAM_EXECUTABLE = "/usr/bin/megam"
config_megam(PATH_TO_MEGAM_EXECUTABLE)


class MaxentPosTagger(TaggerI):
    def train(self,
              train_sents,
              algorithm='megam',
              rare_word_cutoff=5,
              rare_feat_cutoff=5,
              uppercase='[A-Z]',
              trace=3,
              **cutoffs):

        self.uppercase = uppercase
        self.word_freqdist = self.word_freqs(train_sents)
        self.featuresets = self.featsets(train_sents, rare_word_cutoff)
        self.features_freqdist = self.gen_the_feat_freqs(self.featuresets)
        self.cut_rare_feats(self.featuresets, rare_feat_cutoff)

        t1 = time.time()
        self.classifier = MaxentClassifier.train(self.featuresets, algorithm,
                                                 trace, **cutoffs)
        t2 = time.time()
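A minimal, hedged training sketch for the MaxentPosTagger above; it assumes megam is configured as shown, that Penn Treebank tagged sentences are acceptable input, and that the helper methods and tag() referenced by the full class (cut off in this truncated snippet) are available. The slice size is arbitrary:

from nltk.corpus import treebank

pos_tagger = MaxentPosTagger()
pos_tagger.train(treebank.tagged_sents()[:500], trace=0)
print(pos_tagger.tag(['Megam', 'keeps', 'maxent', 'training', 'fast']))
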
import time
import collections
import pickle
from nltk.corpus import stopwords
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from svmutil import *
import itertools
import math
import sys
import nltk
from pymongo import MongoClient

def remove_stopwords(text):
    r = stopwords.words('english')
    r.append('rt')
    return [w for w in text if not w in r]

nltk.config_megam('.')

start_time = time.time()
connection = MongoClient('localhost', 27017)
db = connection.local
sad_col = db['neg_emoticons']
hap_col = db['pos_emoticons']

h, s = [], []
s = sad_col.find()
h = hap_col.find()

pos_tweets, neg_tweets = [], []

if len(sys.argv) > 2:
    count = int(sys.argv[2]) / 2
Example #33
def active_megam():
	if nltk.classify.megam._megam_bin is None:
		import os
		path = os.getcwd()
		nltk.config_megam(path+'/megam/megam-64.opt')
def main_function():
    conn = MySQLdb.connect(host=DATABASES['ensemble']['HOST'],
                           user=DATABASES['ensemble']['USER'],
                           passwd=DATABASES['ensemble']['PASSWORD'],
                           db=DATABASES['ensemble']['NAME'])

    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)

    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    test_tweets = classify.get_test_tweets(conn)

    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn)
        count_table[guess] += 1

    print "Naive Bayes"
    print count_table

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam",
                                                trace=0)

    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_tweet_polarity_ensemble(tweet[0], guess, conn)
        count_table[guess] += 1

    print "Maximum Entropy"
    print count_table

    #generate the accuracy matrix
    full_matrix = {
        '+': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        '-': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'I': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'O': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        }
    }

    for tweet in test_tweets:
        result = classify.run_sql(
            conn, classify.Statements.CHECK_CONSENSUS % tweet[0])
        guess = result[0][0]

        actual_result = classify.run_sql(
            conn, classify.Statements.CHECK_MAJORITY % tweet[0])
        actual = actual_result[0][0]

        if guess is not None:
            if actual is not None:
                full_matrix[actual][guess] += 1

    print full_matrix
Example #35
#!/usr/bin/python

import nltk
nltk.config_megam('/home/aritter/local/bin/megam')


import sys
import math
import random

MAX_NGRAM=6

class StyleMetric:
    source_vector = {}
    target_vector = {}

    training = []

    def __init__(self, corpus_source, corpus_target):
        #Read in Source Corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1,MAX_NGRAM):
                for i in range(len(words)-gram+1):
                    ngram = " ".join(words[i:i+gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
Example #36
        pd.set_option('display.max_rows', None)
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', None)
        pd.set_option('display.max_colwidth', None)
        df = pd.DataFrame(confusionMatrix)
        df.columns = TAGS
        df.index = TAGS
        df.style
        print(df)
        print("Accuracy :", 100 * corr / total)


if __name__ == "__main__":

    nltk.download("conll2000")
    nltk.config_megam('./megam-64.opt')
    corpus_train = nltk.corpus.conll2000.chunked_sents('train.txt')
    corpus_test = nltk.corpus.conll2000.chunked_sents("test.txt")

    def preprocess(sent):
        tgs = nltk.tree2conlltags(sent)
        tgs = [(w, pos, t[0]) for w, pos, t in tgs]
        return tgs

    TRAIN_DATA = [preprocess(sent) for sent in corpus_train]
    TEST_DATA = [preprocess(sent) for sent in corpus_test]

    cP = ChunkTagger(TRAIN_DATA)
    cP.evaluate(TEST_DATA)

    # TRAIN_DATA_FILE = open("../assignment2dataset/train.txt","r")
Example #37
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)


class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w, t), c) for (w, t, c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.util.conlltags2tree(conlltags)


if __name__ == '__main__':
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    #unigram_chunker = BigramChunker(train_sents)
    # unigram_chunker = UnigramChunker(train_sents)
    #print unigram_chunker.evaluate(test_sents)
    nltk.config_megam(bin='/home/alpha/work/env/megam_0.92/./megam')
    chunker = ConsecutiveNPChunker(train_sents)
    print chunker.evaluate(test_sents)
Example #38
import nltk
import collections
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
from nltk import config_megam
import re
import numpy as np

#nltk.download('punkt')
PATH_TO_MEGAM_EXECUTABLE = "./MEGAM/megam-64"
config_megam(PATH_TO_MEGAM_EXECUTABLE)

def get_pos_tagger():
    train_sents = treebank.tagged_sents()
    tagger = nltk.TrigramTagger(train_sents, backoff=
        nltk.BigramTagger(train_sents, backoff=
        nltk.UnigramTagger(train_sents, backoff=
        nltk.DefaultTagger("NN"))))
    return tagger

def get_ner_items():
    items = []
    with open("3c.txt", 'r') as f: 
        for line in f: 
            items.append(word_tokenize(line[:-1]))
    return items

def get_ner_words(ner_items):
    ner_words = set()
    for ner_item in ner_items:
        for ner_word in ner_item:
    such as persons, locations and organizations in a given document.

    For the purposes of training, an external maximum entropy model (megam)
    is used.

After the chunker has been created, it is pickled for subsequent use.
"""

import nltk
import logging
import pickle

MEGAM_FOLDER = 'topics/megam_0.92/megam'

try:
    nltk.config_megam(MEGAM_FOLDER)
except LookupError:
    nltk.config_megam('megam_0.92/megam')


# define logging configuration
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s')


class BigramChunker(nltk.ChunkParserI):
    """This class defines a bigram chunker"""
    def __init__(self, train_sents):
        """Construct a new BigramChunker instance.
            :param train_sents: Array of sentences with named entities tagged
def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))


test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])

#train and saving the classifier
nltk.config_megam('megam.exe')
chunker = ConsecutiveNPChunker(train_sents)

from pickle import dump
output = open("chunkModelSVMFeat1.pkl", "wb")
dump(chunker, output, -1)
output.close()
results = chunker.evaluate(test_sents)

print()
#print(results.incorrect())
print("-----------------------------")
#print(results.missed())
print("----------------------------")
#print(chunker.tagger.classifier.show_most_informative_features(n=20,show="all"))
#print(chunker.classifier.explain(featureset,columns=4))
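To complement the dump above, a hedged sketch of loading the pickled chunker back in a later session and parsing a single POS-tagged sentence (the sentence itself is made up):

from pickle import load

with open("chunkModelSVMFeat1.pkl", "rb") as f:
    chunker = load(f)
print(chunker.parse([('the', 'DT'), ('little', 'JJ'), ('dog', 'NN'), ('barked', 'VBD')]))
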
        for sentence_tree in tree:
            eval_corpus.append(split_tree_tokens(sentence_tree))
    # Evaluate model
    print 'Evaluating...'
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_corpus):
        guessed = chunker.parse(correct.leaves())
        guessed = chunker._parse_to_tagged(guessed)
        chunkscore.score(correct, guessed)
        if i < 3:
            cmp_chunks(correct, guessed)
    print chunkscore
    return chunker


_EVENT_TRAIN_DATA_PATH = resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/events/train')
                                              
_EVENT_EVAL_DATA_PATH = resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/events/eval')

config_megam(resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/megam_i686.opt'))


if __name__ == "__main__":
    chunker = build_event_chunking_model()
    nltk.download()

in the Python interpreter. Proper usage of demo() and all other functions and
methods is described below.
"""

import time
import re
import pickle

from collections import defaultdict
from nltk import TaggerI, FreqDist, untag, config_megam
from nltk.classify.maxent import MaxentClassifier
from nltk.corpus.reader.conll import ConllCorpusReader

config_megam('/home/dsbatista/megam_i686.opt')


class MaxentPosTagger(TaggerI):
    """
    MaxentPosTagger is a part-of-speech tagger based on Maximum Entropy models.
    """
    def train(self, train_sents, algorithm='megam', rare_word_cutoff=5,
              rare_feat_cutoff=5, uppercase_letters='[A-Z]', trace=3,
              **cutoffs):
        """
        MaxentPosTagger trains a Maximum Entropy model from a C{list} of tagged
        sentences.

        @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
        @param train_sents: A list of tagged sentences. Each sentence is
Example #43
    such as persons, locations and organizations in a given document.

    For the purposes of training, an external maximum entropy model (megam)
    is used.

After the chunker has been created, it is pickled for subsequent use.
"""

import nltk
import logging
import pickle

MEGAM_FOLDER = 'topics/megam_0.92/megam'

try:
    nltk.config_megam(MEGAM_FOLDER)
except LookupError:
    nltk.config_megam('megam_0.92/megam')

# define logging configuration
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s')


class BigramChunker(nltk.ChunkParserI):
    """This class defines a bigram chunker"""
    def __init__(self, train_sents):
        """Construct a new BigramChunker instance.
            :param train_sents: Array of sentences with named entities tagged

        """