Example #1
 def __init__(self, train_sents=None, pickle_name=None, save=False):
     '''Trains new tagger model or loads existing one.'''
     self._pickle_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), 'pickles', pickle_name)
     if train_sents is None:
         # Load existing model
         with open(self._pickle_path, 'rb') as f:
             self.classifier = pickle.load(f)
     else:
         train_set = []
         for tagged_sent in train_sents:
             untagged_sent = nltk.tag.untag(tagged_sent)
             history = []
             for i, (word, t) in enumerate(tagged_sent):
                 featureset = self.npchunk_features(untagged_sent, i,
                                                    history)
                 train_set.append((featureset, t))
                 history.append(t)
         nltk.config_megam(r'c:\progs\megam\megam.exe')
         self.classifier = nltk.MaxentClassifier.train(train_set,
                                                       algorithm='megam',
                                                       trace=0)
         if save:
             # save newly trained model with name specified
             with open(self._pickle_path, 'wb') as f:
                 pickle.dump(self.classifier, f)
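Example #1 shows only the constructor. As a point of reference, a tag() method in the style of the NLTK book's classifier-based chunk tagger (a sketch, assuming the same npchunk_features helper used during training) classifies tokens left to right and feeds each guess back in as history:

 def tag(self, sentence):
     '''Sketch: tag one POS-tagged sentence with the trained classifier.'''
     history = []
     for i, word in enumerate(sentence):
         featureset = self.npchunk_features(sentence, i, history)
         tag = self.classifier.classify(featureset)
         history.append(tag)
     return zip(sentence, history)
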
def main_function():
    conn_analysis = MySQLdb.connect(host=DATABASES['default']['HOST'],
                                    user=DATABASES['default']['USER'],
                                    passwd=DATABASES['default']['PASSWORD'],
                                    db=DATABASES['default']['NAME'])

    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)

    config_megam('/opt/packages')
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam",
                                        trace=0)

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    tweets = classify.get_tweets_to_classify(conn_analysis)

    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = classifier.classify(classify.process_tweet(text))
        update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    #For the tweets where polarity was determined manually, copy from
    #majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)

    print count_table
Example #3
def main_function():
	conn_analysis = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
			user=DATABASES['date_cutoff']['USER'], 
			passwd=DATABASES['date_cutoff']['PASSWORD'], 
			db=DATABASES['date_cutoff']['NAME'])

	training_tweets = classify.get_training_tweets(conn_analysis)
	training_feature_set = process_tweets(training_tweets)

	config_megam('/opt/packages')
	classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	error_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	count_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	guess_dict = {'+':0, '-':0, 'I':0, 'O':0} 

	full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, 
				'-':{'+':0, '-':0, 'I':0, 'O':0}, 
				'I':{'+':0, '-':0, 'I':0, 'O':0}, 
				'O':{'+':0, '-':0, 'I':0, 'O':0}}


	test_tweets = classify.get_test_tweets(conn_analysis)
	test_feature_set = process_tweets(test_tweets)

	classifier.show_most_informative_features(10)
	classifier_accuracy = accuracy(classifier, test_feature_set)
	print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
    conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                           user=DATABASES['date_cutoff']['USER'],
                           passwd=DATABASES['date_cutoff']['PASSWORD'],
                           db=DATABASES['date_cutoff']['NAME'])

    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, '+', total_word_count,
                                           best_words)
    training_feature_set += process_bigrams(conn, '-', total_word_count,
                                            best_words)
    training_feature_set += process_bigrams(conn, 'I', total_word_count,
                                            best_words)
    training_feature_set += process_bigrams(conn, 'O', total_word_count,
                                            best_words)

    config_megam('/opt/packages')
    #classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    classifier = NaiveBayesClassifier.train(training_feature_set)
    classifier.show_most_informative_features(10)

    test_tweets = classify.get_test_tweets(conn)
    test_feature_set = process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
	conn_analysis = MySQLdb.connect(host=DATABASES['default']['HOST'],
			user=DATABASES['default']['USER'], 
			passwd=DATABASES['default']['PASSWORD'], 
			db=DATABASES['default']['NAME'])

	training_tweets = classify.get_training_tweets(conn_analysis)
	training_feature_set = classify.process_tweets(training_tweets)

	config_megam('/opt/packages')
	classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	tweets = classify.get_tweets_to_classify(conn_analysis);

	for tweet in tweets:
		text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
		guess = classifier.classify(classify.process_tweet(text))
		update_tweet_polarity(tweet[0], guess, conn_analysis)
		count_table[guess] += 1

	#For the tweets where polarity was determined manually, copy from 
	#majority_vote to auto_vote
	fix_manual_tweets(conn_analysis)

	print count_table
def main_function():
    conn = MySQLdb.connect(host=DATABASES['default']['HOST'],
                           user=DATABASES['default']['USER'],
                           passwd=DATABASES['default']['PASSWORD'],
                           db=DATABASES['default']['NAME'])

    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)

    config_megam('/opt/packages')
    #classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    classifier = NaiveBayesClassifier.train(training_feature_set)
    #classifier.show_most_informative_features(50, show='pos')
    #classifier.show_most_informative_features(50, show='neg')

    #classifier.explain(training_feature_set[0][0])
    #print training_feature_set[0]

    error_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    count_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    guess_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    full_matrix = {
        '+': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        '-': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'I': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'O': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        }
    }

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    test_tweets = classify.get_test_tweets(conn)
    test_feature_set = classify.process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    #print count_table
    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
	conn = MySQLdb.connect(host=DATABASES['default']['HOST'], 
			user=DATABASES['default']['USER'], 
			passwd=DATABASES['default']['PASSWORD'], 
			db=DATABASES['default']['NAME'])

	training_tweets = classify.get_training_tweets(conn)
	training_feature_set = classify.process_tweets(training_tweets)

	bayes_classifier = NaiveBayesClassifier.train(training_feature_set)

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  

	test_tweets = classify.get_test_tweets(conn)

	for tweet in test_tweets:
		text = classify.get_tweet_text(conn, tweet[0])[0][0]
		guess = bayes_classifier.classify(classify.process_tweet(text))
		classify.update_tweet_polarity(tweet[0], guess, conn)
		count_table[guess] += 1

	print "Naive Bayes"
	print count_table

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	config_megam('/opt/packages')
	max_ent_classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	for tweet in test_tweets:
		text = classify.get_tweet_text(conn, tweet[0])[0][0]
		guess = max_ent_classifier.classify(classify.process_tweet(text))
		update_tweet_polarity_ensemble(tweet[0], guess, conn)
		count_table[guess] += 1

	print "Maximum Entropy"
	print count_table

	#generate the accuracy matrix
	full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, 
				'-':{'+':0, '-':0, 'I':0, 'O':0}, 
				'I':{'+':0, '-':0, 'I':0, 'O':0}, 
				'O':{'+':0, '-':0, 'I':0, 'O':0}}

	for tweet in test_tweets:
		result = classify.run_sql(conn, classify.Statements.CHECK_CONSENSUS % tweet[0])
		guess = result[0][0]

		actual_result = classify.run_sql(conn, classify.Statements.CHECK_MAJORITY % tweet[0])
		actual = actual_result[0][0]

		if guess is not None:
			if actual is not None:
				full_matrix[actual][guess] += 1

	print full_matrix
def demo():            
    import nltk
    try:
        nltk.config_megam('/usr/local/bin/megam')
        trainer = lambda x: nltk.MaxentClassifier.train(x, 'megam')
    except ValueError:
        try:
            trainer = lambda x: nltk.MaxentClassifier.train(x, 'BFGS')
        except ValueError:
            trainer = nltk.MaxentClassifier.train
    nltk.classify.rte_classifier(trainer)
Example #9
def demo():
    import nltk
    try:
        nltk.config_megam('/usr/local/bin/megam')
        trainer = lambda x: nltk.MaxentClassifier.train(x, 'megam')
    except ValueError:
        try:
            trainer = lambda x: nltk.MaxentClassifier.train(x, 'BFGS')
        except ValueError:
            trainer = nltk.MaxentClassifier.train
    nltk.classify.rte_classifier(trainer)
Example #10
 def __init__(self, tagger, chunked_sents):
     tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)] for sent in chunked_sents]
     train_set = []
     for tagged_sent in tagged_sents:
         untagged_sent = nltk.tag.untag(tagged_sent)
         history = []
         for i, (word, tag) in enumerate(tagged_sent):
             featureset = npchunk_features(untagged_sent, i, history)
             train_set.append((featureset, tag))
             history.append(tag)
     labels = set(label for (tok, label) in train_set)
     nltk.config_megam("./megam_i686.opt")
     self.classifier = nltk.MaxentClassifier.train(train_set, algorithm="megam")
     self.tagger = tagger
Example #11
    def __init__(self, train_sentences):
        nltk.config_megam('{}/../algorithms/megam-64.opt'.format(os.path.dirname(os.path.abspath(__file__))))

        train_set = []

        # train_sentences as: [[(('Confidence', 'NN'), 'B-NP'), (('in', 'IN'), 'O')..], ..]
        for tagged_sent in train_sentences:

            # untagged_sent: [(u'Experience', u'NN')]
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []

            for i, (word, tag) in enumerate(tagged_sent):
                feature_set = self.chunk_features(untagged_sent, i, history)
                train_set.append((feature_set, tag))
                history.append(tag)

        self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam', trace=0)
def main_function():
    conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                           user=DATABASES['date_cutoff']['USER'],
                           passwd=DATABASES['date_cutoff']['PASSWORD'],
                           db=DATABASES['date_cutoff']['NAME'])

    training_tweets = get_test_tweets(conn)
    #training_feature_set = process_tweets(training_tweets)

    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, '+', total_word_count,
                                           best_words)
    training_feature_set += process_bigrams(conn, '-', total_word_count,
                                            best_words)
    training_feature_set += process_bigrams(conn, 'I', total_word_count,
                                            best_words)
    training_feature_set += process_bigrams(conn, 'O', total_word_count,
                                            best_words)

    print "configuring megam"
    config_megam('/opt/packages')
    print "starting training"
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam",
                                        trace=0)
    print "starting end training"
    classifier.show_most_informative_features(40)

    test_tweets = get_training_tweets(conn)
    test_feature_set = process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    #full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0},
    #			'-':{'+':0, '-':0, 'I':0, 'O':0},
    #			'I':{'+':0, '-':0, 'I':0, 'O':0},
    #			'O':{'+':0, '-':0, 'I':0, 'O':0}}

    #for f in test_tweets:
    #	guess = classifier.classify(process_tweet(f[1]))
    #	full_matrix[f[2]][guess] += 1

    #print full_matrix
    print "classifier accuracy: " + repr(classifier_accuracy)
Example #13
def main_function():
    conn_analysis = MySQLdb.connect(host="localhost",
                                    user="******",
                                    passwd="tanzania",
                                    db="twitter_heart")

    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)

    tweets = classify.get_tweets_to_classify(conn_analysis)

    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    print "Naive Bayes"
    print count_table

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam",
                                                trace=0)

    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_max_ent_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    #For the tweets where polarity was determined manually, copy from
    #majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)

    print "Maximum Entropy"
    print count_table
Example #14
    def __init__(self, train_sentences):
        nltk.config_megam('{}/../algorithms/megam-64.opt'.format(
            os.path.dirname(os.path.abspath(__file__))))

        train_set = []

        # train_sentences as: [[(('Confidence', 'NN'), 'B-NP'), (('in', 'IN'), 'O')..], ..]
        for tagged_sent in train_sentences:

            # untagged_sent: [(u'Experience', u'NN')]
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []

            for i, (word, tag) in enumerate(tagged_sent):
                feature_set = self.chunk_features(untagged_sent, i, history)
                train_set.append((feature_set, tag))
                history.append(tag)

        self.classifier = nltk.MaxentClassifier.train(train_set,
                                                      algorithm='megam',
                                                      trace=0)
def main_function():
    conn = MySQLdb.connect(
        host=DATABASES["date_cutoff"]["HOST"],
        user=DATABASES["date_cutoff"]["USER"],
        passwd=DATABASES["date_cutoff"]["PASSWORD"],
        db=DATABASES["date_cutoff"]["NAME"],
    )

    training_tweets = get_test_tweets(conn)
    # training_feature_set = process_tweets(training_tweets)

    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, "+", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "-", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "I", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "O", total_word_count, best_words)

    print "configuring megam"
    config_megam("/opt/packages")
    print "starting training"
    classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    print "starting end training"
    classifier.show_most_informative_features(40)

    test_tweets = get_training_tweets(conn)
    test_feature_set = process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    # full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0},
    # 			'-':{'+':0, '-':0, 'I':0, 'O':0},
    # 			'I':{'+':0, '-':0, 'I':0, 'O':0},
    # 			'O':{'+':0, '-':0, 'I':0, 'O':0}}

    # for f in test_tweets:
    # 	guess = classifier.classify(process_tweet(f[1]))
    # 	full_matrix[f[2]][guess] += 1

    # print full_matrix
    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
	conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'], 
			user=DATABASES['date_cutoff']['USER'], 
			passwd=DATABASES['date_cutoff']['PASSWORD'], 
			db=DATABASES['date_cutoff']['NAME'])

	total_word_count = total_words(conn)
	training_feature_set = process_bigrams(conn, '+', total_word_count, best_words)
	training_feature_set += process_bigrams(conn, '-', total_word_count, best_words)
	training_feature_set += process_bigrams(conn, 'I', total_word_count, best_words)
	training_feature_set += process_bigrams(conn, 'O', total_word_count, best_words)

	config_megam('/opt/packages')
	#classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
	classifier = NaiveBayesClassifier.train(training_feature_set)
	classifier.show_most_informative_features(10)

	test_tweets = classify.get_test_tweets(conn)
	test_feature_set = process_tweets(test_tweets)

	classifier_accuracy = accuracy(classifier, test_feature_set)

	print "classifier accuracy: " + repr(classifier_accuracy)
def nltk_maxent_pos_tagger(input_dict):
    name = 'MaxentPosTagger'
    if not input_dict['training_corpus']:
        maxent_tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
        name += '-pretrained'
    else:
        nltk.config_megam(settings.MEGAM_EXECUTABLE_PATH)

        maxent_tagger = MaxentPosTagger()
        chunk = input_dict['training_corpus']['chunk']
        corpus = input_dict['training_corpus']['corpus']
        training_corpus=corpus_reader(corpus, chunk)
        if training_corpus:
            maxent_tagger.train(training_corpus)
        else:
            raise AttributeError

    return {'pos_tagger': {
                'function':'tag_sents',
                'object': maxent_tagger,
                'name': name
            }
    }
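As a hedged usage sketch (not part of the original widget code): calling the function above without a training corpus takes the pretrained-Treebank branch, and the returned dictionary exposes the tagger object along with the advertised tag_sents entry point:

result = nltk_maxent_pos_tagger({'training_corpus': None})
tagger = result['pos_tagger']['object']
# tag_sents expects a list of token lists
print(tagger.tag_sents([['Megam', 'speeds', 'up', 'maxent', 'training']]))
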
def main_function():
	conn = MySQLdb.connect(host=DATABASES['default']['HOST'], 
			user=DATABASES['default']['USER'], 
			passwd=DATABASES['default']['PASSWORD'], 
			db=DATABASES['default']['NAME'])

	training_tweets = classify.get_training_tweets(conn)
	training_feature_set = classify.process_tweets(training_tweets)

	config_megam('/opt/packages')
	#classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
	classifier = NaiveBayesClassifier.train(training_feature_set)
	#classifier.show_most_informative_features(50, show='pos')
	#classifier.show_most_informative_features(50, show='neg')

	#classifier.explain(training_feature_set[0][0])
	#print training_feature_set[0]

	error_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	count_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	guess_dict = {'+':0, '-':0, 'I':0, 'O':0} 

	full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, 
				'-':{'+':0, '-':0, 'I':0, 'O':0}, 
				'I':{'+':0, '-':0, 'I':0, 'O':0}, 
				'O':{'+':0, '-':0, 'I':0, 'O':0}}

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  

	test_tweets = classify.get_test_tweets(conn)
	test_feature_set = classify.process_tweets(test_tweets)

	classifier_accuracy = accuracy(classifier, test_feature_set)

	#print count_table
	print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
	conn_analysis = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter_heart")

	training_tweets = classify.get_training_tweets(conn_analysis)
	training_feature_set = classify.process_tweets(training_tweets)

	tweets = classify.get_tweets_to_classify(conn_analysis);

	bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
	count_table = {'+':0, '-':0, 'I':0, 'O':0}  

	for tweet in tweets:
		text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
		guess = bayes_classifier.classify(classify.process_tweet(text))
		classify.update_tweet_polarity(tweet[0], guess, conn_analysis)
		count_table[guess] += 1

	print "Naive Bayes"
	print count_table

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	config_megam('/opt/packages')
	max_ent_classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	for tweet in tweets:
		text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
		guess = max_ent_classifier.classify(classify.process_tweet(text))
		update_max_ent_polarity(tweet[0], guess, conn_analysis)
		count_table[guess] += 1

	#For the tweets where polarity was determined manually, copy from 
	#majority_vote to auto_vote
	fix_manual_tweets(conn_analysis)

	print "Maximum Entropy"
	print count_table
Example #20
# an implementation of the probabilistic chunker from NLTK
import nltk
import cPickle
nltk.config_megam('/home/chris/programs/megam_0.92/megam')


def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))


def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
    if i == len(sentence) - 1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i + 1]
    return {
        "pos": pos,
        "word": word,
        "prevpos": prevpos,
        "nextpos": nextpos,
most_suggestive_words = [w.rstrip() for w in words]

f = open('../../data_files/yelp_sent_pos_text.txt')
lines = f.readlines()
f.close()
tips_pos = []
for line in lines:
	tips_pos.append( eval(line) )

print 'Building feature set...', datetime.datetime.now()
count = 0
featuresets = []
for tip, tag in tips_pos:
	features = doc_word_presence(tip)
	featuresets.append( (features, tag) )
	count += 1
	if count == int(len(tips_pos)):
		break

size = int(len(featuresets)/2)
train_set, test_set = featuresets[:size], featuresets[size:]
print 'Training classifier...', datetime.datetime.now()
#classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.config_megam('/Users/admin/Downloads/megam_0.92/megam')
classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam')
print 'Finished training classifier', datetime.datetime.now()
print nltk.classify.accuracy(classifier, test_set)

show_confusion_matrix(classifier, test_set)
write_probdist_for(classifier, '../../results/prob_dist/top_word_presence.txt')
# Natural Language Toolkit: code_classifier_chunker
import nltk

nltk.config_megam('/Users/nishantagarwal/Documents/Projects/NLP')
def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    if i == len(sentence)-1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i+1]
    return {"pos": pos,
            "word": word,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}
Example #23
#!/usr/bin/python
# -*- coding: utf-8 -*-

import MySQLdb as mdb
import sys
import re
from nltk import config_megam
from nltk import MaxentClassifier
from nltk import classify
from numpy import array_split
import os.path

config_megam('/var/www/test/tagging/tools/db/tags/classifier/MEGAM/megam-64.opt')

algorithm = 'MEGAM'



def getClassesDict():
    f = open('hindiclasses.sorted.txt')
    classesDict = dict()
    for line in f:
        fields = line.split()
        classesDict[fields[0]] = fields[1]
    f.close()
    return classesDict

classesDict = getClassesDict();


def extractWordFeature(wordDict, sentence, i):
most_suggestive_words = most_freq_pos_words.union(most_freq_neg_words).union(most_freq_unk_words)

f = open("../../data_files/yelp_sent_pos_text.txt")
lines = f.readlines()
f.close()
tips_pos = []
for line in lines:
    tips_pos.append(eval(line))

print "Building feature set...", datetime.datetime.now()
count = 0
featuresets = []
for tip, tag in tips_pos:
    features = doc_word_presence(tip)
    featuresets.append((features, tag))
    count += 1
    if count == int(len(tips_pos)):
        break

size = int(len(featuresets) / 2)
train_set, test_set = featuresets[:size], featuresets[size:]
print "Training classifier...", datetime.datetime.now()
# classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.config_megam("/Users/admin/Downloads/megam_0.92/megam")
classifier = nltk.MaxentClassifier.train(train_set, algorithm="megam")
print "Finished training classifier", datetime.datetime.now()
print nltk.classify.accuracy(classifier, test_set)

show_confusion_matrix(classifier, test_set)
write_probdist_for(classifier, "../../results/prob_dist/top_word_presence.txt")
Example #25
#!/usr/bin/python

import nltk
#nltk.config_megam('/home/aritter/local/bin/megam')
nltk.config_megam('/usr/local/bin/megam')

import sys
import math
import random

MAX_NGRAM=6

class StyleMetric:
    source_vector = {}
    target_vector = {}

    training = []

    def __init__(self, corpus_source, corpus_target):
        #Read in Source Corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1,MAX_NGRAM):
                for i in range(len(words)-gram+1):
                    ngram = " ".join(words[i:i+gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
Example #26
    Training: ClassifierTraining
    Prediction: Classifier
"""
import sys
sys.path.append('../')
import argparse
import cPickle as p
import operator
import os
import utils

from ERG import AMR

import nltk
from nltk.classify import MaxentClassifier, accuracy
nltk.config_megam("/usr/local/bin/megam.opt")
from scipy.stats import rankdata


class ClassifierTraining(object):
    def __init__(self, ftrain, fdev, ftest, wdir, delexicalized=True):
        self.train_amrs, self.dev_amrs, self.test_amrs = [], [], []
        self.delexicalized = delexicalized

        print 'PARSING...'
        for f in os.listdir(ftrain):
            self.train_amrs.extend(self.parse(os.path.join(ftrain, f)))

        for f in os.listdir(fdev):
            self.dev_amrs.extend(self.parse(os.path.join(fdev, f)))
Example #27
#!/usr/bin/python

import nltk
nltk.config_megam('/home/aritter/local/bin/megam')

import sys
import math
import random

MAX_NGRAM = 6


class StyleMetric:
    source_vector = {}
    target_vector = {}

    training = []

    def __init__(self, corpus_source, corpus_target):
        #Read in Source Corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1, MAX_NGRAM):
                for i in range(len(words) - gram + 1):
                    ngram = " ".join(words[i:i + gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
Example #28
import nltk
import os
from cPickle import load, dump

from nltk.corpus import conll2000

from settings import ROOT

chunker_path = ROOT + 'vendor/parsers/consecutive_np_chunker.pk1'
megam_path = ROOT + 'vendor/megam_i686.opt'
nltk.config_megam(ROOT + 'vendor/megam_i686.opt')

# Natural Language Toolkit: code_classifier_chunker
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {'pos': pos, 'word': word, 'prevpos': prevpos}


class ConsecutiveNPChunkTagger(nltk.TaggerI): # [_consec-chunk-tagger]

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history) # [_consec-use-fe]
Example #29
# Natural Language Toolkit: code_classifier_chunker
import nltk

nltk.config_megam('/Users/nishantagarwal/Documents/Projects/NLP')


def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))


def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
    if i == len(sentence) - 1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i + 1]
    return {
        "pos": pos,
        "word": word,
        "prevpos": prevpos,
        "nextpos": nextpos,
Example #30
 def test_rte_classification_with_megam(self):
     try:
         config_megam()
     except (LookupError, AttributeError) as e:
         pytest.skip("Skipping tests with dependencies on MEGAM")
     clf = rte_classifier("megam", sample_N=100)
import time
import re
from collections import defaultdict

from nltk import TaggerI, FreqDist, untag, config_megam
from nltk.classify.maxent import MaxentClassifier

PATH_TO_MEGAM_EXECUTABLE = "/usr/bin/megam"
config_megam(PATH_TO_MEGAM_EXECUTABLE)


class MaxentPosTagger(TaggerI):
    def train(self,
              train_sents,
              algorithm='megam',
              rare_word_cutoff=5,
              rare_feat_cutoff=5,
              uppercase='[A-Z]',
              trace=3,
              **cutoffs):

        self.uppercase = uppercase
        self.word_freqdist = self.word_freqs(train_sents)
        self.featuresets = self.featsets(train_sents, rare_word_cutoff)
        self.features_freqdist = self.gen_the_feat_freqs(self.featuresets)
        self.cut_rare_feats(self.featuresets, rare_feat_cutoff)

        t1 = time.time()
        self.classifier = MaxentClassifier.train(self.featuresets, algorithm,
                                                 trace, **cutoffs)
        t2 = time.time()
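A minimal, hedged training sketch for the MaxentPosTagger above; it assumes megam is configured as shown, that Penn Treebank tagged sentences are acceptable input, and that the helper methods and tag() referenced by the full class (cut off in this truncated snippet) are available. The slice size is arbitrary:

from nltk.corpus import treebank

pos_tagger = MaxentPosTagger()
pos_tagger.train(treebank.tagged_sents()[:500], trace=0)
print(pos_tagger.tag(['Megam', 'keeps', 'maxent', 'training', 'fast']))
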
import time
import collections
import pickle
from nltk.corpus import stopwords
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from svmutil import *
import itertools
import math
import sys
import nltk
from pymongo import MongoClient

def remove_stopwords(text):
    r = stopwords.words('english')
    r.append('rt')
    return [w for w in text if not w in r]

nltk.config_megam('.')

start_time = time.time()
connection = MongoClient('localhost', 27017)
db = connection.local
sad_col = db['neg_emoticons']
hap_col = db['pos_emoticons']

h, s = [], []
s = sad_col.find()
h = hap_col.find()

pos_tweets, neg_tweets = [], []

if len(sys.argv) > 2:
    count = int(sys.argv[2]) / 2
Example #33
def active_megam():
	if nltk.classify.megam._megam_bin is None:
		import os
		path = os.getcwd()
		nltk.config_megam(path+'/megam/megam-64.opt')
def main_function():
    conn = MySQLdb.connect(host=DATABASES['ensemble']['HOST'],
                           user=DATABASES['ensemble']['USER'],
                           passwd=DATABASES['ensemble']['PASSWORD'],
                           db=DATABASES['ensemble']['NAME'])

    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)

    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    test_tweets = classify.get_test_tweets(conn)

    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn)
        count_table[guess] += 1

    print "Naive Bayes"
    print count_table

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam",
                                                trace=0)

    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_tweet_polarity_ensemble(tweet[0], guess, conn)
        count_table[guess] += 1

    print "Maximum Entropy"
    print count_table

    #generate the accuracy matrix
    full_matrix = {
        '+': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        '-': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'I': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'O': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        }
    }

    for tweet in test_tweets:
        result = classify.run_sql(
            conn, classify.Statements.CHECK_CONSENSUS % tweet[0])
        guess = result[0][0]

        actual_result = classify.run_sql(
            conn, classify.Statements.CHECK_MAJORITY % tweet[0])
        actual = actual_result[0][0]

        if guess is not None:
            if actual is not None:
                full_matrix[actual][guess] += 1

    print full_matrix
Example #35
#!/usr/bin/python

import nltk
nltk.config_megam('/home/aritter/local/bin/megam')


import sys
import math
import random

MAX_NGRAM=6

class StyleMetric:
    source_vector = {}
    target_vector = {}

    training = []

    def __init__(self, corpus_source, corpus_target):
        #Read in Source Corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1,MAX_NGRAM):
                for i in range(len(words)-gram+1):
                    ngram = " ".join(words[i:i+gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
Example #36
        pd.set_option('display.max_rows', None)
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', None)
        pd.set_option('display.max_colwidth', None)
        df = pd.DataFrame(confusionMatrix)
        df.columns = TAGS
        df.index = TAGS
        df.style
        print(df)
        print("Accuracy :", 100 * corr / total)


if __name__ == "__main__":

    nltk.download("conll2000")
    nltk.config_megam('./megam-64.opt')
    corpus_train = nltk.corpus.conll2000.chunked_sents('train.txt')
    corpus_test = nltk.corpus.conll2000.chunked_sents("test.txt")

    def preprocess(sent):
        tgs = nltk.tree2conlltags(sent)
        tgs = [(w, pos, t[0]) for w, pos, t in tgs]
        return tgs

    TRAIN_DATA = [preprocess(sent) for sent in corpus_train]
    TEST_DATA = [preprocess(sent) for sent in corpus_test]

    cP = ChunkTagger(TRAIN_DATA)
    cP.evaluate(TEST_DATA)

    # TRAIN_DATA_FILE = open("../assignment2dataset/train.txt","r")
Example #37
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)


class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w, t), c) for (w, t, c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.util.conlltags2tree(conlltags)


if __name__ == '__main__':
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    #unigram_chunker = BigramChunker(train_sents)
    # unigram_chunker = UnigramChunker(train_sents)
    #print unigram_chunker.evaluate(test_sents)
    nltk.config_megam(bin='/home/alpha/work/env/megam_0.92/./megam')
    chunker = ConsecutiveNPChunker(train_sents)
    print chunker.evaluate(test_sents)
Example #38
import nltk
import collections
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
from nltk import config_megam
import re
import numpy as np

#nltk.download('punkt')
PATH_TO_MEGAM_EXECUTABLE = "./MEGAM/megam-64"
config_megam(PATH_TO_MEGAM_EXECUTABLE)

def get_pos_tagger():
    train_sents = treebank.tagged_sents()
    tagger = nltk.TrigramTagger(train_sents, backoff=
        nltk.BigramTagger(train_sents, backoff=
        nltk.UnigramTagger(train_sents, backoff=
        nltk.DefaultTagger("NN"))))
    return tagger

def get_ner_items():
    items = []
    with open("3c.txt", 'r') as f: 
        for line in f: 
            items.append(word_tokenize(line[:-1]))
    return items

def get_ner_words(ner_items):
    ner_words = set()
    for ner_item in ner_items:
        for ner_word in ner_item:
    such as persons, locations and organizations in a given document.

    For the purposes of training, an external maximum entropy model (megam)
    is used.

After the chunker has been created, it is pickled for subsequent use.
"""

import nltk
import logging
import pickle

MEGAM_FOLDER = 'topics/megam_0.92/megam'

try:
    nltk.config_megam(MEGAM_FOLDER)
except LookupError:
    nltk.config_megam('megam_0.92/megam')


# define logging configuration
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s')


class BigramChunker(nltk.ChunkParserI):
    """This class defines a bigram chunker"""
    def __init__(self, train_sents):
        """Construct a new BigramChunker instance.
            :param train_sents: Array of sentences with named entities tagged
def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))


test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])

#train and saving the classifier
nltk.config_megam('megam.exe')
chunker = ConsecutiveNPChunker(train_sents)

from pickle import dump
output = open("chunkModelSVMFeat1.pkl", "wb")
dump(chunker, output, -1)
output.close()
results = chunker.evaluate(test_sents)

print()
#print(results.incorrect())
print("-----------------------------")
#print(results.missed())
print("----------------------------")
#print(chunker.tagger.classifier.show_most_informative_features(n=20,show="all"))
#print(chunker.classifier.explain(featureset,columns=4))
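To complement the dump above, a hedged sketch of loading the pickled chunker back in a later session and parsing a single POS-tagged sentence (the sentence itself is made up):

from pickle import load

with open("chunkModelSVMFeat1.pkl", "rb") as f:
    chunker = load(f)
print(chunker.parse([('the', 'DT'), ('little', 'JJ'), ('dog', 'NN'), ('barked', 'VBD')]))
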
        for sentence_tree in tree:
            eval_corpus.append(split_tree_tokens(sentence_tree))
    # Evaluate model
    print 'Evaluating...'
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_corpus):
        guessed = chunker.parse(correct.leaves())
        guessed = chunker._parse_to_tagged(guessed)
        chunkscore.score(correct, guessed)
        if i < 3:
            cmp_chunks(correct, guessed)
    print chunkscore
    return chunker


_EVENT_TRAIN_DATA_PATH = resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/events/train')
                                              
_EVENT_EVAL_DATA_PATH = resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/events/eval')

config_megam(resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/megam_i686.opt'))


if __name__ == "__main__":
    chunker = build_event_chunking_model()
    nltk.download()

in the Python interpreter. Proper usage of demo() and all other functions and
methods is described below.
"""

import time
import re
import pickle

from collections import defaultdict
from nltk import TaggerI, FreqDist, untag, config_megam
from nltk.classify.maxent import MaxentClassifier
from nltk.corpus.reader.conll import ConllCorpusReader

config_megam('/home/dsbatista/megam_i686.opt')


class MaxentPosTagger(TaggerI):
    """
    MaxentPosTagger is a part-of-speech tagger based on Maximum Entropy models.
    """
    def train(self, train_sents, algorithm='megam', rare_word_cutoff=5,
              rare_feat_cutoff=5, uppercase_letters='[A-Z]', trace=3,
              **cutoffs):
        """
        MaxentPosTagger trains a Maximum Entropy model from a C{list} of tagged
        sentences.

        @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
        @param train_sents: A list of tagged sentences. Each sentence is
Example #43
    such as persons, locations and organizations in a given document.

    For the purposes of training, an external maximum entropy model (megam)
    is used.

After the chunker has been created, it is pickled for subsequent use.
"""

import nltk
import logging
import pickle

MEGAM_FOLDER = 'topics/megam_0.92/megam'

try:
    nltk.config_megam(MEGAM_FOLDER)
except LookupError:
    nltk.config_megam('megam_0.92/megam')

# define logging configuration
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s')


class BigramChunker(nltk.ChunkParserI):
    """This class defines a bigram chunker"""
    def __init__(self, train_sents):
        """Construct a new BigramChunker instance.
            :param train_sents: Array of sentences with named entities tagged

        """