Code Example #1
 def __init__(self, train_sents=None, pickle_name=None, save=False):
     '''Trains new tagger model or loads existing one.'''
     self._pickle_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), 'pickles', pickle_name)
     if train_sents is None:
         # Load existing model
         with open(self._pickle_path, 'rb') as f:
             self.classifier = pickle.load(f)
     else:
         train_set = []
         for tagged_sent in train_sents:
             untagged_sent = nltk.tag.untag(tagged_sent)
             history = []
             for i, (word, t) in enumerate(tagged_sent):
                 featureset = self.npchunk_features(untagged_sent, i,
                                                    history)
                 train_set.append((featureset, t))
                 history.append(t)
         nltk.config_megam(r'c:\progs\megam\megam.exe')
         self.classifier = nltk.MaxentClassifier.train(train_set,
                                                       algorithm='megam',
                                                       trace=0)
         if save:
             # save newly trained model with name specified
             with open(self._pickle_path, 'wb') as f:
                 pickle.dump(self.classifier, f)
Code Example #2
def main_function():
    conn = MySQLdb.connect(host=DATABASES['default']['HOST'],
                           user=DATABASES['default']['USER'],
                           passwd=DATABASES['default']['PASSWORD'],
                           db=DATABASES['default']['NAME'])

    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)

    config_megam('/opt/packages')
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam",
                                        trace=0)

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    tweets = classify.get_tweets_to_classify(conn_analysis)

    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = classifier.classify(classify.process_tweet(text))
        update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    #For the tweets where polarity was determined manually, copy from
    #majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)

    print count_table
Code Example #3
def main_function():
	conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'], 
			user=DATABASES['date_cutoff']['USER'], 
			passwd=DATABASES['date_cutoff']['PASSWORD'], 
			db=DATABASES['date_cutoff']['NAME'])

	training_tweets = classify.get_training_tweets(conn_analysis)
	training_feature_set = process_tweets(training_tweets)

	config_megam('/opt/packages')
	classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	error_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	count_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	guess_dict = {'+':0, '-':0, 'I':0, 'O':0} 

	full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, 
				'-':{'+':0, '-':0, 'I':0, 'O':0}, 
				'I':{'+':0, '-':0, 'I':0, 'O':0}, 
				'O':{'+':0, '-':0, 'I':0, 'O':0}}


	test_tweets = classify.get_test_tweets(conn_analysis)
	test_feature_set = process_tweets(test_tweets)

	classifier.show_most_informative_features(10)
	classifier_accuracy = accuracy(classifier, test_feature_set)
	print "classifier accuracy: " + repr(classifier_accuracy)
Code Example #4
def main_function():
    conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                           user=DATABASES['date_cutoff']['USER'],
                           passwd=DATABASES['date_cutoff']['PASSWORD'],
                           db=DATABASES['date_cutoff']['NAME'])

    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, '+', total_word_count,
                                           best_words)
    training_feature_set += process_bigrams(conn, '-', total_word_count,
                                            best_words)
    training_feature_set += process_bigrams(conn, 'I', total_word_count,
                                            best_words)
    training_feature_set += process_bigrams(conn, 'O', total_word_count,
                                            best_words)

    config_megam('/opt/packages')
    #classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    classifier = NaiveBayesClassifier.train(training_feature_set)
    classifier.show_most_informative_features(10)

    test_tweets = classify.get_test_tweets(conn)
    test_feature_set = process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    print "classifier accuracy: " + repr(classifier_accuracy)
Code Example #5
def main_function():
	conn = MySQLdb.connect(host=DATABASES['default']['HOST'], 
			user=DATABASES['default']['USER'], 
			passwd=DATABASES['default']['PASSWORD'], 
			db=DATABASES['default']['NAME'])

	training_tweets = classify.get_training_tweets(conn_analysis)
	training_feature_set = classify.process_tweets(training_tweets)

	config_megam('/opt/packages')
	classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	tweets = classify.get_tweets_to_classify(conn_analysis);

	for tweet in tweets:
		text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
		guess = classifier.classify(classify.process_tweet(text))
		update_tweet_polarity(tweet[0], guess, conn_analysis)
		count_table[guess] += 1

	#For the tweets where polarity was determined manually, copy from 
	#majority_vote to auto_vote
	fix_manual_tweets(conn_analysis)

	print count_table
Code Example #6
def main_function():
    conn = MySQLdb.connect(host=DATABASES['default']['HOST'],
                           user=DATABASES['default']['USER'],
                           passwd=DATABASES['default']['PASSWORD'],
                           db=DATABASES['default']['NAME'])

    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)

    config_megam('/opt/packages')
    #classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    classifier = NaiveBayesClassifier.train(training_feature_set)
    #classifier.show_most_informative_features(50, show='pos')
    #classifier.show_most_informative_features(50, show='neg')

    #classifier.explain(training_feature_set[0][0])
    #print training_feature_set[0]

    error_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    count_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    guess_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    full_matrix = {
        '+': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        '-': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'I': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'O': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        }
    }

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    test_tweets = classify.get_test_tweets(conn)
    test_feature_set = classify.process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    #print count_table
    print "classifier accuracy: " + repr(classifier_accuracy)
Code Example #7
def main_function():
	conn = MySQLdb.connect(host=DATABASES['default']['HOST'], 
			user=DATABASES['default']['USER'], 
			passwd=DATABASES['default']['PASSWORD'], 
			db=DATABASES['default']['NAME'])

	training_tweets = classify.get_training_tweets(conn)
	training_feature_set = classify.process_tweets(training_tweets)

	bayes_classifier = NaiveBayesClassifier.train(training_feature_set)

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  

	test_tweets = classify.get_test_tweets(conn)

	for tweet in test_tweets:
		text = classify.get_tweet_text(conn, tweet[0])[0][0]
		guess = bayes_classifier.classify(classify.process_tweet(text))
		classify.update_tweet_polarity(tweet[0], guess, conn)
		count_table[guess] += 1

	print "Naive Bayes"
	print count_table

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	config_megam('/opt/packages')
	max_ent_classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	for tweet in test_tweets:
		text = classify.get_tweet_text(conn, tweet[0])[0][0]
		guess = max_ent_classifier.classify(classify.process_tweet(text))
		update_tweet_polarity_ensemble(tweet[0], guess, conn)
		count_table[guess] += 1

	print "Maximum Entropy"
	print count_table

	#generate the accuracy matrix
	full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, 
				'-':{'+':0, '-':0, 'I':0, 'O':0}, 
				'I':{'+':0, '-':0, 'I':0, 'O':0}, 
				'O':{'+':0, '-':0, 'I':0, 'O':0}}

	for tweet in test_tweets:
		result = classify.run_sql(conn, classify.Statements.CHECK_CONSENSUS % tweet[0])
		guess = result[0][0]

		actual_result = classify.run_sql(conn, classify.Statements.CHECK_MAJORITY % tweet[0])
		actual = actual_result[0][0]

		if guess is not None:
			if actual is not None:
				full_matrix[actual][guess] += 1

	print full_matrix
Code Example #8
def demo():            
    import nltk
    try:
        nltk.config_megam('/usr/local/bin/megam')
        trainer = lambda x: nltk.MaxentClassifier.train(x, 'megam')
    except ValueError:
        try:
            trainer = lambda x: nltk.MaxentClassifier.train(x, 'BFGS')
        except ValueError:
            trainer = nltk.MaxentClassifier.train
    nltk.classify.rte_classifier(trainer)
Code Example #9
File: rte_classify.py Project: chethankumarka/SuaaS
def demo():
    import nltk
    try:
        nltk.config_megam('/usr/local/bin/megam')
        trainer = lambda x: nltk.MaxentClassifier.train(x, 'megam')
    except ValueError:
        try:
            trainer = lambda x: nltk.MaxentClassifier.train(x, 'BFGS')
        except ValueError:
            trainer = nltk.MaxentClassifier.train
    nltk.classify.rte_classifier(trainer)
Code Example #10
File: NERDb.py Project: gabsl/IMDBot
 def __init__(self, tagger, chunked_sents):
     tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)] for sent in chunked_sents]
     train_set = []
     for tagged_sent in tagged_sents:
         untagged_sent = nltk.tag.untag(tagged_sent)
         history = []
         for i, (word, tag) in enumerate(tagged_sent):
             featureset = npchunk_features(untagged_sent, i, history)
             train_set.append((featureset, tag))
             history.append(tag)
     labels = set(label for (tok, label) in train_set)
     nltk.config_megam("./megam_i686.opt")
     self.classifier = nltk.MaxentClassifier.train(train_set, algorithm="megam")
     self.tagger = tagger
Code Example #11
File: chunk_tagger.py Project: Mo-Talha/Nomad
    def __init__(self, train_sentences):
        nltk.config_megam('{}/../algorithms/megam-64.opt'.format(os.path.dirname(os.path.abspath(__file__))))

        train_set = []

        # train_sentences as: [[(('Confidence', 'NN'), 'B-NP'), (('in', 'IN'), 'O')..], ..]
        for tagged_sent in train_sentences:

            # untagged_sent: [(u'Experience', u'NN')]
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []

            for i, (word, tag) in enumerate(tagged_sent):
                feature_set = self.chunk_features(untagged_sent, i, history)
                train_set.append((feature_set, tag))
                history.append(tag)

        self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam', trace=0)
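The comment at the top of Code Example #11 describes the training format: each sentence is a list of ((word, POS), IOB-chunk-tag) pairs. A minimal sketch of building that structure from NLTK's CoNLL-2000 corpus, assuming the conll2000 corpus has been downloaded and using ChunkTagger as a hypothetical name for the class whose __init__ is shown above:

import nltk
from nltk.corpus import conll2000

# Convert each chunk tree into [((word, pos), iob_tag), ...], the shape the
# constructor above iterates over.
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
train_sentences = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                   for sent in train_sents]
# tagger = ChunkTagger(train_sentences)  # hypothetical class name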
Code Example #12
def main_function():
    conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                           user=DATABASES['date_cutoff']['USER'],
                           passwd=DATABASES['date_cutoff']['PASSWORD'],
                           db=DATABASES['date_cutoff']['NAME'])

    training_tweets = get_test_tweets(conn)
    #training_feature_set = process_tweets(training_tweets)

    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, '+', total_word_count,
                                           best_words)
    training_feature_set += process_bigrams(conn, '-', total_word_count,
                                            best_words)
    training_feature_set += process_bigrams(conn, 'I', total_word_count,
                                            best_words)
    training_feature_set += process_bigrams(conn, 'O', total_word_count,
                                            best_words)

    print "configuring megam"
    config_megam('/opt/packages')
    print "starting training"
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam",
                                        trace=0)
    print "starting end training"
    classifier.show_most_informative_features(40)

    test_tweets = get_training_tweets(conn)
    test_feature_set = process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    #full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0},
    #			'-':{'+':0, '-':0, 'I':0, 'O':0},
    #			'I':{'+':0, '-':0, 'I':0, 'O':0},
    #			'O':{'+':0, '-':0, 'I':0, 'O':0}}

    #for f in test_tweets:
    #	guess = classifier.classify(process_tweet(f[1]))
    #	full_matrix[f[2]][guess] += 1

    #print full_matrix
    print "classifier accuracy: " + repr(classifier_accuracy)
Code Example #13
def main_function():
    conn_analysis = MySQLdb.connect(host="localhost",
                                    user="******",
                                    passwd="tanzania",
                                    db="twitter_heart")

    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)

    tweets = classify.get_tweets_to_classify(conn_analysis)

    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    print "Naive Bayes"
    print count_table

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam",
                                                trace=0)

    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_max_ent_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1

    #For the tweets where polarity was determined manually, copy from
    #majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)

    print "Maximum Entropy"
    print count_table
Code Example #14
File: chunk_tagger.py Project: jizhihang/Dropple
    def __init__(self, train_sentences):
        nltk.config_megam('{}/../algorithms/megam-64.opt'.format(
            os.path.dirname(os.path.abspath(__file__))))

        train_set = []

        # train_sentences as: [[(('Confidence', 'NN'), 'B-NP'), (('in', 'IN'), 'O')..], ..]
        for tagged_sent in train_sentences:

            # untagged_sent: [(u'Experience', u'NN')]
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []

            for i, (word, tag) in enumerate(tagged_sent):
                feature_set = self.chunk_features(untagged_sent, i, history)
                train_set.append((feature_set, tag))
                history.append(tag)

        self.classifier = nltk.MaxentClassifier.train(train_set,
                                                      algorithm='megam',
                                                      trace=0)
Code Example #15
def main_function():
    conn = MySQLdb.connect(
        host=DATABASES["date_cutoff"]["HOST"],
        user=DATABASES["date_cutoff"]["USER"],
        passwd=DATABASES["date_cutoff"]["PASSWORD"],
        db=DATABASES["date_cutoff"]["NAME"],
    )

    training_tweets = get_test_tweets(conn)
    # training_feature_set = process_tweets(training_tweets)

    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, "+", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "-", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "I", total_word_count, best_words)
    training_feature_set += process_bigrams(conn, "O", total_word_count, best_words)

    print "configuring megam"
    config_megam("/opt/packages")
    print "starting training"
    classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    print "starting end training"
    classifier.show_most_informative_features(40)

    test_tweets = get_training_tweets(conn)
    test_feature_set = process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    # full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0},
    # 			'-':{'+':0, '-':0, 'I':0, 'O':0},
    # 			'I':{'+':0, '-':0, 'I':0, 'O':0},
    # 			'O':{'+':0, '-':0, 'I':0, 'O':0}}

    # for f in test_tweets:
    # 	guess = classifier.classify(process_tweet(f[1]))
    # 	full_matrix[f[2]][guess] += 1

    # print full_matrix
    print "classifier accuracy: " + repr(classifier_accuracy)
Code Example #16
def main_function():
	conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'], 
			user=DATABASES['date_cutoff']['USER'], 
			passwd=DATABASES['date_cutoff']['PASSWORD'], 
			db=DATABASES['date_cutoff']['NAME'])

	total_word_count = total_words(conn)
	training_feature_set = process_bigrams(conn, '+', total_word_count, best_words)
	training_feature_set += process_bigrams(conn, '-', total_word_count, best_words)
	training_feature_set += process_bigrams(conn, 'I', total_word_count, best_words)
	training_feature_set += process_bigrams(conn, 'O', total_word_count, best_words)

	config_megam('/opt/packages')
	#classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
	classifier = NaiveBayesClassifier.train(training_feature_set)
	classifier.show_most_informative_features(10)

	test_tweets = classify.get_test_tweets(conn)
	test_feature_set = process_tweets(test_tweets)

	classifier_accuracy = accuracy(classifier, test_feature_set)

	print "classifier accuracy: " + repr(classifier_accuracy)
Code Example #17
def nltk_maxent_pos_tagger(input_dict):
    name = 'MaxentPosTagger'
    if not input_dict['training_corpus']:
        maxent_tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
        name += '-pretrained'
    else:
        nltk.config_megam(settings.MEGAM_EXECUTABLE_PATH)

        maxent_tagger = MaxentPosTagger()
        chunk = input_dict['training_corpus']['chunk']
        corpus = input_dict['training_corpus']['corpus']
        training_corpus=corpus_reader(corpus, chunk)
        if training_corpus:
            maxent_tagger.train(training_corpus)
        else:
            raise AttributeError

    return {'pos_tagger': {
                'function':'tag_sents',
                'object': maxent_tagger,
                'name': name
            }
    }
Code Example #18
def main_function():
	conn = MySQLdb.connect(host=DATABASES['default']['HOST'], 
			user=DATABASES['default']['USER'], 
			passwd=DATABASES['default']['PASSWORD'], 
			db=DATABASES['default']['NAME'])

	training_tweets = classify.get_training_tweets(conn)
	training_feature_set = classify.process_tweets(training_tweets)

	config_megam('/opt/packages')
	#classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
	classifier = NaiveBayesClassifier.train(training_feature_set)
	#classifier.show_most_informative_features(50, show='pos')
	#classifier.show_most_informative_features(50, show='neg')

	#classifier.explain(training_feature_set[0][0])
	#print training_feature_set[0]

	error_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	count_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	guess_dict = {'+':0, '-':0, 'I':0, 'O':0} 

	full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, 
				'-':{'+':0, '-':0, 'I':0, 'O':0}, 
				'I':{'+':0, '-':0, 'I':0, 'O':0}, 
				'O':{'+':0, '-':0, 'I':0, 'O':0}}

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  

	test_tweets = classify.get_test_tweets(conn)
	test_feature_set = classify.process_tweets(test_tweets)

	classifier_accuracy = accuracy(classifier, test_feature_set)

	#print count_table
	print "classifier accuracy: " + repr(classifier_accuracy)
Code Example #19
def main_function():
	conn_analysis = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter_heart")

	training_tweets = classify.get_training_tweets(conn_analysis)
	training_feature_set = classify.process_tweets(training_tweets)

	tweets = classify.get_tweets_to_classify(conn_analysis);

	bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
	count_table = {'+':0, '-':0, 'I':0, 'O':0}  

	for tweet in tweets:
		text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
		guess = bayes_classifier.classify(classify.process_tweet(text))
		classify.update_tweet_polarity(tweet[0], guess, conn_analysis)
		count_table[guess] += 1

	print "Naive Bayes"
	print count_table

	count_table = {'+':0, '-':0, 'I':0, 'O':0}  
	config_megam('/opt/packages')
	max_ent_classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	for tweet in tweets:
		text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
		guess = max_ent_classifier.classify(classify.process_tweet(text))
		update_max_ent_polarity(tweet[0], guess, conn_analysis)
		count_table[guess] += 1

	#For the tweets where polarity was determined manually, copy from 
	#majority_vote to auto_vote
	fix_manual_tweets(conn_analysis)

	print "Maximum Entropy"
	print count_table
Code Example #20
# an implementation of the probabilistic chunker from NLTK
import nltk
import cPickle
nltk.config_megam('/home/chris/programs/megam_0.92/megam')


def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))


def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
    if i == len(sentence) - 1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i + 1]
    return {
        "pos": pos,
        "word": word,
        "prevpos": prevpos,
        "nextpos": nextpos,
        "prevpos+pos": "%s+%s" % (prevpos, pos),
        "pos+nextpos": "%s+%s" % (pos, nextpos),
        "tags-since-dt": tags_since_dt(sentence, i),
    }
Code Example #21
most_suggestive_words = [w.rstrip() for w in words]

f = open('../../data_files/yelp_sent_pos_text.txt')
lines = f.readlines()
f.close()
tips_pos = []
for line in lines:
	tips_pos.append( eval(line) )

print 'Building feature set...', datetime.datetime.now()
count = 0
featuresets = []
for tip, tag in tips_pos:
	features = doc_word_presence(tip)
	featuresets.append( (features, tag) )
	count += 1
	if count == int(len(tips_pos)):
		break

size = int(len(featuresets)/2)
train_set, test_set = featuresets[:size], featuresets[size:]
print 'Training classifier...', datetime.datetime.now()
#classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.config_megam('/Users/admin/Downloads/megam_0.92/megam')
classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam')
print 'Finished training classifier', datetime.datetime.now()
print nltk.classify.accuracy(classifier, test_set)

show_confusion_matrix(classifier, test_set)
write_probdist_for(classifier, '../../results/prob_dist/top_word_presence.txt')
Code Example #22
# Natural Language Toolkit: code_classifier_chunker
import nltk

nltk.config_megam('/Users/nishantagarwal/Documents/Projects/NLP')
def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    if i == len(sentence)-1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i+1]
    return {"pos": pos,
            "word": word,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}
Code Example #23
#!/usr/bin/python
# -*- coding: utf-8 -*-

import MySQLdb as mdb
import sys
import re
from nltk import config_megam
from nltk import MaxentClassifier
from nltk import classify
from numpy import array_split
import os.path

config_megam('/var/www/test/tagging/tools/db/tags/classifier/MEGAM/megam-64.opt')

algorithm = 'MEGAM'



def getClassesDict():
    f = open('hindiclasses.sorted.txt')
    classesDict = dict()
    for line in f:
        fields = line.split()
        classesDict[fields[0]] = fields[1]
    f.close()
    return classesDict

classesDict = getClassesDict();


def extractWordFeature(wordDict, sentence, i):
Code Example #24
most_suggestive_words = most_freq_pos_words.union(most_freq_neg_words).union(most_freq_unk_words)

f = open("../../data_files/yelp_sent_pos_text.txt")
lines = f.readlines()
f.close()
tips_pos = []
for line in lines:
    tips_pos.append(eval(line))

print "Building feature set...", datetime.datetime.now()
count = 0
featuresets = []
for tip, tag in tips_pos:
    features = doc_word_presence(tip)
    featuresets.append((features, tag))
    count += 1
    if count == int(len(tips_pos)):
        break

size = int(len(featuresets) / 2)
train_set, test_set = featuresets[:size], featuresets[size:]
print "Training classifier...", datetime.datetime.now()
# classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.config_megam("/Users/admin/Downloads/megam_0.92/megam")
classifier = nltk.MaxentClassifier.train(train_set, algorithm="megam")
print "Finished training classifier", datetime.datetime.now()
print nltk.classify.accuracy(classifier, test_set)

show_confusion_matrix(classifier, test_set)
write_probdist_for(classifier, "../../results/prob_dist/top_word_presence.txt")
Code Example #25
#!/usr/bin/python

import nltk
#nltk.config_megam('/home/aritter/local/bin/megam')
nltk.config_megam('/usr/local/bin/megam')

import sys
import math
import random

MAX_NGRAM=6

class StyleMetric:
    source_vector = {}
    target_vector = {}

    training = []

    def __init__(self, corpus_source, corpus_target):
        #Read in Source Corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1,MAX_NGRAM):
                for i in range(len(words)-gram+1):
                    ngram = " ".join(words[i:i+gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
Code Example #26
File: classifier.py Project: ThiagoCF05/LinearAMR
    Training: ClassifierTraining
    Prediction: Classifier
"""
import sys
sys.path.append('../')
import argparse
import cPickle as p
import operator
import os
import utils

from ERG import AMR

import nltk
from nltk.classify import MaxentClassifier, accuracy
nltk.config_megam("/usr/local/bin/megam.opt")
from scipy.stats import rankdata


class ClassifierTraining(object):
    def __init__(self, ftrain, fdev, ftest, wdir, delexicalized=True):
        self.train_amrs, self.dev_amrs, self.test_amrs = [], [], []
        self.delexicalized = delexicalized

        print 'PARSING...'
        for f in os.listdir(ftrain):
            self.train_amrs.extend(self.parse(os.path.join(ftrain, f)))

        for f in os.listdir(fdev):
            self.dev_amrs.extend(self.parse(os.path.join(fdev, f)))
Code Example #27
#!/usr/bin/python

import nltk
nltk.config_megam('/home/aritter/local/bin/megam')

import sys
import math
import random

MAX_NGRAM = 6


class StyleMetric:
    source_vector = {}
    target_vector = {}

    training = []

    def __init__(self, corpus_source, corpus_target):
        #Read in Source Corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1, MAX_NGRAM):
                for i in range(len(words) - gram + 1):
                    ngram = " ".join(words[i:i + gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
Code Example #28
import nltk
import os
from cPickle import load, dump

from nltk.corpus import conll2000

from settings import ROOT

chunker_path = ROOT + 'vendor/parsers/consecutive_np_chunker.pk1'
megam_path = ROOT + 'vendor/megam_i686.opt'
nltk.config_megam(ROOT + 'vendor/megam_i686.opt')

# Natural Language Toolkit: code_classifier_chunker
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {'pos': pos, 'word': word, 'prevpos': prevpos}


class ConsecutiveNPChunkTagger(nltk.TaggerI): # [_consec-chunk-tagger]

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history) # [_consec-use-fe]
Code Example #29
# Natural Language Toolkit: code_classifier_chunker
import nltk

nltk.config_megam('/Users/nishantagarwal/Documents/Projects/NLP')


def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))


def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
    if i == len(sentence) - 1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i + 1]
    return {
        "pos": pos,
        "word": word,
        "prevpos": prevpos,
        "nextpos": nextpos,
        "prevpos+pos": "%s+%s" % (prevpos, pos),
        "pos+nextpos": "%s+%s" % (pos, nextpos),
        "tags-since-dt": tags_since_dt(sentence, i),
    }
Code Example #30
 def test_rte_classification_with_megam(self):
     try:
         config_megam()
     except (LookupError, AttributeError) as e:
         pytest.skip("Skipping tests with dependencies on MEGAM")
     clf = rte_classifier("megam", sample_N=100)
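Most of the examples above hard-code a machine-specific megam path, and a few (Code Examples #8, #30, and #39) guard the call, since nltk.config_megam raises LookupError when the executable cannot be located. A minimal defensive sketch of that recurring pattern, with a hypothetical helper name and an assumed default path, falling back to NLTK's built-in IIS trainer:

import nltk

def make_trainer(megam_path='/usr/local/bin/megam'):  # default path is an assumption
    try:
        nltk.config_megam(megam_path)
        return lambda feats: nltk.MaxentClassifier.train(feats, algorithm='megam', trace=0)
    except LookupError:
        # megam binary not found; use the slower pure-Python IIS algorithm instead.
        return lambda feats: nltk.MaxentClassifier.train(feats, algorithm='iis', trace=0)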
Code Example #31
import time
import re
from collections import defaultdict

from nltk import TaggerI, FreqDist, untag, config_megam
from nltk.classify.maxent import MaxentClassifier

PATH_TO_MEGAM_EXECUTABLE = "/usr/bin/megam"
config_megam(PATH_TO_MEGAM_EXECUTABLE)


class MaxentPosTagger(TaggerI):
    def train(self,
              train_sents,
              algorithm='megam',
              rare_word_cutoff=5,
              rare_feat_cutoff=5,
              uppercase='[A-Z]',
              trace=3,
              **cutoffs):

        self.uppercase = uppercase
        self.word_freqdist = self.word_freqs(train_sents)
        self.featuresets = self.featsets(train_sents, rare_word_cutoff)
        self.features_freqdist = self.gen_the_feat_freqs(self.featuresets)
        self.cut_rare_feats(self.featuresets, rare_feat_cutoff)

        t1 = time.time()
        self.classifier = MaxentClassifier.train(self.featuresets, algorithm,
                                                 trace, **cutoffs)
        t2 = time.time()
Code Example #32
import time
import collections
import pickle
from nltk.corpus import stopwords
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from svmutil import *
import itertools
import math
import sys
import nltk
from pymongo import MongoClient  # MongoDB client used for the connection below

def remove_stopwords(text):
    r = stopwords.words('english')
    r.append('rt')
    return [w for w in text if not w in r]

nltk.config_megam('.')

start_time = time.time()
connection = MongoClient('localhost', 27017)
db = connection.local
sad_col = db['neg_emoticons']
hap_col = db['pos_emoticons']

h, s = [], []
s = sad_col.find()
h = hap_col.find()

pos_tweets, neg_tweets = [], []

if len(sys.argv) > 2:
    count = int(sys.argv[2]) / 2
Code Example #33
File: max.py Project: lkfo415579/MAX
def active_megam():
	if nltk.megam._megam_bin is None:
		import os
		path = os.getcwd()
		nltk.config_megam(path+'/megam/megam-64.opt')
Code Example #34
def main_function():
    conn = MySQLdb.connect(host=DATABASES['ensemble']['HOST'],
                           user=DATABASES['ensemble']['USER'],
                           passwd=DATABASES['ensemble']['PASSWORD'],
                           db=DATABASES['ensemble']['NAME'])

    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)

    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}

    test_tweets = classify.get_test_tweets(conn)

    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn)
        count_table[guess] += 1

    print "Naive Bayes"
    print count_table

    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam",
                                                trace=0)

    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_tweet_polarity_ensemble(tweet[0], guess, conn)
        count_table[guess] += 1

    print "Maximum Entropy"
    print count_table

    #generate the accuracy matrix
    full_matrix = {
        '+': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        '-': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'I': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        },
        'O': {
            '+': 0,
            '-': 0,
            'I': 0,
            'O': 0
        }
    }

    for tweet in test_tweets:
        result = classify.run_sql(
            conn, classify.Statements.CHECK_CONSENSUS % tweet[0])
        guess = result[0][0]

        actual_result = classify.run_sql(
            conn, classify.Statements.CHECK_MAJORITY % tweet[0])
        actual = actual_result[0][0]

        if guess is not None:
            if actual is not None:
                full_matrix[actual][guess] += 1

    print full_matrix
Code Example #35
File: style_metric.py Project: DL-nisl/Shakespeare
#!/usr/bin/python

import nltk
nltk.config_megam('/home/aritter/local/bin/megam')


import sys
import math
import random

MAX_NGRAM=6

class StyleMetric:
    source_vector = {}
    target_vector = {}

    training = []

    def __init__(self, corpus_source, corpus_target):
        #Read in Source Corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1,MAX_NGRAM):
                for i in range(len(words)-gram+1):
                    ngram = " ".join(words[i:i+gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
Code Example #36
File: MEMM.py Project: bikky9/Machine-Learning
        pd.set_option('display.max_rows', None)
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', None)
        pd.set_option('display.max_colwidth', None)
        df = pd.DataFrame(confusionMatrix)
        df.columns = TAGS
        df.index = TAGS
        df.style
        print(df)
        print("Accuracy :", 100 * corr / total)


if __name__ == "__main__":

    nltk.download("conll2000")
    nltk.config_megam('./megam-64.opt')
    corpus_train = nltk.corpus.conll2000.chunked_sents('train.txt')
    corpus_test = nltk.corpus.conll2000.chunked_sents("test.txt")

    def preprocess(sent):
        tgs = nltk.tree2conlltags(sent)
        tgs = [(w, pos, t[0]) for w, pos, t in tgs]
        return tgs

    TRAIN_DATA = [preprocess(sent) for sent in corpus_train]
    TEST_DATA = [preprocess(sent) for sent in corpus_test]

    cP = ChunkTagger(TRAIN_DATA)
    cP.evaluate(TEST_DATA)

    # TRAIN_DATA_FILE = open("../assignment2dataset/train.txt","r")
Code Example #37
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)


class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w, t), c) for (w, t, c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.util.conlltags2tree(conlltags)


if __name__ == '__main__':
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    #unigram_chunker = BigramChunker(train_sents)
    # unigram_chunker = UnigramChunker(train_sents)
    #print unigram_chunker.evaluate(test_sents)
    nltk.config_megam(bin='/home/alpha/work/env/megam_0.92/./megam')
    chunker = ConsecutiveNPChunker(train_sents)
    print chunker.evaluate(test_sents)
Code Example #38
File: ner.py Project: kaiyaunchen/nlp
import nltk
import collections
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
from nltk import config_megam
import re
import numpy as np

#nltk.download('punkt')
PATH_TO_MEGAM_EXECUTABLE = "./MEGAM/megam-64"
config_megam(PATH_TO_MEGAM_EXECUTABLE)

def get_pos_tagger():
    train_sents = treebank.tagged_sents()
    tagger = nltk.TrigramTagger(train_sents, backoff=
        nltk.BigramTagger(train_sents, backoff=
        nltk.UnigramTagger(train_sents, backoff=
        nltk.DefaultTagger("NN"))))
    return tagger

def get_ner_items():
    items = []
    with open("3c.txt", 'r') as f: 
        for line in f: 
            items.append(word_tokenize(line[:-1]))
    return items

def get_ner_words(ner_items):
    ner_words = set()
    for ner_item in ner_items:
        for ner_word in ner_item:
Code Example #39
    such as persons, locations and organizations in a given document.

    (For the purpose of the training an external maximum entropy model (megam)\
    is used.)

After the chunker has been created, it is pickled for subsequent use.
"""

import nltk
import logging
import pickle

MEGAM_FOLDER = 'topics/megam_0.92/megam'

try:
    nltk.config_megam(MEGAM_FOLDER)
except LookupError:
    nltk.config_megam('megam_0.92/megam')


# define logging configuration
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s')


class BigramChunker(nltk.ChunkParserI):
    """This class defines a bigram chunker"""
    def __init__(self, train_sents):
        """Construct a new BigramChunker instance.
            :param train_sents: Array of sentences with named entities tagged
Code Example #40
def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))


test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])

#train and saving the classifier
nltk.config_megam('megam.exe')
chunker = ConsecutiveNPChunker(train_sents)

from pickle import dump
output = open("chunkModelSVMFeat1.pkl", "wb")
dump(chunker, output, -1)
output.close()
results = chunker.evaluate(test_sents)

print()
#print(results.incorrect())
print("-----------------------------")
#print(results.missed())
print("----------------------------")
#print(chunker.tagger.classifier.show_most_informative_features(n=20,show="all"))
#print(chunker.classifier.explain(featureset,columns=4))
Code Example #41
        for sentence_tree in tree:
            eval_corpus.append(split_tree_tokens(sentence_tree))
    # Evaluate model
    print 'Evaluating...'
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_corpus):
        guessed = chunker.parse(correct.leaves())
        guessed = chunker._parse_to_tagged(guessed)
        chunkscore.score(correct, guessed)
        if i < 3:
            cmp_chunks(correct, guessed)
    print chunkscore
    return chunker


_EVENT_TRAIN_DATA_PATH = resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/events/train')
                                              
_EVENT_EVAL_DATA_PATH = resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/events/eval')

config_megam(resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/megam_i686.opt'))


if __name__ == "__main__":
    chunker = build_event_chunking_model()
Code Example #42
    nltk.download()

in the Python interpreter. Proper usage of demo() and all other functions and
methods is described below.
"""

import time
import re
import pickle

from collections import defaultdict
from nltk import TaggerI, FreqDist, untag, config_megam
from nltk.classify.maxent import MaxentClassifier
from nltk.corpus.reader.conll import ConllCorpusReader

config_megam('/home/dsbatista/megam_i686.opt')


class MaxentPosTagger(TaggerI):
    """
    MaxentPosTagger is a part-of-speech tagger based on Maximum Entropy models.
    """
    def train(self, train_sents, algorithm='megam', rare_word_cutoff=5,
              rare_feat_cutoff=5, uppercase_letters='[A-Z]', trace=3,
              **cutoffs):
        """
        MaxentPosTagger trains a Maximum Entropy model from a C{list} of tagged
        sentences.

        @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
        @param train_sents: A list of tagged sentences. Each sentence is
Code Example #43
    such as persons, locations and organizations in a given document.

    (For the purpose of the training an external maximum entropy model (megam)\
    is used.)

After the chunker has been created, it is pickled for subsequent use.
"""

import nltk
import logging
import pickle

MEGAM_FOLDER = 'topics/megam_0.92/megam'

try:
    nltk.config_megam(MEGAM_FOLDER)
except LookupError:
    nltk.config_megam('megam_0.92/megam')

# define logging configuration
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s')


class BigramChunker(nltk.ChunkParserI):
    """This class defines a bigram chunker"""
    def __init__(self, train_sents):
        """Construct a new BigramChunker instance.
            :param train_sents: Array of sentences with named entities tagged

        """