def __init__(self, train_sents=None, pickle_name=None, save=False):
    '''Trains a new tagger model or loads an existing one.'''
    self._pickle_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'pickles', pickle_name)
    if train_sents is None:
        # Load existing model
        with open(self._pickle_path, 'rb') as f:
            self.classifier = pickle.load(f)
    else:
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = self.npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        nltk.config_megam(r'c:\progs\megam\megam.exe')
        self.classifier = nltk.MaxentClassifier.train(
            train_set, algorithm='megam', trace=0)
        if save:
            # Save the newly trained model under the specified pickle name
            with open(self._pickle_path, 'wb') as f:
                pickle.dump(self.classifier, f)
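A minimal usage sketch of this constructor's two modes. The enclosing class is not named in the snippet, so `MaxentChunkTagger` below is a hypothetical name, and `train_data` stands for a list of tagged training sentences:

# Hypothetical class name; the snippet above only shows __init__.
tagger = MaxentChunkTagger(train_sents=train_data,
                           pickle_name='np_chunker.pickle', save=True)
# Later runs can skip MEGAM training and load the pickled classifier instead:
tagger = MaxentChunkTagger(pickle_name='np_chunker.pickle')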
def main_function():
    conn_analysis = MySQLdb.connect(host=DATABASES['default']['HOST'],
                                    user=DATABASES['default']['USER'],
                                    passwd=DATABASES['default']['PASSWORD'],
                                    db=DATABASES['default']['NAME'])
    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)
    config_megam('/opt/packages')
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam", trace=0)
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    tweets = classify.get_tweets_to_classify(conn_analysis)
    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = classifier.classify(classify.process_tweet(text))
        update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1
    # For the tweets where polarity was determined manually, copy from
    # majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)
    print count_table
def main_function():
    conn_analysis = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                                    user=DATABASES['date_cutoff']['USER'],
                                    passwd=DATABASES['date_cutoff']['PASSWORD'],
                                    db=DATABASES['date_cutoff']['NAME'])
    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = process_tweets(training_tweets)
    config_megam('/opt/packages')
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam", trace=0)
    error_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    count_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    guess_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    full_matrix = {'+': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   '-': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'I': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'O': {'+': 0, '-': 0, 'I': 0, 'O': 0}}
    test_tweets = classify.get_test_tweets(conn_analysis)
    test_feature_set = process_tweets(test_tweets)
    classifier.show_most_informative_features(10)
    classifier_accuracy = accuracy(classifier, test_feature_set)
    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
    conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                           user=DATABASES['date_cutoff']['USER'],
                           passwd=DATABASES['date_cutoff']['PASSWORD'],
                           db=DATABASES['date_cutoff']['NAME'])
    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, '+', total_word_count, best_words)
    training_feature_set += process_bigrams(conn, '-', total_word_count, best_words)
    training_feature_set += process_bigrams(conn, 'I', total_word_count, best_words)
    training_feature_set += process_bigrams(conn, 'O', total_word_count, best_words)
    config_megam('/opt/packages')
    #classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    classifier = NaiveBayesClassifier.train(training_feature_set)
    classifier.show_most_informative_features(10)
    test_tweets = classify.get_test_tweets(conn)
    test_feature_set = process_tweets(test_tweets)
    classifier_accuracy = accuracy(classifier, test_feature_set)
    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
    conn = MySQLdb.connect(host=DATABASES['default']['HOST'],
                           user=DATABASES['default']['USER'],
                           passwd=DATABASES['default']['PASSWORD'],
                           db=DATABASES['default']['NAME'])
    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)
    config_megam('/opt/packages')
    #classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    classifier = NaiveBayesClassifier.train(training_feature_set)
    #classifier.show_most_informative_features(50, show='pos')
    #classifier.show_most_informative_features(50, show='neg')
    #classifier.explain(training_feature_set[0][0])
    #print training_feature_set[0]
    error_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    count_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    guess_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    full_matrix = {'+': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   '-': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'I': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'O': {'+': 0, '-': 0, 'I': 0, 'O': 0}}
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    test_tweets = classify.get_test_tweets(conn)
    test_feature_set = classify.process_tweets(test_tweets)
    classifier_accuracy = accuracy(classifier, test_feature_set)
    #print count_table
    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
    conn = MySQLdb.connect(host=DATABASES['default']['HOST'],
                           user=DATABASES['default']['USER'],
                           passwd=DATABASES['default']['PASSWORD'],
                           db=DATABASES['default']['NAME'])
    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)
    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    test_tweets = classify.get_test_tweets(conn)
    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn)
        count_table[guess] += 1
    print "Naive Bayes"
    print count_table
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam", trace=0)
    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_tweet_polarity_ensemble(tweet[0], guess, conn)
        count_table[guess] += 1
    print "Maximum Entropy"
    print count_table
    # Generate the accuracy matrix
    full_matrix = {'+': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   '-': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'I': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'O': {'+': 0, '-': 0, 'I': 0, 'O': 0}}
    for tweet in test_tweets:
        result = classify.run_sql(conn, classify.Statements.CHECK_CONSENSUS % tweet[0])
        guess = result[0][0]
        actual_result = classify.run_sql(conn, classify.Statements.CHECK_MAJORITY % tweet[0])
        actual = actual_result[0][0]
        if guess is not None:
            if actual is not None:
                full_matrix[actual][guess] += 1
    print full_matrix
def demo():
    import nltk
    try:
        nltk.config_megam('/usr/local/bin/megam')
        trainer = lambda x: nltk.MaxentClassifier.train(x, 'megam')
    except ValueError:
        try:
            trainer = lambda x: nltk.MaxentClassifier.train(x, 'BFGS')
        except ValueError:
            trainer = nltk.MaxentClassifier.train
    nltk.classify.rte_classifier(trainer)
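Newer NLTK releases expose the same entry point with an algorithm name instead of a trainer callable (compare the pytest example later in this collection). A minimal sketch, assuming the megam binary lives at the path shown and the NLTK rte corpus has been downloaded:

import nltk
from nltk.classify import rte_classifier

try:
    nltk.config_megam('/usr/local/bin/megam')  # install path is an assumption
    clf = rte_classifier('megam', sample_N=100)
except LookupError:
    # Fall back to a pure-Python maxent trainer when megam is not installed.
    clf = rte_classifier('IIS', sample_N=100)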
def __init__(self, tagger, chunked_sents):
    tagged_sents = [[((w, t), c) for (w, t, c) in
                     nltk.chunk.tree2conlltags(sent)]
                    for sent in chunked_sents]
    train_set = []
    for tagged_sent in tagged_sents:
        untagged_sent = nltk.tag.untag(tagged_sent)
        history = []
        for i, (word, tag) in enumerate(tagged_sent):
            featureset = npchunk_features(untagged_sent, i, history)
            train_set.append((featureset, tag))
            history.append(tag)
    labels = set(label for (tok, label) in train_set)
    nltk.config_megam("./megam_i686.opt")
    self.classifier = nltk.MaxentClassifier.train(train_set, algorithm="megam")
    self.tagger = tagger
def __init__(self, train_sentences):
    nltk.config_megam('{}/../algorithms/megam-64.opt'.format(
        os.path.dirname(os.path.abspath(__file__))))
    train_set = []
    # train_sentences as: [[(('Confidence', 'NN'), 'B-NP'), (('in', 'IN'), 'O')..], ..]
    for tagged_sent in train_sentences:
        # untagged_sent: [(u'Experience', u'NN')]
        untagged_sent = nltk.tag.untag(tagged_sent)
        history = []
        for i, (word, tag) in enumerate(tagged_sent):
            feature_set = self.chunk_features(untagged_sent, i, history)
            train_set.append((feature_set, tag))
            history.append(tag)
    self.classifier = nltk.MaxentClassifier.train(train_set,
                                                  algorithm='megam', trace=0)
def main_function():
    conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                           user=DATABASES['date_cutoff']['USER'],
                           passwd=DATABASES['date_cutoff']['PASSWORD'],
                           db=DATABASES['date_cutoff']['NAME'])
    training_tweets = get_test_tweets(conn)
    #training_feature_set = process_tweets(training_tweets)
    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, '+', total_word_count, best_words)
    training_feature_set += process_bigrams(conn, '-', total_word_count, best_words)
    training_feature_set += process_bigrams(conn, 'I', total_word_count, best_words)
    training_feature_set += process_bigrams(conn, 'O', total_word_count, best_words)
    print "configuring megam"
    config_megam('/opt/packages')
    print "starting training"
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam", trace=0)
    print "finished training"
    classifier.show_most_informative_features(40)
    test_tweets = get_training_tweets(conn)
    test_feature_set = process_tweets(test_tweets)
    classifier_accuracy = accuracy(classifier, test_feature_set)
    #full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0},
    #               '-':{'+':0, '-':0, 'I':0, 'O':0},
    #               'I':{'+':0, '-':0, 'I':0, 'O':0},
    #               'O':{'+':0, '-':0, 'I':0, 'O':0}}
    #for f in test_tweets:
    #    guess = classifier.classify(process_tweet(f[1]))
    #    full_matrix[f[2]][guess] += 1
    #print full_matrix
    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
    conn_analysis = MySQLdb.connect(host="localhost", user="******",
                                    passwd="tanzania", db="twitter_heart")
    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)
    tweets = classify.get_tweets_to_classify(conn_analysis)
    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1
    print "Naive Bayes"
    print count_table
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam", trace=0)
    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_max_ent_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1
    # For the tweets where polarity was determined manually, copy from
    # majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)
    print "Maximum Entropy"
    print count_table
def nltk_maxent_pos_tagger(input_dict):
    name = 'MaxentPosTagger'
    if not input_dict['training_corpus']:
        maxent_tagger = nltk.data.load(
            'taggers/maxent_treebank_pos_tagger/english.pickle')
        name += '-pretrained'
    else:
        nltk.config_megam(settings.MEGAM_EXECUTABLE_PATH)
        maxent_tagger = MaxentPosTagger()
        chunk = input_dict['training_corpus']['chunk']
        corpus = input_dict['training_corpus']['corpus']
        training_corpus = corpus_reader(corpus, chunk)
        if training_corpus:
            maxent_tagger.train(training_corpus)
        else:
            raise AttributeError('training corpus could not be read')
    return {'pos_tagger': {'function': 'tag_sents',
                           'object': maxent_tagger,
                           'name': name}}
# An implementation of the probabilistic chunker from NLTK
import nltk
import cPickle

nltk.config_megam('/home/chris/programs/megam_0.92/megam')

def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
    if i == len(sentence) - 1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i + 1]
    return {"pos": pos,
            "word": word,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}
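To illustrate what this feature extractor produces, here is a small worked example (the sample sentence is made up; the expected output follows directly from the code above):

sent = [('the', 'DT'), ('little', 'JJ'), ('dog', 'NN')]
print(npchunk_features(sent, 2, history=['B-NP', 'I-NP']))
# -> {'pos': 'NN', 'word': 'dog', 'prevpos': 'JJ', 'nextpos': '<END>',
#     'prevpos+pos': 'JJ+NN', 'pos+nextpos': 'NN+<END>', 'tags-since-dt': 'JJ'}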
most_suggestive_words = [w.rstrip() for w in words]

f = open('../../data_files/yelp_sent_pos_text.txt')
lines = f.readlines()
f.close()

tips_pos = []
for line in lines:
    tips_pos.append(eval(line))

print 'Building feature set...', datetime.datetime.now()
count = 0
featuresets = []
for tip, tag in tips_pos:
    features = doc_word_presence(tip)
    featuresets.append((features, tag))
    count += 1
    if count == int(len(tips_pos)):
        break

size = int(len(featuresets) / 2)
train_set, test_set = featuresets[:size], featuresets[size:]

print 'Training classifier...', datetime.datetime.now()
#classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.config_megam('/Users/admin/Downloads/megam_0.92/megam')
classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam')
print 'Finished training classifier', datetime.datetime.now()

print nltk.classify.accuracy(classifier, test_set)
show_confusion_matrix(classifier, test_set)
write_probdist_for(classifier, '../../results/prob_dist/top_word_presence.txt')
# Natural Language Toolkit: code_classifier_chunker
import nltk

nltk.config_megam('/Users/nishantagarwal/Documents/Projects/NLP')

def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
    if i == len(sentence) - 1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i + 1]
    return {"pos": pos,
            "word": word,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}
#!/usr/bin/python
# -*- coding: utf-8 -*-
import MySQLdb as mdb
import sys
import re
from nltk import config_megam
from nltk import MaxentClassifier
from nltk import classify
from numpy import array_split
import os.path

config_megam('/var/www/test/tagging/tools/db/tags/classifier/MEGAM/megam-64.opt')
algorithm = 'MEGAM'

def getClassesDict():
    f = open('hindiclasses.sorted.txt')
    classesDict = dict()
    for line in f:
        fields = line.split()
        classesDict[fields[0]] = fields[1]
    f.close()
    return classesDict

classesDict = getClassesDict()

def extractWordFeature(wordDict, sentence, i):
#!/usr/bin/python
import nltk
#nltk.config_megam('/home/aritter/local/bin/megam')
nltk.config_megam('/usr/local/bin/megam')
import sys
import math
import random

MAX_NGRAM = 6

class StyleMetric:
    source_vector = {}
    target_vector = {}
    training = []

    def __init__(self, corpus_source, corpus_target):
        # Read in source corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1, MAX_NGRAM):
                for i in range(len(words) - gram + 1):
                    ngram = " ".join(words[i:i + gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
Training: ClassifierTraining
Prediction: Classifier
"""
import sys
sys.path.append('../')
import argparse
import cPickle as p
import operator
import os
import utils
from ERG import AMR
import nltk
from nltk.classify import MaxentClassifier, accuracy
nltk.config_megam("/usr/local/bin/megam.opt")
from scipy.stats import rankdata

class ClassifierTraining(object):
    def __init__(self, ftrain, fdev, ftest, wdir, delexicalized=True):
        self.train_amrs, self.dev_amrs, self.test_amrs = [], [], []
        self.delexicalized = delexicalized
        print 'PARSING...'
        for f in os.listdir(ftrain):
            self.train_amrs.extend(self.parse(os.path.join(ftrain, f)))
        for f in os.listdir(fdev):
            self.dev_amrs.extend(self.parse(os.path.join(fdev, f)))
#!/usr/bin/python
import nltk
nltk.config_megam('/home/aritter/local/bin/megam')
import sys
import math
import random

MAX_NGRAM = 6

class StyleMetric:
    source_vector = {}
    target_vector = {}
    training = []

    def __init__(self, corpus_source, corpus_target):
        # Read in source corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1, MAX_NGRAM):
                for i in range(len(words) - gram + 1):
                    ngram = " ".join(words[i:i + gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
import nltk
import os
from cPickle import load, dump
from nltk.corpus import conll2000
from settings import ROOT

chunker_path = ROOT + 'vendor/parsers/consecutive_np_chunker.pk1'
megam_path = ROOT + 'vendor/megam_i686.opt'
nltk.config_megam(megam_path)

# Natural Language Toolkit: code_classifier_chunker
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
    return {'pos': pos, 'word': word, 'prevpos': prevpos}

class ConsecutiveNPChunkTagger(nltk.TaggerI):  # [_consec-chunk-tagger]
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)  # [_consec-use-fe]
def test_rte_classification_with_megam(self):
    try:
        config_megam()
    except (LookupError, AttributeError):
        pytest.skip("Skipping tests with dependencies on MEGAM")
    clf = rte_classifier("megam", sample_N=100)
import time
import re
from collections import defaultdict

from nltk import TaggerI, FreqDist, untag, config_megam
from nltk.classify.maxent import MaxentClassifier

PATH_TO_MEGAM_EXECUTABLE = "/usr/bin/megam"
config_megam(PATH_TO_MEGAM_EXECUTABLE)

class MaxentPosTagger(TaggerI):
    def train(self, train_sents, algorithm='megam', rare_word_cutoff=5,
              rare_feat_cutoff=5, uppercase='[A-Z]', trace=3, **cutoffs):
        self.uppercase = uppercase
        self.word_freqdist = self.word_freqs(train_sents)
        self.featuresets = self.featsets(train_sents, rare_word_cutoff)
        self.features_freqdist = self.gen_the_feat_freqs(self.featuresets)
        self.cut_rare_feats(self.featuresets, rare_feat_cutoff)
        t1 = time.time()
        self.classifier = MaxentClassifier.train(self.featuresets, algorithm,
                                                 trace, **cutoffs)
        t2 = time.time()
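A hedged usage sketch for this tagger. It assumes the rest of MaxentPosTagger (word_freqs, featsets, and a TaggerI-compliant tag method, all omitted above) is defined as in the full recipe, that megam is installed at the configured path, and that the treebank corpus has been downloaded; the small training slice just keeps the example fast:

from nltk.corpus import treebank

tagger = MaxentPosTagger()
tagger.train(treebank.tagged_sents()[:500], trace=0)  # small slice for speed
print(tagger.tag(['This', 'is', 'an', 'example']))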
import sys
import time
import collections
import pickle
import itertools
import math

import nltk
from nltk.corpus import stopwords
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from pymongo import MongoClient
from svmutil import *

def remove_stopwords(text):
    r = stopwords.words('english')
    r.append('rt')
    return [w for w in text if not w in r]

nltk.config_megam('.')
start_time = time.time()

connection = MongoClient('localhost', 27017)
db = connection.local
sad_col = db['neg_emoticons']
hap_col = db['pos_emoticons']
h, s = [], []
s = sad_col.find()
h = hap_col.find()
pos_tweets, neg_tweets = [], []
if len(sys.argv) > 2:
    count = int(sys.argv[2]) / 2
def active_megam():
    if nltk.megam._megam_bin is None:
        import os
        path = os.getcwd()
        nltk.config_megam(path + '/megam/megam-64.opt')
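A minimal sketch of how such a guard is typically used, so config_megam runs only when no binary has been registered yet (train_set and the relative megam path are assumptions here):

# Configure MEGAM once, then train as usual.
active_megam()
classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam', trace=0)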
def main_function():
    conn = MySQLdb.connect(host=DATABASES['ensemble']['HOST'],
                           user=DATABASES['ensemble']['USER'],
                           passwd=DATABASES['ensemble']['PASSWORD'],
                           db=DATABASES['ensemble']['NAME'])
    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)
    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    test_tweets = classify.get_test_tweets(conn)
    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn)
        count_table[guess] += 1
    print "Naive Bayes"
    print count_table
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam", trace=0)
    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_tweet_polarity_ensemble(tweet[0], guess, conn)
        count_table[guess] += 1
    print "Maximum Entropy"
    print count_table
    # Generate the accuracy matrix
    full_matrix = {'+': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   '-': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'I': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'O': {'+': 0, '-': 0, 'I': 0, 'O': 0}}
    for tweet in test_tweets:
        result = classify.run_sql(conn,
                                  classify.Statements.CHECK_CONSENSUS % tweet[0])
        guess = result[0][0]
        actual_result = classify.run_sql(conn,
                                         classify.Statements.CHECK_MAJORITY % tweet[0])
        actual = actual_result[0][0]
        if guess is not None:
            if actual is not None:
                full_matrix[actual][guess] += 1
    print full_matrix
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

df = pd.DataFrame(confusionMatrix)
df.columns = TAGS
df.index = TAGS
print(df)
print("Accuracy :", 100 * corr / total)

if __name__ == "__main__":
    nltk.download("conll2000")
    nltk.config_megam('./megam-64.opt')
    corpus_train = nltk.corpus.conll2000.chunked_sents('train.txt')
    corpus_test = nltk.corpus.conll2000.chunked_sents("test.txt")

    def preprocess(sent):
        tgs = nltk.tree2conlltags(sent)
        tgs = [(w, pos, t[0]) for w, pos, t in tgs]
        return tgs

    TRAIN_DATA = [preprocess(sent) for sent in corpus_train]
    TEST_DATA = [preprocess(sent) for sent in corpus_test]
    cP = ChunkTagger(TRAIN_DATA)
    cP.evaluate(TEST_DATA)
    # TRAIN_DATA_FILE = open("../assignment2dataset/train.txt","r")
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w, t), c) for (w, t, c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.util.conlltags2tree(conlltags)

if __name__ == '__main__':
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    #unigram_chunker = BigramChunker(train_sents)
    #unigram_chunker = UnigramChunker(train_sents)
    #print unigram_chunker.evaluate(test_sents)
    nltk.config_megam(bin='/home/alpha/work/env/megam_0.92/megam')
    chunker = ConsecutiveNPChunker(train_sents)
    print chunker.evaluate(test_sents)
import nltk
import collections
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
from nltk import config_megam
import re
import numpy as np

#nltk.download('punkt')
PATH_TO_MEGAM_EXECUTABLE = "./MEGAM/megam-64"
config_megam(PATH_TO_MEGAM_EXECUTABLE)

def get_pos_tagger():
    train_sents = treebank.tagged_sents()
    tagger = nltk.TrigramTagger(
        train_sents,
        backoff=nltk.BigramTagger(
            train_sents,
            backoff=nltk.UnigramTagger(train_sents,
                                       backoff=nltk.DefaultTagger("NN"))))
    return tagger

def get_ner_items():
    items = []
    with open("3c.txt", 'r') as f:
        for line in f:
            items.append(word_tokenize(line[:-1]))
    return items

def get_ner_words(ner_items):
    ner_words = set()
    for ner_item in ner_items:
        for ner_word in ner_item:
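For reference, a quick sketch of calling get_pos_tagger() from the snippet above (assumes the treebank corpus has been downloaded; training the backoff chain takes a moment, and the sample tokens are made up):

pos_tagger = get_pos_tagger()
print(pos_tagger.tag(['Megam', 'is', 'configured', 'above']))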
such as persons, locations and organizations in a given document.
(For the purpose of training, an external maximum entropy model (megam)
is used.)
After the chunker has been created, it is pickled for subsequent use.
"""
import nltk
import logging
import pickle

MEGAM_FOLDER = 'topics/megam_0.92/megam'
try:
    nltk.config_megam(MEGAM_FOLDER)
except LookupError:
    nltk.config_megam('megam_0.92/megam')

# define logging configuration
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s')

class BigramChunker(nltk.ChunkParserI):
    """This class defines a bigram chunker"""

    def __init__(self, train_sents):
        """Construct a new BigramChunker instance.

        :param train_sents: Array of sentences with named entities tagged
        """
def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])

# Train and save the classifier
nltk.config_megam('megam.exe')
chunker = ConsecutiveNPChunker(train_sents)

from pickle import dump
output = open("chunkModelSVMFeat1.pkl", "wb")
dump(chunker, output, -1)
output.close()

results = chunker.evaluate(test_sents)
print()
#print(results.incorrect())
print("-----------------------------")
#print(results.missed())
print("----------------------------")
#print(chunker.tagger.classifier.show_most_informative_features(n=20,show="all"))
#print(chunker.classifier.explain(featureset,columns=4))
    for sentence_tree in tree:
        eval_corpus.append(split_tree_tokens(sentence_tree))
    # Evaluate model
    print 'Evaluating...'
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_corpus):
        guessed = chunker.parse(correct.leaves())
        guessed = chunker._parse_to_tagged(guessed)
        chunkscore.score(correct, guessed)
        if i < 3:
            cmp_chunks(correct, guessed)
    print chunkscore
    return chunker

_EVENT_TRAIN_DATA_PATH = resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/events/train')

_EVENT_EVAL_DATA_PATH = resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/events/eval')

config_megam(resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/megam_i686.opt'))

if __name__ == "__main__":
    chunker = build_event_chunking_model()
nltk.download() in the Python interpreter.

Proper usage of demo() and all other functions and methods is described
below.
"""

import time
import re
import pickle
from collections import defaultdict

from nltk import TaggerI, FreqDist, untag, config_megam
from nltk.classify.maxent import MaxentClassifier
from nltk.corpus.reader.conll import ConllCorpusReader

config_megam('/home/dsbatista/megam_i686.opt')

class MaxentPosTagger(TaggerI):
    """
    MaxentPosTagger is a part-of-speech tagger based on Maximum Entropy models.
    """
    def train(self, train_sents, algorithm='megam', rare_word_cutoff=5,
              rare_feat_cutoff=5, uppercase_letters='[A-Z]', trace=3,
              **cutoffs):
        """
        MaxentPosTagger trains a Maximum Entropy model from a C{list} of
        tagged sentences.

        @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
        @param train_sents: A list of tagged sentences. Each sentence is