def __init__(self, train_sents=None, pickle_name=None, save=False):
    '''Trains a new tagger model or loads an existing one.'''
    self._pickle_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'pickles', pickle_name)
    if train_sents is None:
        # Load existing model
        with open(self._pickle_path, 'rb') as f:
            self.classifier = pickle.load(f)
    else:
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = self.npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        nltk.config_megam(r'c:\progs\megam\megam.exe')
        self.classifier = nltk.MaxentClassifier.train(
            train_set, algorithm='megam', trace=0)
        if save:
            # Save the newly trained model under the specified pickle name
            with open(self._pickle_path, 'wb') as f:
                pickle.dump(self.classifier, f)
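A minimal usage sketch of this constructor's two modes. The enclosing class is not named in the snippet, so `MaxentChunkTagger` below is a hypothetical name, and `train_data` stands for a list of tagged training sentences:

# Hypothetical class name; the snippet above only shows __init__.
tagger = MaxentChunkTagger(train_sents=train_data,
                           pickle_name='np_chunker.pickle', save=True)
# Later runs can skip MEGAM training and load the pickled classifier instead:
tagger = MaxentChunkTagger(pickle_name='np_chunker.pickle')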
def main_function():
    conn_analysis = MySQLdb.connect(host=DATABASES['default']['HOST'],
                                    user=DATABASES['default']['USER'],
                                    passwd=DATABASES['default']['PASSWORD'],
                                    db=DATABASES['default']['NAME'])
    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)
    config_megam('/opt/packages')
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam", trace=0)
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    tweets = classify.get_tweets_to_classify(conn_analysis)
    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = classifier.classify(classify.process_tweet(text))
        update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1
    # For the tweets where polarity was determined manually, copy from
    # majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)
    print count_table
def main_function():
    conn_analysis = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                                    user=DATABASES['date_cutoff']['USER'],
                                    passwd=DATABASES['date_cutoff']['PASSWORD'],
                                    db=DATABASES['date_cutoff']['NAME'])
    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = process_tweets(training_tweets)
    config_megam('/opt/packages')
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam", trace=0)
    error_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    count_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    guess_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    full_matrix = {'+': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   '-': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'I': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'O': {'+': 0, '-': 0, 'I': 0, 'O': 0}}
    test_tweets = classify.get_test_tweets(conn_analysis)
    test_feature_set = process_tweets(test_tweets)
    classifier.show_most_informative_features(10)
    classifier_accuracy = accuracy(classifier, test_feature_set)
    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
    conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                           user=DATABASES['date_cutoff']['USER'],
                           passwd=DATABASES['date_cutoff']['PASSWORD'],
                           db=DATABASES['date_cutoff']['NAME'])
    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, '+', total_word_count, best_words)
    training_feature_set += process_bigrams(conn, '-', total_word_count, best_words)
    training_feature_set += process_bigrams(conn, 'I', total_word_count, best_words)
    training_feature_set += process_bigrams(conn, 'O', total_word_count, best_words)
    config_megam('/opt/packages')
    #classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    classifier = NaiveBayesClassifier.train(training_feature_set)
    classifier.show_most_informative_features(10)
    test_tweets = classify.get_test_tweets(conn)
    test_feature_set = process_tweets(test_tweets)
    classifier_accuracy = accuracy(classifier, test_feature_set)
    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
    conn = MySQLdb.connect(host=DATABASES['default']['HOST'],
                           user=DATABASES['default']['USER'],
                           passwd=DATABASES['default']['PASSWORD'],
                           db=DATABASES['default']['NAME'])
    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)
    config_megam('/opt/packages')
    #classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)
    classifier = NaiveBayesClassifier.train(training_feature_set)
    #classifier.show_most_informative_features(50, show='pos')
    #classifier.show_most_informative_features(50, show='neg')
    #classifier.explain(training_feature_set[0][0])
    #print training_feature_set[0]
    error_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    count_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    guess_dict = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    full_matrix = {'+': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   '-': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'I': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'O': {'+': 0, '-': 0, 'I': 0, 'O': 0}}
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    test_tweets = classify.get_test_tweets(conn)
    test_feature_set = classify.process_tweets(test_tweets)
    classifier_accuracy = accuracy(classifier, test_feature_set)
    #print count_table
    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
    conn = MySQLdb.connect(host=DATABASES['default']['HOST'],
                           user=DATABASES['default']['USER'],
                           passwd=DATABASES['default']['PASSWORD'],
                           db=DATABASES['default']['NAME'])
    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)
    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    test_tweets = classify.get_test_tweets(conn)
    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn)
        count_table[guess] += 1
    print "Naive Bayes"
    print count_table
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam", trace=0)
    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_tweet_polarity_ensemble(tweet[0], guess, conn)
        count_table[guess] += 1
    print "Maximum Entropy"
    print count_table
    # Generate the accuracy matrix
    full_matrix = {'+': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   '-': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'I': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'O': {'+': 0, '-': 0, 'I': 0, 'O': 0}}
    for tweet in test_tweets:
        result = classify.run_sql(conn, classify.Statements.CHECK_CONSENSUS % tweet[0])
        guess = result[0][0]
        actual_result = classify.run_sql(conn, classify.Statements.CHECK_MAJORITY % tweet[0])
        actual = actual_result[0][0]
        if guess is not None:
            if actual is not None:
                full_matrix[actual][guess] += 1
    print full_matrix
def demo():
    import nltk
    try:
        nltk.config_megam('/usr/local/bin/megam')
        trainer = lambda x: nltk.MaxentClassifier.train(x, 'megam')
    except ValueError:
        try:
            trainer = lambda x: nltk.MaxentClassifier.train(x, 'BFGS')
        except ValueError:
            trainer = nltk.MaxentClassifier.train
    nltk.classify.rte_classifier(trainer)
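Newer NLTK releases expose the same entry point with an algorithm name instead of a trainer callable (compare the pytest example later in this collection). A minimal sketch, assuming the megam binary lives at the path shown and the NLTK rte corpus has been downloaded:

import nltk
from nltk.classify import rte_classifier

try:
    nltk.config_megam('/usr/local/bin/megam')  # install path is an assumption
    clf = rte_classifier('megam', sample_N=100)
except LookupError:
    # Fall back to a pure-Python maxent trainer when megam is not installed.
    clf = rte_classifier('IIS', sample_N=100)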
def __init__(self, tagger, chunked_sents):
    tagged_sents = [[((w, t), c) for (w, t, c) in
                     nltk.chunk.tree2conlltags(sent)]
                    for sent in chunked_sents]
    train_set = []
    for tagged_sent in tagged_sents:
        untagged_sent = nltk.tag.untag(tagged_sent)
        history = []
        for i, (word, tag) in enumerate(tagged_sent):
            featureset = npchunk_features(untagged_sent, i, history)
            train_set.append((featureset, tag))
            history.append(tag)
    labels = set(label for (tok, label) in train_set)
    nltk.config_megam("./megam_i686.opt")
    self.classifier = nltk.MaxentClassifier.train(train_set, algorithm="megam")
    self.tagger = tagger
def __init__(self, train_sentences):
    nltk.config_megam('{}/../algorithms/megam-64.opt'.format(
        os.path.dirname(os.path.abspath(__file__))))
    train_set = []
    # train_sentences as: [[(('Confidence', 'NN'), 'B-NP'), (('in', 'IN'), 'O')..], ..]
    for tagged_sent in train_sentences:
        # untagged_sent: [(u'Experience', u'NN')]
        untagged_sent = nltk.tag.untag(tagged_sent)
        history = []
        for i, (word, tag) in enumerate(tagged_sent):
            feature_set = self.chunk_features(untagged_sent, i, history)
            train_set.append((feature_set, tag))
            history.append(tag)
    self.classifier = nltk.MaxentClassifier.train(train_set,
                                                  algorithm='megam', trace=0)
def main_function():
    conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'],
                           user=DATABASES['date_cutoff']['USER'],
                           passwd=DATABASES['date_cutoff']['PASSWORD'],
                           db=DATABASES['date_cutoff']['NAME'])
    training_tweets = get_test_tweets(conn)
    #training_feature_set = process_tweets(training_tweets)
    total_word_count = total_words(conn)
    training_feature_set = process_bigrams(conn, '+', total_word_count, best_words)
    training_feature_set += process_bigrams(conn, '-', total_word_count, best_words)
    training_feature_set += process_bigrams(conn, 'I', total_word_count, best_words)
    training_feature_set += process_bigrams(conn, 'O', total_word_count, best_words)
    print "configuring megam"
    config_megam('/opt/packages')
    print "starting training"
    classifier = MaxentClassifier.train(training_feature_set,
                                        algorithm="megam", trace=0)
    print "finished training"
    classifier.show_most_informative_features(40)
    test_tweets = get_training_tweets(conn)
    test_feature_set = process_tweets(test_tweets)
    classifier_accuracy = accuracy(classifier, test_feature_set)
    #full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0},
    #               '-':{'+':0, '-':0, 'I':0, 'O':0},
    #               'I':{'+':0, '-':0, 'I':0, 'O':0},
    #               'O':{'+':0, '-':0, 'I':0, 'O':0}}
    #for f in test_tweets:
    #    guess = classifier.classify(process_tweet(f[1]))
    #    full_matrix[f[2]][guess] += 1
    #print full_matrix
    print "classifier accuracy: " + repr(classifier_accuracy)
def main_function():
    conn_analysis = MySQLdb.connect(host="localhost", user="******",
                                    passwd="tanzania", db="twitter_heart")
    training_tweets = classify.get_training_tweets(conn_analysis)
    training_feature_set = classify.process_tweets(training_tweets)
    tweets = classify.get_tweets_to_classify(conn_analysis)
    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1
    print "Naive Bayes"
    print count_table
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam", trace=0)
    for tweet in tweets:
        text = classify.get_tweet_text(conn_analysis, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_max_ent_polarity(tweet[0], guess, conn_analysis)
        count_table[guess] += 1
    # For the tweets where polarity was determined manually, copy from
    # majority_vote to auto_vote
    fix_manual_tweets(conn_analysis)
    print "Maximum Entropy"
    print count_table
def nltk_maxent_pos_tagger(input_dict):
    name = 'MaxentPosTagger'
    if not input_dict['training_corpus']:
        maxent_tagger = nltk.data.load(
            'taggers/maxent_treebank_pos_tagger/english.pickle')
        name += '-pretrained'
    else:
        nltk.config_megam(settings.MEGAM_EXECUTABLE_PATH)
        maxent_tagger = MaxentPosTagger()
        chunk = input_dict['training_corpus']['chunk']
        corpus = input_dict['training_corpus']['corpus']
        training_corpus = corpus_reader(corpus, chunk)
        if training_corpus:
            maxent_tagger.train(training_corpus)
        else:
            raise AttributeError('training corpus could not be read')
    return {'pos_tagger': {'function': 'tag_sents',
                           'object': maxent_tagger,
                           'name': name}}
# An implementation of the probabilistic chunker from NLTK
import nltk
import cPickle

nltk.config_megam('/home/chris/programs/megam_0.92/megam')

def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
    if i == len(sentence) - 1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i + 1]
    return {"pos": pos,
            "word": word,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}
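To illustrate what this feature extractor produces, here is a small worked example (the sample sentence is made up; the expected output follows directly from the code above):

sent = [('the', 'DT'), ('little', 'JJ'), ('dog', 'NN')]
print(npchunk_features(sent, 2, history=['B-NP', 'I-NP']))
# -> {'pos': 'NN', 'word': 'dog', 'prevpos': 'JJ', 'nextpos': '<END>',
#     'prevpos+pos': 'JJ+NN', 'pos+nextpos': 'NN+<END>', 'tags-since-dt': 'JJ'}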
most_suggestive_words = [w.rstrip() for w in words]

f = open('../../data_files/yelp_sent_pos_text.txt')
lines = f.readlines()
f.close()

tips_pos = []
for line in lines:
    tips_pos.append(eval(line))

print 'Building feature set...', datetime.datetime.now()
count = 0
featuresets = []
for tip, tag in tips_pos:
    features = doc_word_presence(tip)
    featuresets.append((features, tag))
    count += 1
    if count == int(len(tips_pos)):
        break

size = int(len(featuresets) / 2)
train_set, test_set = featuresets[:size], featuresets[size:]

print 'Training classifier...', datetime.datetime.now()
#classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.config_megam('/Users/admin/Downloads/megam_0.92/megam')
classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam')
print 'Finished training classifier', datetime.datetime.now()

print nltk.classify.accuracy(classifier, test_set)
show_confusion_matrix(classifier, test_set)
write_probdist_for(classifier, '../../results/prob_dist/top_word_presence.txt')
# Natural Language Toolkit: code_classifier_chunker
import nltk

nltk.config_megam('/Users/nishantagarwal/Documents/Projects/NLP')

def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
    if i == len(sentence) - 1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i + 1]
    return {"pos": pos,
            "word": word,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}
#!/usr/bin/python
# -*- coding: utf-8 -*-
import MySQLdb as mdb
import sys
import re
from nltk import config_megam
from nltk import MaxentClassifier
from nltk import classify
from numpy import array_split
import os.path

config_megam('/var/www/test/tagging/tools/db/tags/classifier/MEGAM/megam-64.opt')
algorithm = 'MEGAM'

def getClassesDict():
    f = open('hindiclasses.sorted.txt')
    classesDict = dict()
    for line in f:
        fields = line.split()
        classesDict[fields[0]] = fields[1]
    f.close()
    return classesDict

classesDict = getClassesDict()

def extractWordFeature(wordDict, sentence, i):
#!/usr/bin/python
import nltk
#nltk.config_megam('/home/aritter/local/bin/megam')
nltk.config_megam('/usr/local/bin/megam')
import sys
import math
import random

MAX_NGRAM = 6

class StyleMetric:
    source_vector = {}
    target_vector = {}
    training = []

    def __init__(self, corpus_source, corpus_target):
        # Read in source corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1, MAX_NGRAM):
                for i in range(len(words) - gram + 1):
                    ngram = " ".join(words[i:i + gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
Training: ClassifierTraining
Prediction: Classifier
"""
import sys
sys.path.append('../')
import argparse
import cPickle as p
import operator
import os
import utils
from ERG import AMR
import nltk
from nltk.classify import MaxentClassifier, accuracy
nltk.config_megam("/usr/local/bin/megam.opt")
from scipy.stats import rankdata

class ClassifierTraining(object):
    def __init__(self, ftrain, fdev, ftest, wdir, delexicalized=True):
        self.train_amrs, self.dev_amrs, self.test_amrs = [], [], []
        self.delexicalized = delexicalized
        print 'PARSING...'
        for f in os.listdir(ftrain):
            self.train_amrs.extend(self.parse(os.path.join(ftrain, f)))
        for f in os.listdir(fdev):
            self.dev_amrs.extend(self.parse(os.path.join(fdev, f)))
#!/usr/bin/python
import nltk
nltk.config_megam('/home/aritter/local/bin/megam')
import sys
import math
import random

MAX_NGRAM = 6

class StyleMetric:
    source_vector = {}
    target_vector = {}
    training = []

    def __init__(self, corpus_source, corpus_target):
        # Read in source corpus
        nlines = 0
        for line in open(corpus_source):
            #TODO: can use more data for training maxent model if more memory is available
            line = line.strip()
            words = nltk.word_tokenize(line)
            sentenceDict = {}
            for gram in range(1, MAX_NGRAM):
                for i in range(len(words) - gram + 1):
                    ngram = " ".join(words[i:i + gram])
                    self.source_vector[ngram] = 1
                    #self.source_vector[ngram] = self.source_vector.get(ngram, 0) + 1
import nltk
import os
from cPickle import load, dump
from nltk.corpus import conll2000
from settings import ROOT

chunker_path = ROOT + 'vendor/parsers/consecutive_np_chunker.pk1'
megam_path = ROOT + 'vendor/megam_i686.opt'
nltk.config_megam(megam_path)

# Natural Language Toolkit: code_classifier_chunker
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
    return {'pos': pos, 'word': word, 'prevpos': prevpos}

class ConsecutiveNPChunkTagger(nltk.TaggerI):  # [_consec-chunk-tagger]
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)  # [_consec-use-fe]
def test_rte_classification_with_megam(self):
    try:
        config_megam()
    except (LookupError, AttributeError):
        pytest.skip("Skipping tests with dependencies on MEGAM")
    clf = rte_classifier("megam", sample_N=100)
import time
import re
from collections import defaultdict

from nltk import TaggerI, FreqDist, untag, config_megam
from nltk.classify.maxent import MaxentClassifier

PATH_TO_MEGAM_EXECUTABLE = "/usr/bin/megam"
config_megam(PATH_TO_MEGAM_EXECUTABLE)

class MaxentPosTagger(TaggerI):
    def train(self, train_sents, algorithm='megam', rare_word_cutoff=5,
              rare_feat_cutoff=5, uppercase='[A-Z]', trace=3, **cutoffs):
        self.uppercase = uppercase
        self.word_freqdist = self.word_freqs(train_sents)
        self.featuresets = self.featsets(train_sents, rare_word_cutoff)
        self.features_freqdist = self.gen_the_feat_freqs(self.featuresets)
        self.cut_rare_feats(self.featuresets, rare_feat_cutoff)
        t1 = time.time()
        self.classifier = MaxentClassifier.train(self.featuresets, algorithm,
                                                 trace, **cutoffs)
        t2 = time.time()
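A hedged usage sketch for this tagger. It assumes the rest of MaxentPosTagger (word_freqs, featsets, and a TaggerI-compliant tag method, all omitted above) is defined as in the full recipe, that megam is installed at the configured path, and that the treebank corpus has been downloaded; the small training slice just keeps the example fast:

from nltk.corpus import treebank

tagger = MaxentPosTagger()
tagger.train(treebank.tagged_sents()[:500], trace=0)  # small slice for speed
print(tagger.tag(['This', 'is', 'an', 'example']))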
import sys
import time
import collections
import pickle
import itertools
import math

import nltk
from nltk.corpus import stopwords
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from pymongo import MongoClient
from svmutil import *

def remove_stopwords(text):
    r = stopwords.words('english')
    r.append('rt')
    return [w for w in text if not w in r]

nltk.config_megam('.')
start_time = time.time()

connection = MongoClient('localhost', 27017)
db = connection.local
sad_col = db['neg_emoticons']
hap_col = db['pos_emoticons']
h, s = [], []
s = sad_col.find()
h = hap_col.find()
pos_tweets, neg_tweets = [], []
if len(sys.argv) > 2:
    count = int(sys.argv[2]) / 2
def active_megam():
    if nltk.megam._megam_bin is None:
        import os
        path = os.getcwd()
        nltk.config_megam(path + '/megam/megam-64.opt')
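A minimal sketch of how such a guard is typically used, so config_megam runs only when no binary has been registered yet (train_set and the relative megam path are assumptions here):

# Configure MEGAM once, then train as usual.
active_megam()
classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam', trace=0)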
def main_function():
    conn = MySQLdb.connect(host=DATABASES['ensemble']['HOST'],
                           user=DATABASES['ensemble']['USER'],
                           passwd=DATABASES['ensemble']['PASSWORD'],
                           db=DATABASES['ensemble']['NAME'])
    training_tweets = classify.get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)
    bayes_classifier = NaiveBayesClassifier.train(training_feature_set)
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    test_tweets = classify.get_test_tweets(conn)
    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = bayes_classifier.classify(classify.process_tweet(text))
        classify.update_tweet_polarity(tweet[0], guess, conn)
        count_table[guess] += 1
    print "Naive Bayes"
    print count_table
    count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}
    config_megam('/opt/packages')
    max_ent_classifier = MaxentClassifier.train(training_feature_set,
                                                algorithm="megam", trace=0)
    for tweet in test_tweets:
        text = classify.get_tweet_text(conn, tweet[0])[0][0]
        guess = max_ent_classifier.classify(classify.process_tweet(text))
        update_tweet_polarity_ensemble(tweet[0], guess, conn)
        count_table[guess] += 1
    print "Maximum Entropy"
    print count_table
    # Generate the accuracy matrix
    full_matrix = {'+': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   '-': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'I': {'+': 0, '-': 0, 'I': 0, 'O': 0},
                   'O': {'+': 0, '-': 0, 'I': 0, 'O': 0}}
    for tweet in test_tweets:
        result = classify.run_sql(conn,
                                  classify.Statements.CHECK_CONSENSUS % tweet[0])
        guess = result[0][0]
        actual_result = classify.run_sql(conn,
                                         classify.Statements.CHECK_MAJORITY % tweet[0])
        actual = actual_result[0][0]
        if guess is not None:
            if actual is not None:
                full_matrix[actual][guess] += 1
    print full_matrix
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

df = pd.DataFrame(confusionMatrix)
df.columns = TAGS
df.index = TAGS
print(df)
print("Accuracy :", 100 * corr / total)

if __name__ == "__main__":
    nltk.download("conll2000")
    nltk.config_megam('./megam-64.opt')
    corpus_train = nltk.corpus.conll2000.chunked_sents('train.txt')
    corpus_test = nltk.corpus.conll2000.chunked_sents("test.txt")

    def preprocess(sent):
        tgs = nltk.tree2conlltags(sent)
        tgs = [(w, pos, t[0]) for w, pos, t in tgs]
        return tgs

    TRAIN_DATA = [preprocess(sent) for sent in corpus_train]
    TEST_DATA = [preprocess(sent) for sent in corpus_test]
    cP = ChunkTagger(TRAIN_DATA)
    cP.evaluate(TEST_DATA)
    # TRAIN_DATA_FILE = open("../assignment2dataset/train.txt","r")
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w, t), c) for (w, t, c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.util.conlltags2tree(conlltags)

if __name__ == '__main__':
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    #unigram_chunker = BigramChunker(train_sents)
    #unigram_chunker = UnigramChunker(train_sents)
    #print unigram_chunker.evaluate(test_sents)
    nltk.config_megam(bin='/home/alpha/work/env/megam_0.92/megam')
    chunker = ConsecutiveNPChunker(train_sents)
    print chunker.evaluate(test_sents)
import nltk
import collections
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
from nltk import config_megam
import re
import numpy as np

#nltk.download('punkt')
PATH_TO_MEGAM_EXECUTABLE = "./MEGAM/megam-64"
config_megam(PATH_TO_MEGAM_EXECUTABLE)

def get_pos_tagger():
    train_sents = treebank.tagged_sents()
    tagger = nltk.TrigramTagger(
        train_sents,
        backoff=nltk.BigramTagger(
            train_sents,
            backoff=nltk.UnigramTagger(train_sents,
                                       backoff=nltk.DefaultTagger("NN"))))
    return tagger

def get_ner_items():
    items = []
    with open("3c.txt", 'r') as f:
        for line in f:
            items.append(word_tokenize(line[:-1]))
    return items

def get_ner_words(ner_items):
    ner_words = set()
    for ner_item in ner_items:
        for ner_word in ner_item:
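For reference, a quick sketch of calling get_pos_tagger() from the snippet above (assumes the treebank corpus has been downloaded; training the backoff chain takes a moment, and the sample tokens are made up):

pos_tagger = get_pos_tagger()
print(pos_tagger.tag(['Megam', 'is', 'configured', 'above']))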
such as persons, locations and organizations in a given document.
(For the purpose of training, an external maximum entropy model (megam)
is used.)
After the chunker has been created, it is pickled for subsequent use.
"""
import nltk
import logging
import pickle

MEGAM_FOLDER = 'topics/megam_0.92/megam'
try:
    nltk.config_megam(MEGAM_FOLDER)
except LookupError:
    nltk.config_megam('megam_0.92/megam')

# define logging configuration
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s')

class BigramChunker(nltk.ChunkParserI):
    """This class defines a bigram chunker"""

    def __init__(self, train_sents):
        """Construct a new BigramChunker instance.

        :param train_sents: Array of sentences with named entities tagged
        """
def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])

# Train and save the classifier
nltk.config_megam('megam.exe')
chunker = ConsecutiveNPChunker(train_sents)

from pickle import dump
output = open("chunkModelSVMFeat1.pkl", "wb")
dump(chunker, output, -1)
output.close()

results = chunker.evaluate(test_sents)
print()
#print(results.incorrect())
print("-----------------------------")
#print(results.missed())
print("----------------------------")
#print(chunker.tagger.classifier.show_most_informative_features(n=20,show="all"))
#print(chunker.classifier.explain(featureset,columns=4))
    for sentence_tree in tree:
        eval_corpus.append(split_tree_tokens(sentence_tree))
    # Evaluate model
    print 'Evaluating...'
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_corpus):
        guessed = chunker.parse(correct.leaves())
        guessed = chunker._parse_to_tagged(guessed)
        chunkscore.score(correct, guessed)
        if i < 3:
            cmp_chunks(correct, guessed)
    print chunkscore
    return chunker

_EVENT_TRAIN_DATA_PATH = resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/events/train')

_EVENT_EVAL_DATA_PATH = resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/events/eval')

config_megam(resource_filename(
    Requirement.parse('cheshire3'),
    'cheshire3/data/textmining/megam_i686.opt'))

if __name__ == "__main__":
    chunker = build_event_chunking_model()
nltk.download() in the Python interpreter.

Proper usage of demo() and all other functions and methods is described
below.
"""

import time
import re
import pickle
from collections import defaultdict

from nltk import TaggerI, FreqDist, untag, config_megam
from nltk.classify.maxent import MaxentClassifier
from nltk.corpus.reader.conll import ConllCorpusReader

config_megam('/home/dsbatista/megam_i686.opt')

class MaxentPosTagger(TaggerI):
    """
    MaxentPosTagger is a part-of-speech tagger based on Maximum Entropy models.
    """
    def train(self, train_sents, algorithm='megam', rare_word_cutoff=5,
              rare_feat_cutoff=5, uppercase_letters='[A-Z]', trace=3,
              **cutoffs):
        """
        MaxentPosTagger trains a Maximum Entropy model from a C{list} of
        tagged sentences.

        @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
        @param train_sents: A list of tagged sentences. Each sentence is