import cPickle as pickle
import random

import nltk
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist

print "... loading text"
# Train on Austen's "Emma", test on "Sense and Sensibility" (Gutenberg corpus).
text_train = list(nltk.corpus.gutenberg.words('austen-emma.txt'))
print len(set(text_train))  # vocabulary size of the training text
text_test = list(nltk.corpus.gutenberg.words('austen-sense.txt'))

# Alternative input: pickled token lists, shuffled at sentence level.
#with open('./../datasets/t5_train') as f:
#    text_train = (' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_train)
#    text_train = (' . '.join(text_train)).split(' ')
#
#with open('./../datasets/t5_test') as f:
#    text_test = (' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_test)
#    text_test = (' . '.join(text_test)).split(' ')

print "... training model"
# Lidstone smoothing with gamma = 0.2 for each conditional distribution.
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
lm = NgramModel(3, text_train, estimator=estimator)

print "... results"
print lm.generate(50, ['dog'])   # sample 50 words, seeded with the context ['dog']
print lm.perplexity(text_test)
print lm.entropy(text_test)
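# Note: nltk.model.NgramModel was removed in NLTK 3. The following is only a
# rough sketch of the same trigram/Lidstone setup using the nltk.lm package
# (assumes NLTK 3.4+ and Python 3; corpus choice and gamma follow the script
# above, everything else here is an assumption, not part of the original).
import nltk
from nltk.util import ngrams
from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends

train_sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
test_sents = nltk.corpus.gutenberg.sents('austen-sense.txt')

# Build padded training everygrams and a vocabulary from the training sentences.
train_data, vocab = padded_everygram_pipeline(3, train_sents)
lm = Lidstone(0.2, 3)          # gamma = 0.2, trigram order, as above
lm.fit(train_data, vocab)

print(lm.generate(50, text_seed=['dog']))

# entropy()/perplexity() in nltk.lm expect n-gram tuples, not raw word lists.
test_trigrams = [t for sent in test_sents
                 for t in ngrams(pad_both_ends(sent, n=3), 3)]
print(lm.perplexity(test_trigrams))
print(lm.entropy(test_trigrams))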
#!/usr/bin/env python
import nltk
from nltk import bigrams
from nltk import trigrams
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel

# Train on "Persuasion", evaluate cross-entropy on "Sense and Sensibility".
with open('./austen/persuasion.txt', 'r') as training_file:
    raw = training_file.read()
tokens = nltk.word_tokenize(raw)

with open('./austen/sense_and_sensibility.txt', 'r') as test_file:
    test = test_file.read()
test_list = nltk.word_tokenize(test)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

# NgramModel(order, train, pad_left, pad_right, estimator)
model = NgramModel(3, tokens, True, False, estimator)
tri = model.entropy(test_list)
print "tri-gram: " + str(tri)

model = NgramModel(2, tokens, True, False, estimator)
bi = model.entropy(test_list)
print "bi-gram: " + str(bi)
from nltk.util import ngrams
from nltk.corpus import reuters
from nltk.corpus import genesis
from nltk.probability import LaplaceProbDist
from nltk.model import NgramModel
import nltk

sentence = 'She covered a Bob Dylan song for Amnesty International.'

## http://www.inf.ed.ac.uk/teaching/courses/icl/nltk/probability.pdf
## http://www.nltk.org/book/ch02.html

n = 2
bigrams = ngrams(sentence.split(), n)
print bigrams

## Append starting points and ending points
#for grams in sixgrams:
#    print grams

estimator = lambda fdist, bins: LaplaceProbDist(fdist, len(sentence.split()) + 1)
model = NgramModel(2, sentence.split(), estimator=estimator)

print model.generate(1, ("her", "take"))
print
print model.entropy(["she", "covered"])
def demo():
    from nltk.corpus import treebank
    #from nltk.probability import LidstoneProbDist
    #from nltk.probability import WittenBellProbDist
    from nltk.probability import SimpleGoodTuringProbDist
    from nltk.model import NgramModel

    estimator = lambda fdist, bins: SimpleGoodTuringProbDist(fdist, len(fdist) + 1)
    #estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    #estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)

    # Build a corpus of POS tags from the Penn Treebank sample.
    tag_corpus = []
    for (word, tag) in treebank.tagged_words():
        tag_corpus.append(tag)

    lm = NgramModel(2, tag_corpus, estimator=estimator)
    print lm
    lm1 = NgramModel(1, tag_corpus, estimator=estimator)
    print lm1
    print tag_corpus[:20]

    # entropy() expects a sequence of tokens, so pass each tag as a list
    # (a bare string would be iterated character by character).
    sent = ["NN"]
    print lm1.entropy(sent)
    sent = ["DT"]
    print lm1.entropy(sent)
    sent = ["VBZ"]
    print lm1.entropy(sent)
    sent = ["JJ"]
    print lm1.entropy(sent)
    sent = ["RB"]
    print lm1.entropy(sent)
    sent = ["DT", "NN"]
    print lm.entropy(sent)
# (Fragment: the first two lines continue a loop begun earlier; corpus_tokens,
# useful, writer, gutenberg, NgramModel and LidstoneProbDist are defined above
# this excerpt.)
    word = word.lower()
    corpus_tokens.append(word)

print "Adding gutenberg"
for word in gutenberg.words():
    word = word.lower()
    corpus_tokens.append(word)

print "Training Trigram Model"
lm = NgramModel(3, corpus_tokens, True, False,
                lambda f, b: LidstoneProbDist(f, 0.01, f.B() + 1))

# Score each tweet by its cross-entropy under the trigram model.
tweet_entropies = []
count = 1
for sent in useful:
    sent = sent.split()
    percentage = 100 * count / len(useful)
    print "\rChecking entropy : " + str(count) + " of " + str(len(useful)) + " " + str(percentage) + "%",
    entropy = lm.entropy(sent)
    tweet_entropies.append((" ".join(sent), entropy))
    count += 1

# Keep the 80% of tweets with the lowest entropy (i.e. most model-like).
tweet_entropies.sort(key=lambda x: x[1])
threshold = int(len(tweet_entropies) * 0.8)
list_of_tweets = tweet_entropies[:threshold]
print "\n",
final_tweets = [a for (a, b) in list_of_tweets]

count = 1
for tweet in final_tweets:
    percentage = 100 * count / len(final_tweets)
    print '\rWriting: ' + str(count) + " of " + str(len(final_tweets)) + " " + str(percentage) + "%",
    writer.write(tweet + "\n")
    count += 1
# (Fragment: tamil1_alpha, tamil1_alpha_all and the file handles tamil1f/tamil2f
# are defined above this excerpt.)
tamil2_alpha = []
tamil2_alpha_all = []

# The last CSV field on each line holds a "|"-separated symbol sequence;
# strip brackets/newlines and wrap it in start ("St") and end ("En") markers.
for line in tamil1f.readlines()[1:]:
    seq = line.split(",")[-1].replace("\n", "").replace("[", "").replace("]", "").replace("\r", "").split("|")
    tamil1_alpha.append(["St"] + seq + ["En"])
    tamil1_alpha_all += ["St"] + seq + ["En"]

for line in tamil2f.readlines()[1:]:
    seq = line.split(",")[-1].replace("\n", "").replace("[", "").replace("]", "").replace("\r", "").split("|")
    tamil2_alpha.append(["St"] + seq + ["En"])
    tamil2_alpha_all += ["St"] + seq + ["En"]

s_bg1 = nltk.bigrams(tamil1_alpha_all)
s_bg2 = nltk.bigrams(tamil2_alpha_all)
fdist1 = nltk.FreqDist(s_bg1)
fdist2 = nltk.FreqDist(s_bg2)

estimator1 = lambda fdist, bins: LaplaceProbDist(fdist, len(tamil1_alpha_all) + 1)
estimator2 = lambda fdist, bins: LaplaceProbDist(fdist, len(tamil2_alpha_all) + 1)

model1 = NgramModel(3, tamil1_alpha_all, estimator=estimator1)
model2 = NgramModel(3, tamil2_alpha_all, estimator=estimator2)

print model1.entropy(tamil1_alpha[0])
print model1.perplexity(tamil1_alpha[0])
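# Side note (not in the original script): in nltk.model.NgramModel, perplexity
# is defined as 2 raised to the cross-entropy, so the two values printed above
# differ only by that transform. A minimal check, reusing the names above:
seq = tamil1_alpha[0]
assert abs(model1.perplexity(seq) - 2 ** model1.entropy(seq)) < 1e-6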