Example #1
import cPickle as pickle
import random
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist
import nltk

print "... loading text"
text_train = list(nltk.corpus.gutenberg.words('austen-emma.txt'))
print len(set(text_train))
text_test = list(nltk.corpus.gutenberg.words('austen-sense.txt'))

#with open('./../datasets/t5_train') as f:
#    text_train =(' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_train)
#    text_train = (' . '.join(text_train)).split(' ')
#    
#with open('./../datasets/t5_test') as f:
#    text_test =(' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_test)
#    text_test = (' . '.join(text_test)).split(' ')

print "... training model"
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 
lm = NgramModel(3, text_train, estimator=estimator)

print "... results"
print lm.generate(50, ['dog'])
print lm.perplexity(text_test)
print lm.entropy(text_test)
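A rough equivalent using the current API, since nltk.model.NgramModel was removed from later NLTK releases: a minimal sketch with nltk.lm (assuming NLTK 3.4 or newer; the Lidstone gamma of 0.2 mirrors the estimator above).

import nltk
from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.util import ngrams

n = 3
train_sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
test_sents = nltk.corpus.gutenberg.sents('austen-sense.txt')

# build padded training everygrams plus the vocabulary stream, then fit
train_data, vocab_data = padded_everygram_pipeline(n, train_sents)
lm = Lidstone(0.2, n)
lm.fit(train_data, vocab_data)

# evaluation in nltk.lm works on ngram tuples rather than raw token lists
test_ngrams = [ng for sent in test_sents
               for ng in ngrams(pad_both_ends(sent, n=n), n)]
print(lm.entropy(test_ngrams))
print(lm.perplexity(test_ngrams))
print(lm.generate(50, text_seed=['dog']))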
Example #2
#!/usr/bin/env python
import nltk
from nltk import bigrams
from nltk import trigrams
from nltk.probability import LidstoneProbDist  
from nltk.model import NgramModel    

with open('./austen/persuasion.txt', 'r') as training_file:
    raw = training_file.read() 
tokens = nltk.word_tokenize(raw)

with open('./austen/sense_and_sensibility.txt', 'r') as test_file:
    test = test_file.read()
test_list = nltk.word_tokenize(test)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
model = NgramModel(3, tokens, True, False, estimator)  # pad_left=True, pad_right=False
tri=model.entropy(test_list)
print "tri-gram: " + str(tri)

model = NgramModel(2, tokens, True, False, estimator)
bi=model.entropy(test_list)
print "bi-gram: " + str(bi)

Example #3
from nltk.util import ngrams
from nltk.corpus import reuters  
from nltk.corpus import genesis  
from nltk.probability import LaplaceProbDist  
from nltk.model import NgramModel
import nltk

sentence = 'She covered a Bob Dylan song for Amnesty International.'

## http://www.inf.ed.ac.uk/teaching/courses/icl/nltk/probability.pdf
## http://www.nltk.org/book/ch02.html

n = 2
bigrams = ngrams(sentence.split(), n)

print bigrams

## Append starting points and ending points

#for grams in sixgrams:
#    print grams
    
estimator = lambda fdist, bins: LaplaceProbDist(fdist, len(sentence.split())+1)

model = NgramModel(2, sentence.split(), estimator=estimator)

print model.generate(1, ("her","take"))
print 
print model.entropy(["she","covered"])
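Regarding the commented note about appending starting and ending points: nltk.util.ngrams can pad the sequence itself. A minimal sketch (the '<s>' and '</s>' symbols are arbitrary choices here, and the left_pad_symbol/right_pad_symbol keywords assume NLTK 3; older releases exposed a single pad_symbol argument):

from nltk.util import ngrams

sentence = 'She covered a Bob Dylan song for Amnesty International.'
# boundary symbols let the first and last words appear as bigram heads and tails
padded_bigrams = ngrams(sentence.split(), 2,
                        pad_left=True, pad_right=True,
                        left_pad_symbol='<s>', right_pad_symbol='</s>')
print(list(padded_bigrams))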
Example #4
def demo():
    from nltk.corpus import treebank 
    #from nltk.probability import LidstoneProbDist
    #from nltk.probability import WittenBellProbDist
    from nltk.probability import SimpleGoodTuringProbDist
    from nltk.model import NgramModel
    estimator = lambda fdist, bins: SimpleGoodTuringProbDist(fdist, len(fdist)+1) 
    #estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 
    #estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2) 
    
    tag_corpus = []
    for (word,tag) in treebank.tagged_words():
        tag_corpus.append(tag)
    lm = NgramModel(2, tag_corpus, estimator=estimator)
    print lm
    lm1 = NgramModel(1, tag_corpus, estimator=estimator)
    print lm1
    print tag_corpus[:20]

    sent = "NN"
    print lm1.entropy(sent) 
    
    sent = "DT "
    print lm1.entropy(sent) 

    sent = "VBZ"
    print lm1.entropy(sent) 
    
    sent = "JJ"
    print lm1.entropy(sent) 
    
    sent = "RB"
    print lm1.entropy(sent) 
    
    sent = "DT NN"
    print lm.entropy(sent) 
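The demo scores tags drawn from the same material the models were trained on; a held-out split gives a more meaningful number. A minimal sketch using the same old NgramModel API (the 90/10 split is an arbitrary choice):

from nltk.corpus import treebank
from nltk.probability import SimpleGoodTuringProbDist
from nltk.model import NgramModel

estimator = lambda fdist, bins: SimpleGoodTuringProbDist(fdist, len(fdist) + 1)
tags = [tag for (word, tag) in treebank.tagged_words()]

# hold out the last 10% of the tag stream for evaluation
split = int(len(tags) * 0.9)
lm_heldout = NgramModel(2, tags[:split], estimator=estimator)
print(lm_heldout.entropy(tags[split:]))      # bits per tag on unseen data
print(lm_heldout.perplexity(tags[split:]))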
Example #5
		word = word.lower()
		corpus_tokens.append(word)
	print "Adding gutenberg"
	for word in gutenberg.words():
		word = word.lower()
		corpus_tokens.append(word)
	print "Training Trigram Model"
	lm = NgramModel(3,corpus_tokens,True,False,lambda f,b:LidstoneProbDist(f,0.01,f.B()+1))

	tweet_entropies = []
	count = 1
	for sent in useful:
		sent = sent.split()
		percentage = 100*count/len(useful)
		print "\rChecking entropy : " + str(count) + " of " + str(len(useful)) + "        " + str(percentage) + "%",
		entropy = lm.entropy(sent)
		tweet_entropies.append((" ".join(sent), entropy))
		count += 1
	tweet_entropies.sort(key=lambda x: x[1])
	# keep the 80% of tweets with the lowest (most predictable) trigram entropy
	threshold = int(len(tweet_entropies) * 0.8)
	list_of_tweets = tweet_entropies[:threshold]

	print "\n",

	final_tweets = [a for (a,b) in list_of_tweets]

	count = 1
	for tweet in final_tweets:
		percentage = 100*count/len(final_tweets)
		print '\rWriting: ' + str(count) + " of " + str(len(final_tweets)) + "     " + str(percentage) + "%",
		writer.write(tweet + "\n")
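The same entropy-based filtering can be packaged as a small helper so the cutoff is explicit; a sketch mirroring the loop above (the filter_by_entropy name and keep_fraction parameter are illustrative, and lm is any trained NgramModel):

def filter_by_entropy(sentences, lm, keep_fraction=0.8):
    # score each sentence and keep the fraction with the lowest model entropy
    scored = sorted((lm.entropy(s.split()), s) for s in sentences)
    cutoff = int(len(scored) * keep_fraction)
    return [s for (_, s) in scored[:cutoff]]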
Example #6
import nltk
from nltk.probability import LaplaceProbDist
from nltk.model import NgramModel

# tamil1f / tamil2f are assumed to be file objects opened earlier (not shown here)
tamil1_alpha = []
tamil1_alpha_all = []
tamil2_alpha = []
tamil2_alpha_all = []


def parse_line(line):
    # take the last comma-separated field, strip newline/bracket characters,
    # then split the |-delimited symbols and wrap them in start/end markers
    symbols = line.split(",")[-1].replace("\n", "").replace("[", "").replace("]", "").replace("\r", "").split("|")
    return ["St"] + symbols + ["En"]

for line in tamil1f.readlines()[1:]:
    parsed = parse_line(line)
    tamil1_alpha.append(parsed)
    tamil1_alpha_all += parsed

for line in tamil2f.readlines()[1:]:
    parsed = parse_line(line)
    tamil2_alpha.append(parsed)
    tamil2_alpha_all += parsed

        
s_bg1 = nltk.bigrams(tamil1_alpha_all)
s_bg2 = nltk.bigrams(tamil2_alpha_all)

fdist1 = nltk.FreqDist(s_bg1)
fdist2 = nltk.FreqDist(s_bg2)

estimator1 = lambda fdist, bins: LaplaceProbDist(fdist, len(tamil1_alpha_all)+1)
estimator2 = lambda fdist, bins: LaplaceProbDist(fdist, len(tamil2_alpha_all)+1)

model1 = NgramModel(3, tamil1_alpha_all, estimator=estimator1)
model2 = NgramModel(3, tamil2_alpha_all, estimator=estimator2)

print model1.entropy(tamil1_alpha[0])
print model1.perplexity(tamil1_alpha[0])
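Presumably the point of training model1 and model2 side by side is to compare them on the same sequence; a minimal sketch of such a comparison, continuing from the models above (the which_source helper is illustrative):

def which_source(symbols):
    # the model with the lower cross-entropy finds the sequence more predictable
    return 'tamil1' if model1.entropy(symbols) < model2.entropy(symbols) else 'tamil2'

print(which_source(tamil1_alpha[0]))
print(which_source(tamil2_alpha[0]))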