class NGramSentences:
    """Laplace-smoothed n-gram language model over the sentences of a text
    file, with a helper to generate detokenized sentences from it."""

    def __init__(self, n=3, filename='cache/book.txt'):
        """Train an order-``n`` Laplace model on ``filename``.

        n        -- n-gram order for both the padding pipeline and the model.
        filename -- path to the plain-text training corpus.
        """
        with open(filename) as file:
            text = file.read()
        # One lower-cased token list per sentence.
        tokens = [
            list(map(str.lower, word_tokenize(sent)))
            for sent in sent_tokenize(text)
        ]
        # BUG FIX: the pipeline order was hard-coded to 3, silently ignoring
        # the ``n`` parameter; pass ``n`` so padding/everygram order matches
        # the Laplace model order below.
        train, vocab = padded_everygram_pipeline(n, tokens)
        self.model = Laplace(n)
        self.model.fit(train, vocab)

    def generate(self, prev_word='<s>', max_words=25):
        """Generate at most ``max_words`` tokens seeded with ``prev_word``,
        drop leading '<s>' pads, stop at the first '</s>', and return the
        detokenized string."""
        raw = (word for word in
               self.model.generate(max_words, text_seed=[prev_word]))
        return detokenize(
            list(
                itertools.takewhile(
                    lambda word: word != '</s>',        # stop at end-of-sentence
                    itertools.dropwhile(
                        lambda word: word == '<s>',     # skip start padding
                        raw))))
# NOTE(review): this chunk references names defined earlier in the file and
# not visible here (`y`, `text`, `trigrams`, `flatten`, `pad_both_ends`,
# `Laplace`) — presumably NLTK imports plus tokenized sentence data; confirm
# against the preceding chunk.

# NOTE(review): despite the name, `bigramsList` holds TRIgrams of each
# sequence in `y`.
bigramsList = list(map(lambda x: list(trigrams(x)), y))
bigramsList = list(flatten(bigramsList)) #list(everygrams(bigramsList, max_len=2))

# Vocabulary: every token of every sentence, padded with '<s>'/'</s>' markers.
vocab = list(flatten(pad_both_ends(sent, n=2) for sent in text))
from nltk.lm import Vocabulary
# unk_cutoff=1 keeps every observed token in the vocabulary (no <UNK> mapping).
vocab = list(Vocabulary(vocab, unk_cutoff=1))
'''
from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(2, text)
'''
# Train an add-one (Laplace) smoothed trigram model on the flattened trigrams.
lm = Laplace(3)
lm.fit([bigramsList], vocabulary_text=list(vocab))
# Smoke-test generation seeded with a bigram context.
lm.generate(4, text_seed=["government", "had"])


def generateSentences(v):
    """Extend the seed token list ``v`` one token at a time, conditioning on
    the last two tokens, until the model emits the '</s>' end marker.

    Returns the full token list including the seed and the trailing '</s>'.
    NOTE(review): there is no upper bound on length — if the model never
    produces '</s>' this loops indefinitely.
    """
    sent = v
    # First step conditions on the seed as given.
    v = [lm.generate(1, text_seed=v)]
    sent = sent + v
    while v[0] != '</s>':
        l = len(sent)
        # Condition each subsequent token on the last two generated tokens.
        v = [lm.generate(1, text_seed=[sent[l - 2], sent[l - 1]])]
        sent = sent + v
    return sent


sen = generateSentences(['<s>', 'india'])
sen = " ".join(sen)