""" trigram = tuple((context[0], context[1], word)) return self.model.prob(trigram) def samples(self): return self.model.samples() def prob(self, sample): return self.model.prob(sample) if __name__ == '__main__': corpus = PickledCorpusReader('../corpus') tokens = [''.join(word) for word in corpus.words()] vocab = Counter(tokens) sents = list([word for word in sent] for sent in corpus.sents()) counter = count_ngrams(3, vocab, sents) knm = KneserNeyModel(counter) def complete(input_text): tokenized = nltk.word_tokenize(input_text) if len(tokenized) < 2: response = "Say more." else: completions = {} for sample in knm.samples(): if (sample[0], sample[1]) == (tokenized[-2], tokenized[-1]): completions[sample[2]] = knm.prob(sample) if len(completions) == 0:
import gensim

from tabulate import tabulate


def scored_document_phrases(documents, segmented=True):
    # (Signature reconstructed from the call below; the extraction step
    # that builds `lexicon`, a gensim Dictionary of candidate phrases,
    # and `vectors`, its bag-of-words view of each document, is elided.)

    # Create the TF-IDF Model and compute the scores
    model = gensim.models.TfidfModel(vectors)
    scores = model[vectors]
    for doc in scores:
        yield [
            (lexicon[vec], score)
            for vec, score in doc
        ]


if __name__ == '__main__':
    from reader import PickledCorpusReader
    from collections import Counter

    corpus = PickledCorpusReader('../corpus')
    scores = scored_document_phrases([
        list(corpus.sents(fileids=fileid))
        for fileid in corpus.fileids(categories=["politics", "news"])
    ], True)

    # Sum each phrase's TF-IDF weight across all documents.
    tfidfs = Counter()
    for phrases in scores:
        for phrase, score in phrases:
            tfidfs[phrase] += score

    print(
        tabulate(
            tfidfs.most_common(50),
            headers=["keyphrase", "cumulative tfidf"]
        )
    )
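# A minimal sketch of what the elided extraction step might look like,
# assuming each document is a list of tagged sentences and a hypothetical
# extract_candidates() helper that yields candidate keyphrases for one
# document (neither assumption comes from the fragment above):
from gensim.corpora import Dictionary

def build_phrase_vectors(documents):
    # One list of candidate phrases per document.
    phrase_docs = [list(extract_candidates(doc)) for doc in documents]
    # Map each distinct phrase to an integer id...
    lexicon = Dictionary(phrase_docs)
    # ...and count phrase occurrences per document as (id, count) pairs,
    # the bag-of-words corpus that TfidfModel expects.
    vectors = [lexicon.doc2bow(doc) for doc in phrase_docs]
    return lexicon, vectors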
""" trigram = tuple((context[0], context[1], word)) return self.model.prob(trigram) def samples(self): return self.model.samples() def prob(self, sample): return self.model.prob(sample) if __name__ == '__main__': corpus = PickledCorpusReader('../corpus') tokens = [''.join(word) for word in corpus.words()] vocab = Counter(tokens) sents = list([word[0] for word in sent] for sent in corpus.sents()) counter = count_ngrams(3, vocab, sents) knm = KneserNeyModel(counter) def complete(input_text): tokenized = nltk.word_tokenize(input_text) if len(tokenized) < 2: response = "Say more." else: completions = {} for sample in knm.samples(): if (sample[0], sample[1]) == (tokenized[-2], tokenized[-1]): completions[sample[2]] = knm.prob(sample) if len(completions) == 0: