from reader import PickledCorpusReader

# Summarize the pickled corpus: per-category document and token counts.
corpus_reader = PickledCorpusReader('../corpus')

for cat in corpus_reader.categories():
    # fileids() scopes both lookups to the current category.
    doc_ids = corpus_reader.fileids(categories=[cat])
    # Count tokens lazily rather than materializing the word list.
    token_count = sum(1 for _ in corpus_reader.words(categories=[cat]))
    print("- '{}' contains {:,} docs and {:,} words".format(
        cat, len(doc_ids), token_count))
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')


def _count_words(cat):
    # Count tokens in every document of *cat* without building a list.
    return sum(1 for _ in reader.words(categories=[cat]))


# Report how many documents and tokens each category holds.
for category in reader.categories():
    n_docs = len(reader.fileids(categories=[category]))
    print("- '{}' contains {:,} docs and {:,} words".format(
        category, n_docs, _count_words(category)))
        # NOTE(review): fragment — the enclosing method's `def` line and the
        # opening of the KMeansClusterer(...) call are outside this view;
        # these are its trailing arguments.
        self.k, distance=cosine, avoid_empty_clusters=True)
        # Fit the clusterer on one vector per 'news' document.
        self.model.cluster([
            self.vectorize(
                corpus.words(fileid)
            ) for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify
        """
        return self.model.classify(self.vectorize(document))


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')
    clusterer = KMeansTopics(corpus, k=7)
    clusterer.cluster(corpus)

    # Classify documents in the new corpus by cluster affinity
    groups = [
        (clusterer.classify(corpus.words(fileid)), fileid)
        for fileid in corpus.fileids(categories=['news'])
    ]

    # Group documents in corpus by cluster and display them.
    # groupby only groups consecutive items, so the sort on the same
    # key is required first.
    groups.sort(key=itemgetter(0))
    for group, items in groupby(groups, key=itemgetter(0)):
        for cluster, fname in items:
            # cluster ids are 0-based; display them 1-based.
            print("Cluster {}: {}".format(cluster+1,fname))
""" Use KneserNeyProbDist from NLTK to get score """ trigram = tuple((context[0], context[1], word)) return self.model.prob(trigram) def samples(self): return self.model.samples() def prob(self, sample): return self.model.prob(sample) if __name__ == '__main__': corpus = PickledCorpusReader('../corpus') tokens = [''.join(word) for word in corpus.words()] vocab = Counter(tokens) sents = list([word for word in sent] for sent in corpus.sents()) counter = count_ngrams(3, vocab, sents) knm = KneserNeyModel(counter) def complete(input_text): tokenized = nltk.word_tokenize(input_text) if len(tokenized) < 2: response = "Say more." else: completions = {} for sample in knm.samples(): if (sample[0], sample[1]) == (tokenized[-2], tokenized[-1]):
""" Use KneserNeyProbDist from NLTK to get score """ trigram = tuple((context[0], context[1], word)) return self.model.prob(trigram) def samples(self): return self.model.samples() def prob(self, sample): return self.model.prob(sample) if __name__ == '__main__': corpus = PickledCorpusReader('../corpus') tokens = [''.join(word) for word in corpus.words()] vocab = Counter(tokens) sents = list([word[0] for word in sent] for sent in corpus.sents()) counter = count_ngrams(3, vocab, sents) knm = KneserNeyModel(counter) def complete(input_text): tokenized = nltk.word_tokenize(input_text) if len(tokenized) < 2: response = "Say more." else: completions = {} for sample in knm.samples(): if (sample[0], sample[1]) == (tokenized[-2], tokenized[-1]):
        # NOTE(review): fragment — the enclosing method's `def` line is
        # outside this view.
        self.model = KMeansClusterer(self.k, distance=cosine,
                                     avoid_empty_clusters=True)
        # Fit the clusterer on one vector per 'news' document.
        self.model.cluster([
            self.vectorize(corpus.words(fileid))
            for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify
        """
        return self.model.classify(self.vectorize(document))


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')
    clusterer = KMeansTopics(corpus, k=7)
    clusterer.cluster(corpus)

    # Classify documents in the new corpus by cluster affinity
    groups = [(clusterer.classify(corpus.words(fileid)), fileid)
              for fileid in corpus.fileids(categories=['news'])]

    # Group documents in corpus by cluster and display them.
    # groupby only groups consecutive items, so the sort on the same
    # key is required first.
    groups.sort(key=itemgetter(0))
    for group, items in groupby(groups, key=itemgetter(0)):
        for cluster, fname in items:
            # cluster ids are 0-based; display them 1-based.
            print("Cluster {}: {}".format(cluster + 1, fname))