return self def transform(self, documents): def generator(): for document in documents: vec = self.tfidf[self.lexicon.doc2bow(document)] if self.tofull: yield sparse2full(vec) else: yield vec return list(generator()) if __name__ == '__main__': from reader import PickledCorpusReader corpus = PickledCorpusReader('../corpus') docs = [ list(corpus.docs(fileids=fileid))[0] for fileid in corpus.fileids() ] model = Pipeline([ ('norm', TextNormalizer()), ('vect', GensimTfidfVectorizer()), ('lda', ldamodel.LdaTransformer())]) model.fit_transform(docs) print(model.named_steps['norm'])
from reader import PickledCorpusReader

# Print, for every category in the corpus, how many documents and how many
# words it contains.
reader = PickledCorpusReader('../corpus')

for category in reader.categories():
    doc_ids = reader.fileids(categories=[category])
    n_docs = len(doc_ids)

    # Count words by walking the stream rather than materializing it.
    n_words = 0
    for _ in reader.words(categories=[category]):
        n_words += 1

    line = "- '{}' contains {:,} docs and {:,} words".format(
        category, n_docs, n_words)
    print(line)
from sklearn.model_selection import train_test_split as tts

from reader import PickledCorpusReader

# Load the pickled corpus and restrict it to the categories of interest.
reader = PickledCorpusReader('../corpus')
labels = ["books", "cinema", "cooking", "gaming", "sports", "tech"]
docs = reader.fileids(categories=labels)

# X holds the documents; y holds the first category listed for each file,
# in the same order as ``docs``.
X = list(reader.docs(fileids=docs))
y = [
    reader.categories(fileids=[fid])[0]
    for fid in docs
]
from reader import PickledCorpusReader

# Summarize the corpus: one line per category with its document and word
# totals.
reader = PickledCorpusReader('../corpus')

for category in reader.categories():
    n_docs = len(reader.fileids(categories=[category]))

    # Tally the words one at a time so the word stream is never held in memory.
    n_words = 0
    for _word in reader.words(categories=[category]):
        n_words += 1

    print(
        "- '{}' contains {:,} docs and {:,} words".format(category, n_docs, n_words)
    )
# Create the TF-IDF Model and compute the scores model = gensim.models.TfidfModel(vectors) scores = model[vectors] for doc in scores: yield [ (lexicon[vec], score) for vec, score in doc ] if __name__ == '__main__': import heapq from reader import PickledCorpusReader from collections import Counter corpus = PickledCorpusReader('../corpus') scores = scored_document_phrases([ list(corpus.sents(fileids=fileid)) for fileid in corpus.fileids(categories=["politics", "news"]) ], True) tfidfs = Counter() for phrases in scores: for phrase, score in phrases: tfidfs[phrase] += score print( tabulate(tfidfs.most_common(50), headers=["keyphrase", "cumulative tfidf"]) )
self.k, distance=cosine, avoid_empty_clusters=True) self.model.cluster([ self.vectorize( corpus.words(fileid) ) for fileid in corpus.fileids(categories=['news']) ]) def classify(self, document): """ Pass through to the internal model classify """ return self.model.classify(self.vectorize(document)) if __name__ == '__main__': corpus = PickledCorpusReader('../corpus') clusterer = KMeansTopics(corpus, k=7) clusterer.cluster(corpus) # Classify documents in the new corpus by cluster affinity groups = [ (clusterer.classify(corpus.words(fileid)), fileid) for fileid in corpus.fileids(categories=['news']) ] # Group documents in corpus by cluster and display them groups.sort(key=itemgetter(0)) for group, items in groupby(groups, key=itemgetter(0)): for cluster, fname in items: print("Cluster {}: {}".format(cluster+1,fname))
indices = train_idx if train else test_idx return [ fileid for doc_idx, fileid in enumerate(self.corpus.fileids()) if doc_idx in indices ] def documents(self, fold=None, train=False, test=False): for fileid in self.fileids(fold, train, test): yield list(self.corpus.docs(fileids=fileid)) def labels(self, fold=None, train=False, test=False): return [ self.corpus.categories(fileids=fileid)[0] for fileid in self.fileids(fold, train, test) ] if __name__ == '__main__': from reader import PickledCorpusReader corpus = PickledCorpusReader('corpus/tagcorpusoracle') for para in corpus.fileids(categories='281550031684823'): print(para) loader = CorpusLoader(corpus,12) for fid in loader.fileids(0, test=True): print(fid) print(loader.labels(0, test=True))
self.save() return self def transform(self, documents): def generator(): for document in documents: vec = self.tfidf[self.lexicon.doc2bow(document)] if self.tofull: yield sparse2full(vec) else: yield vec return list(generator()) if __name__ == '__main__': from reader import PickledCorpusReader corpus = PickledCorpusReader('../corpus') docs = [ list(corpus.docs(fileids=fileid))[0] for fileid in corpus.fileids() ] model = Pipeline([('norm', TextNormalizer()), ('vect', GensimTfidfVectorizer()), ('lda', ldamodel.LdaTransformer())]) model.fit_transform(docs) print(model.named_steps['norm'])
self.model = KMeansClusterer(self.k, distance=cosine, avoid_empty_clusters=True) self.model.cluster([ self.vectorize(corpus.words(fileid)) for fileid in corpus.fileids(categories=['news']) ]) def classify(self, document): """ Pass through to the internal model classify """ return self.model.classify(self.vectorize(document)) if __name__ == '__main__': corpus = PickledCorpusReader('../corpus') clusterer = KMeansTopics(corpus, k=7) clusterer.cluster(corpus) # Classify documents in the new corpus by cluster affinity groups = [(clusterer.classify(corpus.words(fileid)), fileid) for fileid in corpus.fileids(categories=['news'])] # Group documents in corpus by cluster and display them groups.sort(key=itemgetter(0)) for group, items in groupby(groups, key=itemgetter(0)): for cluster, fname in items: print("Cluster {}: {}".format(cluster + 1, fname))