def graph(docs):
    """Build an undirected term co-occurrence graph from a corpus.

    Nodes are the first element of each tagged token (the token text);
    each co-occurring pair produced by ``pairs(doc)`` increments the
    ``weight`` attribute of the edge between the two terms.

    Parameters
    ----------
    docs : iterable of documents accepted by ``pairs()``

    Returns
    -------
    nx.Graph with integer ``weight`` edge attributes.
    """
    G = nx.Graph()
    for doc in docs:
        for pair in pairs(doc):
            # Name the endpoints once instead of rebuilding the tuple
            # in every expression below.
            u, v = pair[0][0], pair[1][0]
            # has_edge is the documented O(1) lookup; testing membership
            # in G.edges() scans (and on networkx 1.x, materializes) the
            # whole edge list per pair, which is quadratic overall.
            if G.has_edge(u, v):
                G.edges[u, v]['weight'] += 1
            else:
                G.add_edge(u, v, weight=1)
    return G


def nbest_centrality(G, metric, n=10, attr="centrality", **kwargs):
    """Score nodes with ``metric``, annotate the graph, print the top n.

    Parameters
    ----------
    G : nx.Graph
    metric : callable taking (G, **kwargs) and returning {node: score}
    n : number of top-scoring nodes to print
    attr : node attribute name under which scores are stored
    **kwargs : forwarded to ``metric``

    Returns
    -------
    The same graph, with scores set as node attributes.
    """
    # Compute the centrality scores for each vertex
    scores = metric(G, **kwargs)

    # Set the score as a property on each node
    nx.set_node_attributes(G, name=attr, values=scores)

    # Find the top n scores and print them along with their rank
    topn = heapq.nlargest(n, scores.items(), key=itemgetter(1))
    for idx, item in enumerate(topn):
        print("{}. {}: {:0.4f}".format(idx + 1, *item))

    return G


if __name__ == '__main__':
    corpus = PickledCorpusReader("../corpus")
    docs = corpus.docs()
    G = graph(docs)
return self def transform(self, documents): def generator(): for document in documents: vec = self.tfidf[self.lexicon.doc2bow(document)] if self.tofull: yield sparse2full(vec) else: yield vec return list(generator()) if __name__ == '__main__': from reader import PickledCorpusReader corpus = PickledCorpusReader('../corpus') docs = [ list(corpus.docs(fileids=fileid))[0] for fileid in corpus.fileids() ] model = Pipeline([ ('norm', TextNormalizer()), ('vect', GensimTfidfVectorizer()), ('lda', ldamodel.LdaTransformer())]) model.fit_transform(docs) print(model.named_steps['norm'])
# NOTE(review): class-interior fragment -- the `self.model = ...` assignment is
# the tail of a method (likely __init__) on a topic-model wrapper class, and
# fit() below is a method of the same class.
        self.model = Pipeline([
            ('norm', TextNormalizer()),
            ('vect', GensimTfidfVectorizer()),
            ('model', self.estimator)
        ])

    def fit(self, documents):
        """Fit the normalize -> vectorize -> model pipeline on documents.

        Note: returns the fitted Pipeline (self.model), NOT self -- callers
        receive the sklearn Pipeline object, which departs from the usual
        estimator convention of returning self.
        """
        self.model.fit(documents)
        return self.model


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    # With Sklearn
    skmodel = SklearnTopicModels(estimator='NMF')
    documents = corpus.docs()
    skmodel.fit_transform(documents)
    topics = skmodel.get_topics()
    for topic, terms in topics.items():
        print("Topic #{}:".format(topic + 1))
        print(terms)

    # # With Gensim
    # gmodel = GensimTopicModels(estimator='LSA')
    #
    # docs = [
    #     list(corpus.docs(fileids=fileid))[0]
    #     for fileid in corpus.fileids()
    # ]
    #
from sklearn.model_selection import train_test_split as tts
from reader import PickledCorpusReader

# Load the pickled corpus and restrict it to the six target categories.
reader = PickledCorpusReader('../corpus')

labels = ["books", "cinema", "cooking", "gaming", "sports", "tech"]
docs = reader.fileids(categories=labels)

# X holds one tokenized document per fileid; y holds that fileid's
# first (primary) category label, aligned index-for-index with X.
X = list(reader.docs(fileids=docs))
y = []
for fileid in docs:
    y.append(reader.categories(fileids=[fileid])[0])
# NOTE(review): class-interior fragment -- `trees = ne_chunk(sentence)`
# continues the body of a get_entities() method (its opening lines, where
# `sentence` and `entities` are defined, are outside this view); fit() and
# transform() below are methods of the same class, instantiated in __main__
# as EntityExtractor.
        trees = ne_chunk(sentence)
        for tree in trees:
            # Only NE chunks are Tree objects with a label; plain (word, tag)
            # tuples have no label() and are skipped.
            if hasattr(tree, 'label'):
                if tree.label() in self.labels:
                    # Join the chunk's leaf tokens into one lowercase phrase.
                    entities.append(
                        ' '.join([child[0].lower() for child in tree])
                    )
        return entities

    def fit(self, documents, labels=None):
        """No-op fit (stateless transformer); returns self for chaining."""
        return self

    def transform(self, documents):
        """Lazily yield the list of entity phrases for each document."""
        for document in documents:
            yield self.get_entities(document)


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = corpus.docs()

    phrase_extractor = KeyphraseExtractor()
    keyphrases = list(phrase_extractor.fit_transform(docs))
    print(keyphrases[0])

    entity_extractor = EntityExtractor()
    entities = list(entity_extractor.fit_transform(docs))
    print(entities[0])
self.save() return self def transform(self, documents): def generator(): for document in documents: vec = self.tfidf[self.lexicon.doc2bow(document)] if self.tofull: yield sparse2full(vec) else: yield vec return list(generator()) if __name__ == '__main__': from reader import PickledCorpusReader corpus = PickledCorpusReader('../corpus') docs = [ list(corpus.docs(fileids=fileid))[0] for fileid in corpus.fileids() ] model = Pipeline([('norm', TextNormalizer()), ('vect', GensimTfidfVectorizer()), ('lda', ldamodel.LdaTransformer())]) model.fit_transform(docs) print(model.named_steps['norm'])
from sklearn.feature_extraction.text import TfidfVectorizer

from reader import PickledCorpusReader
from normalizer import TextNormalizer


def identity(words):
    """Pass tokens through unchanged.

    Used as the vectorizer's tokenizer because the corpus is already
    tokenized; sklearn requires a callable, so this stands in for a no-op.
    """
    return words


corpus = PickledCorpusReader('../../corpora/Pickled_Corpus_Sample')

# Normalize the pre-tokenized documents before weighting them.
normalizer = TextNormalizer()
docs = normalizer.fit_transform(corpus.docs())

# Tokenization, preprocessing, and lowercasing are all disabled because
# the input is already a stream of normalized token lists.
vectorizer = TfidfVectorizer(
    tokenizer=identity,
    preprocessor=None,
    lowercase=False,
)
vectors = vectorizer.fit_transform(docs)

print(vectors.shape)
def lemmatize(self, token, pos_tag): tag = { 'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ }.get(pos_tag[0], wn.NOUN) return self.lemmatizer.lemmatize(token, tag) def fit(self, X, y=None): return self def transform(self, documents): for document in documents: yield self.normalize(document[0]) if __name__ == '__main__': from reader import PickledCorpusReader reader = PickledCorpusReader('../../corpora/Pickled_Corpus_Sample') normalizer = TextNormalizer() docs = normalizer.fit_transform(reader.docs()) for i in range(2): print('\nDOC {}:'.format(i + 1)) print(next(docs))
# NOTE(review): fragment -- the opening of the Pipeline([...]) literal (and
# its first step) sits above this view; fit() below is a method on the same
# topic-model wrapper class.
            ('vect', GensimTfidfVectorizer()),
            ('model', self.estimator)
        ])

    def fit(self, documents):
        """Fit the wrapped pipeline on documents.

        Note: returns the fitted Pipeline (self.model), NOT self -- callers
        receive the sklearn Pipeline object rather than this wrapper.
        """
        self.model.fit(documents)
        return self.model


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    # With Sklearn
    skmodel = SklearnTopicModels(estimator='NMF')
    documents = corpus.docs()
    skmodel.fit_transform(documents)
    topics = skmodel.get_topics()
    for topic, terms in topics.items():
        print("Topic #{}:".format(topic+1))
        print(terms)

    # # With Gensim
    # gmodel = GensimTopicModels(estimator='LSA')
    #
    # docs = [
    #     list(corpus.docs(fileids=fileid))[0]
    #     for fileid in corpus.fileids()
    # ]
    #