Example #1
import heapq
from operator import itemgetter

import networkx as nx

from reader import PickledCorpusReader

# NOTE: `pairs` is assumed to be defined elsewhere in this module.


def graph(docs):
    # Build an undirected co-occurrence graph: nodes are terms and edge
    # weights count how often two terms appear together in a document.
    G = nx.Graph()
    for doc in docs:
        for pair in pairs(doc):
            if (pair[0][0], pair[1][0]) in G.edges():
                # Bump the weight of an existing edge
                G.edges[(pair[0][0], pair[1][0])]['weight'] += 1
            else:
                # Otherwise create the edge with an initial weight of 1
                G.add_edge(pair[0][0], pair[1][0], weight=1)
    return G


def nbest_centrality(G, metric, n=10, attr="centrality", **kwargs):
    # Compute the centrality scores for each vertex
    scores = metric(G, **kwargs)

    # Set the score as a property on each node
    nx.set_node_attributes(G, name=attr, values=scores)

    # Find the top n scores and print them along with their index
    topn = heapq.nlargest(n, scores.items(), key=itemgetter(1))
    for idx, item in enumerate(topn):
        print("{}. {}: {:0.4f}".format(idx + 1, *item))

    return G


if __name__ == '__main__':
    corpus = PickledCorpusReader("../corpus")
    docs = corpus.docs()
    G = graph(docs)
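    # A minimal usage sketch (assumption, not from the original): rank nodes
    # of the co-occurrence graph with the nbest_centrality helper defined
    # above, using standard NetworkX centrality measures.
    print("Degree centrality:")
    G = nbest_centrality(G, nx.degree_centrality)

    print("Betweenness centrality:")
    G = nbest_centrality(G, nx.betweenness_centrality, n=10, attr="betweenness")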
Example #2
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    yield sparse2full(vec)
                else:
                    yield vec
        return list(generator())


if __name__ == '__main__':
    # Pipeline and the gensim LDA wrapper are used below; TextNormalizer and
    # GensimTfidfVectorizer are assumed to be defined earlier in this module.
    from sklearn.pipeline import Pipeline
    from gensim.sklearn_api import ldamodel

    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = [
        list(corpus.docs(fileids=fileid))[0]
        for fileid in corpus.fileids()
    ]

    model = Pipeline([
        ('norm', TextNormalizer()),
        ('vect', GensimTfidfVectorizer()),
        ('lda', ldamodel.LdaTransformer())])

    model.fit_transform(docs)

    print(model.named_steps['norm'])
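    # A follow-up sketch (assumption, not from the original): the gensim
    # scikit-learn wrappers expose the fitted gensim model as `gensim_model`,
    # so the learned topics can be inspected after fitting the pipeline.
    lda = model.named_steps['lda'].gensim_model
    print(lda.print_topics(5))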
Example #3
        self.model = Pipeline([('norm', TextNormalizer()),
                               ('vect', GensimTfidfVectorizer()),
                               ('model', self.estimator)])

    def fit(self, documents):
        self.model.fit(documents)

        return self.model


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    # With Sklearn
    skmodel = SklearnTopicModels(estimator='NMF')
    documents = corpus.docs()

    skmodel.fit_transform(documents)
    topics = skmodel.get_topics()
    for topic, terms in topics.items():
        print("Topic #{}:".format(topic + 1))
        print(terms)

    # # With Gensim
    # gmodel = GensimTopicModels(estimator='LSA')
    #
    # docs = [
    #     list(corpus.docs(fileids=fileid))[0]
    #     for fileid in corpus.fileids()
    # ]
    #
Example #4
from sklearn.model_selection import train_test_split as tts
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

labels = ["books", "cinema", "cooking", "gaming", "sports", "tech"]
docs = reader.fileids(categories=labels)
# X holds the tokenized documents; y holds the first (primary) category label
# recorded for each document's fileid.
X = list(reader.docs(fileids=docs))
y = [reader.categories(fileids=[fileid])[0] for fileid in docs]
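# A hedged continuation sketch: the `tts` alias imported above is presumably
# used to hold out a test split; the 0.2 test size here is an assumption, not
# taken from the original.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)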
Example #5
                trees = ne_chunk(sentence)
                for tree in trees:
                    if hasattr(tree, 'label'):
                        if tree.label() in self.labels:
                            entities.append(
                                ' '.join([child[0].lower() for child in tree])
                                )
        return entities

    def fit(self, documents, labels=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield self.get_entities(document)


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = corpus.docs()

    phrase_extractor = KeyphraseExtractor()
    keyphrases = list(phrase_extractor.fit_transform(docs))
    print(keyphrases[0])

    entity_extractor = EntityExtractor()
    entities = list(entity_extractor.fit_transform(docs))
    print(entities[0])
Example #6
        self.save()
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    yield sparse2full(vec)
                else:
                    yield vec

        return list(generator())


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = [
        list(corpus.docs(fileids=fileid))[0] for fileid in corpus.fileids()
    ]

    model = Pipeline([('norm', TextNormalizer()),
                      ('vect', GensimTfidfVectorizer()),
                      ('lda', ldamodel.LdaTransformer())])

    model.fit_transform(docs)

    print(model.named_steps['norm'])
Example #7
from sklearn.feature_extraction.text import TfidfVectorizer

from reader import PickledCorpusReader
from normalizer import TextNormalizer


def identity(words):
    # The pickled corpus is already tokenized, so the vectorizer's tokenizer
    # can simply pass the tokens through unchanged.
    return words


corpus = PickledCorpusReader('../../corpora/Pickled_Corpus_Sample')

normalizer = TextNormalizer()
docs = normalizer.fit_transform(corpus.docs())

vectorizer = TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)

vectors = vectorizer.fit_transform(docs)

print(vectors.shape)
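# A minimal follow-up sketch (assumption: the corpus holds at least two
# documents): compare two documents by the cosine similarity of their TF-IDF
# rows, using scikit-learn's pairwise utility.
from sklearn.metrics.pairwise import cosine_similarity

print(cosine_similarity(vectors[0], vectors[1]))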
Example #8
    def lemmatize(self, token, pos_tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(pos_tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

    def fit(self, X, y=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield self.normalize(document[0])


if __name__ == '__main__':
    from reader import PickledCorpusReader

    reader = PickledCorpusReader('../../corpora/Pickled_Corpus_Sample')

    normalizer = TextNormalizer()
    docs = normalizer.fit_transform(reader.docs())

    for i in range(2):
        print('\nDOC {}:'.format(i + 1))
        print(next(docs))
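    # A standalone sketch of the lemmatize helper above (assumption: the class
    # wraps NLTK's WordNetLemmatizer): Penn Treebank tags are mapped to WordNet
    # POS constants, so a 'VBG' tag lemmatizes as a verb.
    print(normalizer.lemmatize('running', 'VBG'))  # expected: 'run'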
Example #9
            ('vect', GensimTfidfVectorizer()),
            ('model', self.estimator)
        ])

    def fit(self, documents):
        self.model.fit(documents)

        return self.model


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    # With Sklearn
    skmodel = SklearnTopicModels(estimator='NMF')
    documents = corpus.docs()

    skmodel.fit_transform(documents)
    topics = skmodel.get_topics()
    for topic, terms in topics.items():
        print("Topic #{}:".format(topic+1))
        print(terms)

    # # With Gensim
    # gmodel = GensimTopicModels(estimator='LSA')
    #
    # docs = [
    #     list(corpus.docs(fileids=fileid))[0]
    #     for fileid in corpus.fileids()
    # ]
    #