Example #1
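The excerpt below picks up at the very end of a gensim-backed TF-IDF transformer's fit() and continues through transform(). For orientation, here is a minimal sketch of how the surrounding class might look; the constructor, the save() helper, and the Dictionary/TfidfModel setup are assumptions inferred from how the excerpt uses self.lexicon, self.tfidf, and self.tofull, not the original source.

import os

from gensim.corpora import Dictionary
from gensim.matutils import sparse2full
from gensim.models import TfidfModel
from sklearn.base import BaseEstimator, TransformerMixin


class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, dirpath='.', tofull=False):
        self._lexicon_path = os.path.join(dirpath, 'corpus.dict')
        self._tfidf_path = os.path.join(dirpath, 'tfidf.model')
        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

    def save(self):
        # Persist the lexicon and TF-IDF model alongside the corpus
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        # Learn the vocabulary, then train TF-IDF over the bag-of-words corpus
        self.lexicon = Dictionary(documents)
        self.tfidf = TfidfModel(
            [self.lexicon.doc2bow(doc) for doc in documents],
            id2word=self.lexicon)
        # ... the excerpt continues from here: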
        self.save()
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                # Score each document's bag-of-words against the trained model
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    # Densify the sparse gensim vector for downstream estimators
                    yield sparse2full(vec, len(self.lexicon))
                else:
                    yield vec
        return list(generator())


if __name__ == '__main__':
    from sklearn.pipeline import Pipeline
    from gensim.sklearn_api import ldamodel  # scikit-learn wrappers (gensim < 4.0)
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = [
        list(corpus.docs(fileids=fileid))[0]
        for fileid in corpus.fileids()
    ]

    model = Pipeline([
        ('norm', TextNormalizer()),
        ('vect', GensimTfidfVectorizer()),
        ('lda', ldamodel.LdaTransformer())])

    model.fit_transform(docs)

    print(model.named_steps['norm'])
Example #2
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

for category in reader.categories():

    # Count the documents and tokens filed under this category
    n_docs = len(reader.fileids(categories=[category]))
    n_words = sum(1 for word in reader.words(categories=[category]))

    print("- '{}' contains {:,} docs and {:,} words".format(
        category, n_docs, n_words))
Example #3
from sklearn.model_selection import train_test_split as tts
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

labels = ["books", "cinema", "cooking", "gaming", "sports", "tech"]
docs = reader.fileids(categories=labels)
X = list(reader.docs(fileids=docs))
y = [reader.categories(fileids=[fileid])[0] for fileid in docs]

# Hold out a test split; the 80/20 ratio here is an illustrative choice
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
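With the data split, a downstream classifier can be trained and scored on the held-out set. The following is a rough sketch only: it assumes the TextNormalizer and GensimTfidfVectorizer transformers from Example #1 are in scope (with tofull=True so the estimator receives dense vectors), and swaps in scikit-learn's LogisticRegression as an arbitrary stand-in classifier.

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('norm', TextNormalizer()),                    # assumed: as in Example #1
    ('vect', GensimTfidfVectorizer(tofull=True)),  # dense output for the estimator
    ('clf', LogisticRegression()),
])

model.fit(X_train, y_train)
print("held-out accuracy: {:0.3f}".format(model.score(X_test, y_test)))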
Example #4
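This excerpt is the tail of a scored_document_phrases function: the lexicon (a gensim Dictionary over detected key phrases) and vectors (the matching bag-of-words corpus) it uses are built earlier in the function. A rough sketch of that assumed setup, with the phrase detection itself elided:

import gensim
from gensim.corpora import Dictionary

def scored_document_phrases(documents, segmented=True):
    # Phrase detection (elided) reduces each document's sentences to a
    # flat list of key-phrase strings, assumed here as `phrases_by_doc`.
    lexicon = Dictionary(phrases_by_doc)
    vectors = [lexicon.doc2bow(phrases) for phrases in phrases_by_doc]
    # ... the excerpt continues from here: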
    # Create the TF-IDF model and compute a score for every phrase
    model = gensim.models.TfidfModel(vectors)
    scores = model[vectors]

    for doc in scores:
        # Map each phrase id back through the lexicon to its string
        yield [
            (lexicon[vec], score) for vec, score in doc
        ]


if __name__ == '__main__':

    from collections import Counter

    from tabulate import tabulate
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    scores = scored_document_phrases([
        list(corpus.sents(fileids=fileid))
        for fileid in corpus.fileids(categories=["politics", "news"])
    ], True)
    tfidfs = Counter()

    # Accumulate each phrase's TF-IDF score across all documents
    for phrases in scores:
        for phrase, score in phrases:
            tfidfs[phrase] += score

    print(
        tabulate(tfidfs.most_common(50), headers=["keyphrase", "cumulative tfidf"])
    )
Example #5
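This excerpt sits inside a KMeansTopics class built on NLTK's k-means clusterer; the module-level imports and the vectorize() helper are elided. A minimal sketch of the assumed surroundings follows: the import names match NLTK's API, but the frequency-vector vectorize() is an illustrative guess, not the original code.

from collections import defaultdict

import numpy as np
from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance as cosine


class KMeansTopics(object):

    def __init__(self, corpus, k=10):
        self.k = k
        self.model = None
        # Fixed vocabulary so every document vector has the same shape
        self.vocab = list(set(corpus.words()))

    def vectorize(self, words):
        # Bag-of-words frequency vector over the fixed vocabulary
        freqs = defaultdict(int)
        for word in words:
            freqs[word] += 1
        return np.array([freqs[w] for w in self.vocab], dtype=float)

    def cluster(self, corpus):
        # ... the excerpt continues from here: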
        self.model = KMeansClusterer(
            self.k, distance=cosine, avoid_empty_clusters=True)
        # Fit k-means on a vector for every document in the news category
        self.model.cluster([
            self.vectorize(
                corpus.words(fileid)
            ) for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify
        """
        return self.model.classify(self.vectorize(document))

if __name__ == '__main__':
    from operator import itemgetter
    from itertools import groupby

    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')

    clusterer = KMeansTopics(corpus, k=7)
    clusterer.cluster(corpus)

    # Classify documents in the new corpus by cluster affinity
    groups = [
        (clusterer.classify(corpus.words(fileid)), fileid)
        for fileid in corpus.fileids(categories=['news'])
    ]

    # Group documents in corpus by cluster and display them
    groups.sort(key=itemgetter(0))
    for group, items in groupby(groups, key=itemgetter(0)):
        for cluster, fname in items:
            print("Cluster {}: {}".format(cluster + 1, fname))
Example #6
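This excerpt is the back half of a CorpusLoader that serves train/test splits per cross-validation fold; the constructor and the start of fileids(), which produce train_idx and test_idx, are elided. A minimal sketch of the assumed surroundings, here using scikit-learn's KFold as the splitter:

from sklearn.model_selection import KFold


class CorpusLoader(object):

    def __init__(self, corpus, folds=12):
        self.corpus = corpus
        # Precompute (train_idx, test_idx) pairs over the corpus fileids
        self.folds = list(KFold(n_splits=folds).split(corpus.fileids()))

    def fileids(self, fold=None, train=False, test=False):
        if fold is None:
            return self.corpus.fileids()
        train_idx, test_idx = self.folds[fold]
        # ... the excerpt continues from here: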
        indices = train_idx if train else test_idx
        return [
            # Map the fold's numeric indices back to corpus fileids
            fileid for doc_idx, fileid in enumerate(self.corpus.fileids())
            if doc_idx in indices
        ]

    def documents(self, fold=None, train=False, test=False):
        for fileid in self.fileids(fold, train, test):
            yield list(self.corpus.docs(fileids=fileid))

    def labels(self, fold=None, train=False, test=False):
        return [
            self.corpus.categories(fileids=fileid)[0]
            for fileid in self.fileids(fold, train, test)
        ]


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('corpus/tagcorpusoracle')
    for fileid in corpus.fileids(categories='281550031684823'):
        print(fileid)

    # Twelve-fold cross-validation splits over the corpus
    loader = CorpusLoader(corpus, 12)

    for fid in loader.fileids(0, test=True):
        print(fid)

    print(loader.labels(0, test=True))