Esempio n. 1
0
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

# One summary line per category: how many documents and how many words.
summary = "- '{}' contains {:,} docs and {:,} words"
for category in reader.categories():
    selected_files = reader.fileids(categories=[category])
    n_docs = len(selected_files)

    # Tally the words by iterating, without building an intermediate list.
    selected_words = reader.words(categories=[category])
    n_words = sum(1 for _ in selected_words)

    print(summary.format(category, n_docs, n_words))
Esempio n. 2
0
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

for category in reader.categories():
    # Both lookups are restricted to the category currently being reported.
    n_docs = len(reader.fileids(categories=[category]))

    # Count the words one at a time as the reader yields them.
    n_words = 0
    for _word in reader.words(categories=[category]):
        n_words += 1

    line = "- '{}' contains {:,} docs and {:,} words".format(
        category, n_docs, n_words)
    print(line)
Esempio n. 3
0
            self.k, distance=cosine, avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(
                corpus.words(fileid)
            ) for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Vectorize the document with self.vectorize and return whatever the
        internal model's classify gives back for that vector
        """
        # Look up the model's classifier first, then feed it the vector
        model_classify = self.model.classify
        return model_classify(self.vectorize(document))

if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    # Build the clusterer with k=7 and run its clustering over the corpus
    clusterer = KMeansTopics(corpus, k=7)
    clusterer.cluster(corpus)

    # Pair every 'news' document with the cluster the clusterer assigns it
    groups = []
    for fileid in corpus.fileids(categories=['news']):
        assigned = clusterer.classify(corpus.words(fileid))
        groups.append((assigned, fileid))

    # Sort the pairs by cluster so groupby sees each cluster as one run,
    # then print every document; the printed number is the cluster plus one
    groups.sort(key=itemgetter(0))
    for _cluster_key, members in groupby(groups, key=itemgetter(0)):
        for cluster, fname in members:
            print("Cluster {}: {}".format(cluster + 1, fname))
Esempio n. 4
0
        """
        Use KneserNeyProbDist from NLTK to get score
        """
        trigram = tuple((context[0], context[1], word))
        return self.model.prob(trigram)

    def samples(self):
        """
        Return the result of calling samples() on the wrapped model
        """
        wrapped = self.model
        return wrapped.samples()

    def prob(self, sample):
        """
        Return the wrapped model's prob() value for the given sample
        """
        wrapped = self.model
        return wrapped.prob(sample)


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    # ''.join leaves a plain string unchanged and concatenates the elements
    # of any other sequence of strings, so every token ends up as one str
    tokens = [''.join(word) for word in corpus.words()]
    vocab = Counter(tokens)

    # Copy each sentence into a plain list, keeping its items as they are
    sents = [list(sent) for sent in corpus.sents()]

    # NOTE(review): presumably 3 selects trigram counts - confirm against
    # count_ngrams, which is defined outside this excerpt
    counter = count_ngrams(3, vocab, sents)
    knm = KneserNeyModel(counter)

    def complete(input_text):
        tokenized = nltk.word_tokenize(input_text)

        if len(tokenized) < 2:
            response = "Say more."
        else:
            completions = {}
            for sample in knm.samples():
                if (sample[0], sample[1]) == (tokenized[-2], tokenized[-1]):
Esempio n. 5
0
        """
        Use KneserNeyProbDist from NLTK to get score
        """
        trigram = tuple((context[0], context[1], word))
        return self.model.prob(trigram)

    def samples(self):
        """
        Pass through to the internal model samples
        """
        internal_model = self.model
        return internal_model.samples()

    def prob(self, sample):
        """
        Pass through to the internal model prob for the given sample
        """
        internal_model = self.model
        return internal_model.prob(sample)


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    # ''.join leaves a plain string unchanged and concatenates the elements
    # of any other sequence of strings, so every token ends up as one str
    tokens = [''.join(word) for word in corpus.words()]
    vocab = Counter(tokens)

    # Keep only the first element of every item in a sentence
    # NOTE(review): presumably the token of a (token, tag) pair - verify
    # against what corpus.sents() actually yields
    sents = [[word[0] for word in sent] for sent in corpus.sents()]

    # NOTE(review): presumably 3 selects trigram counts - confirm against
    # count_ngrams, which is defined outside this excerpt
    counter = count_ngrams(3, vocab, sents)
    knm = KneserNeyModel(counter)


    def complete(input_text):
        tokenized = nltk.word_tokenize(input_text)
        if len(tokenized) < 2:
            response = "Say more."
        else:
            completions = {}
            for sample in knm.samples():
                if (sample[0], sample[1]) == (tokenized[-2], tokenized[-1]):
Esempio n. 6
0
        self.model = KMeansClusterer(self.k,
                                     distance=cosine,
                                     avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(corpus.words(fileid))
            for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Turn the document into a vector via self.vectorize and hand it to
        the internal model's classify, returning that result unchanged
        """
        classify_vector = self.model.classify
        vector = self.vectorize(document)
        return classify_vector(vector)


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    clusterer = KMeansTopics(corpus, k=7)
    clusterer.cluster(corpus)

    # Label each 'news' file with the cluster returned by the clusterer
    news_fileids = corpus.fileids(categories=['news'])
    groups = [
        (clusterer.classify(corpus.words(fileid)), fileid)
        for fileid in news_fileids
    ]

    # groupby only merges adjacent equal keys, so sort by cluster first;
    # the number shown for each file is its cluster plus one
    groups.sort(key=itemgetter(0))
    by_cluster = groupby(groups, key=itemgetter(0))
    for _key, items in by_cluster:
        for cluster, fname in items:
            message = "Cluster {}: {}".format(cluster + 1, fname)
            print(message)