Example #1
        """
        trigram = tuple((context[0], context[1], word))
        return self.model.prob(trigram)

    def samples(self):
        return self.model.samples()

    def prob(self, sample):
        return self.model.prob(sample)


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')
    tokens = [''.join(word) for word in corpus.words()]
    vocab = Counter(tokens)
    sents = list([word for word in sent] for sent in corpus.sents())

    counter = count_ngrams(3, vocab, sents)
    knm = KneserNeyModel(counter)

    def complete(input_text):
        tokenized = nltk.word_tokenize(input_text)

        if len(tokenized) < 2:
            response = "Say more."
        else:
            completions = {}
            for sample in knm.samples():
                if (sample[0], sample[1]) == (tokenized[-2], tokenized[-1]):
                    completions[sample[2]] = knm.prob(sample)
            if len(completions) == 0:
                # The snippet is cut off here in the source; a minimal
                # plausible continuation is sketched below.
                response = "Can we talk about something else?"
            else:
                # Suggest the most probable completion of the last bigram.
                best = max(completions, key=completions.get)
                response = best
        return response
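The wrapper's `score`, `samples`, and `prob` methods presumably delegate to NLTK's `KneserNeyProbDist`. A minimal self-contained sketch of that underlying API (the toy sentence is illustrative, not from the example):

from nltk import FreqDist
from nltk.probability import KneserNeyProbDist
from nltk.util import ngrams

# Count trigrams over a toy token stream, then smooth with Kneser-Ney.
tokens = "the cat sat on the mat . the cat ate the rat .".split()
trigram_counts = FreqDist(ngrams(tokens, 3))
kn = KneserNeyProbDist(trigram_counts)

# samples() lists the observed trigrams; prob() gives their smoothed
# probability, mirroring the wrapper methods above.
for trigram in sorted(kn.samples())[:5]:
    print(trigram, round(kn.prob(trigram), 4))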
Example #2
    # NOTE: the snippet begins mid-function; it assumes `import gensim` at the
    # top of the file and that `lexicon` (a gensim corpora.Dictionary) and
    # `vectors` (its doc2bow output) were built just above this point.
    # Create the TF-IDF model and compute a score for every term in each doc.
    model = gensim.models.TfidfModel(vectors)
    scores = model[vectors]

    # Translate each (term id, tfidf score) pair back to the term string.
    for doc in scores:
        yield [(lexicon[vec], score) for vec, score in doc]


if __name__ == '__main__':

    from collections import Counter

    from reader import PickledCorpusReader
    from tabulate import tabulate

    corpus = PickledCorpusReader('../corpus')
    scores = scored_document_phrases([
        list(corpus.sents(fileids=fileid))
        for fileid in corpus.fileids(categories=["politics", "news"])
    ], True)
    tfidfs = Counter()

    for phrases in scores:
        for phrase, score in phrases:
            tfidfs[phrase] += score

    print(
        tabulate(tfidfs.most_common(50), headers=["keyphrase", "cumulative tfidf"])
    )
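For context, a self-contained sketch of the gensim pipeline this fragment relies on (the toy documents are illustrative, not from the example):

import gensim

# Toy tokenized documents standing in for the corpus sentences.
docs = [
    ["human", "machine", "interface", "survey"],
    ["graph", "minors", "survey", "graph"],
    ["human", "graph", "interface"],
]

# Map tokens to integer ids, then to sparse bag-of-words vectors.
lexicon = gensim.corpora.Dictionary(docs)
vectors = [lexicon.doc2bow(doc) for doc in docs]

# Weight each (term id, count) pair by TF-IDF, as in the fragment above.
model = gensim.models.TfidfModel(vectors)
for doc in model[vectors]:
    print([(lexicon[term_id], round(score, 3)) for term_id, score in doc])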
Example #3
        """
        trigram = tuple((context[0], context[1], word))
        return self.model.prob(trigram)

    def samples(self):
        return self.model.samples()

    def prob(self, sample):
        return self.model.prob(sample)


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')
    # corpus.words() appears to yield (token, tag) pairs (see word[0] below),
    # so take just the token; joining the pair would mix tags into the vocab.
    tokens = [word[0] for word in corpus.words()]
    vocab = Counter(tokens)
    sents = [[word[0] for word in sent] for sent in corpus.sents()]

    counter = count_ngrams(3, vocab, sents)
    knm = KneserNeyModel(counter)

    def complete(input_text):
        tokenized = nltk.word_tokenize(input_text)
        if len(tokenized) < 2:
            response = "Say more."
        else:
            completions = {}
            for sample in knm.samples():
                if (sample[0], sample[1]) == (tokenized[-2], tokenized[-1]):
                    completions[sample[2]] = knm.prob(sample)
            if len(completions) == 0:
                # Cut off here in the source; a plausible continuation,
                # as sketched in Example #1.
                response = "Can we talk about something else?"
            else:
                best = max(completions, key=completions.get)
                response = best
        return response
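Neither fragment shows `complete` being invoked; a hypothetical driver loop (the prompt and exit convention are illustrative) could look like:

# Hypothetical interactive loop over the complete() helper defined above.
while True:
    text = input("> ").strip()
    if not text:  # an empty line ends the session
        break
    print(complete(text))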