Example 1
    def train(self):
        # Maybe some stemming later?
        # Document frequency of unigrams and bigrams across all documents.
        docCount = {1: utility.Counter(), 2: utility.Counter()}

        for text in self.documents.itervalues():
            # Deduplicate with set() so each document counts an n-gram at most once.
            docCount[1].update(set(utility.ngramFinder(text, 1)))
            docCount[2].update(set(utility.ngramFinder(text, 2)))
        print "Got doc count"

        categoryCount = utility.Counter()
        categoryWords = defaultdict(list)
        # Sample every 100th question; pool the parsed words per category, and
        # under "" for the overall vocabulary.
        for q in Question.objects.all()[::100]:
            words = utility.wordParse(q.body)

            categoryWords[q.category] += words
            categoryWords[""] += words

        # Number of distinct words in the sample, used as the bin count for WordDist below.
        categoryBins = len(set(categoryWords[""]))
        del categoryWords[""]

        # For each word, count how many categories it occurs in.
        for category in categoryWords:
            categoryCount.update((word,) for word in set(categoryWords[category]))

        # Turn each category's word list into a WordDist and filter it against
        # the per-category word counts.
        for category, words in categoryWords.items():
            categoryWords[category] = WordDist(words, categoryBins)
            utility.wordFilter(categoryCount, len(categoryWords),
                               categoryWords[category])
        print "Trained Category"

        # Build a bigram model per document and back each one off to the word
        # distribution of the category its label belongs to.
        for label in self.documents:
            self.features[label] = NGramModel(2, self.documents[label],
                                              docCount, len(self.documents))
            category = Label.objects.get(body=label).questions.all()[0].category
            self.features[label].addBackoff(categoryWords[category],
                                            categoryBins)
        print "Trained Wikipedia"

        return categoryCount
Example 2
    def __init__(self, n, text, docCount, numDoc):
        self.n = n
        self.model = []

        # One table of conditional word distributions per n-gram order, from 1 up to n.
        for i in xrange(1, n + 1):
            ngrams = utility.ngramFinder(text, i)
            # Each unseen (i-1)-token context gets a fresh WordDist sized to the
            # number of distinct i-grams in the document counts.
            cwd = defaultdict(functools.partial(WordDist, None, len(docCount[i])))

            for ngram in ngrams:
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cwd[context].add(token)

            # Filter each context's distribution against the document frequencies.
            for context in cwd.keys():
                utility.wordFilter(docCount[i], numDoc, cwd[context], context)

            self.model.append(cwd)
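
Read together, Example 1 builds the shared docCount document-frequency counters and then creates one NGramModel (Example 2) per document. The snippet below is a minimal usage sketch, not code from the project: it assumes the project's utility module and the NGramModel class above are importable, and that documents is a hypothetical mapping from labels to pre-tokenised word lists.

# Minimal usage sketch (assumptions: utility and NGramModel from the project are
# importable; `documents` is a hypothetical label -> token-list mapping).
documents = {"label_a": ["some", "tokenised", "text"],
             "label_b": ["another", "tokenised", "text"]}

# Document frequencies per n-gram order, built the same way as in Example 1.
docCount = {1: utility.Counter(), 2: utility.Counter()}
for text in documents.itervalues():
    docCount[1].update(set(utility.ngramFinder(text, 1)))
    docCount[2].update(set(utility.ngramFinder(text, 2)))

# One bigram model per document, mirroring Example 1's final loop.
models = {}
for label, text in documents.items():
    models[label] = NGramModel(2, text, docCount, len(documents))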