def train(self):
    """Train per-label n-gram features with a category-level backoff.

    Three phases:
      1. Count, over ``self.documents``, how many documents contain each
         unigram and bigram (document frequency).
      2. Sample every 100th Question from the database to build one word
         distribution per question category (plus a temporary "" bucket
         used only to count the total number of distinct words).
      3. Fit an NGramModel for each document label and attach the word
         distribution of that label's category as backoff.

    Returns:
        categoryCount -- utility.Counter keyed by 1-tuples ``(word,)``,
        counting in how many categories each word appears (after
        utility.wordFilter has been applied to it -- presumably it prunes
        or reweights entries in place; TODO confirm against utility).
    """
    # Maybe some stemming later?
    # docCount[i]: document frequency of each i-gram across all documents.
    # update() with a set so each document contributes at most 1 per n-gram.
    docCount = {1:utility.Counter(), 2:utility.Counter()}
    for text in self.documents.itervalues():
        docCount[1].update(set(utility.ngramFinder(text, 1)))
        docCount[2].update(set(utility.ngramFinder(text, 2)))
    print "Got doc count"
    categoryCount = utility.Counter()
    categoryWords = defaultdict(list)
    # Subsample the question set: every 100th question only.
    for q in Question.objects.all()[::100]:
        words = utility.wordParse(q.body)
        categoryWords[q.category] += words
        # "" accumulates the global vocabulary across all categories.
        categoryWords[""] += words
    # Total number of distinct words seen; used as the bin count for
    # every WordDist built below.
    categoryBins = len(set(categoryWords[""]))
    del categoryWords[""]
    # categoryCount: for each word, the number of categories containing it
    # (keys are 1-tuples, matching the unigram key shape used elsewhere).
    for category in categoryWords:
        categoryCount.update((word,) for word in set(categoryWords[category]))
    # Replace each raw word list with a WordDist, then filter it against
    # the cross-category counts (wordFilter mutates its arguments in
    # place -- TODO confirm; nothing is reassigned from its return value).
    for category, words in categoryWords.items():
        categoryWords[category] = WordDist(words, categoryBins)
        utility.wordFilter(categoryCount, len(categoryWords), categoryWords[category])
    print "Trained Category"
    # One bigram model per labelled document; each label's category is
    # looked up via its first associated question.
    for label in self.documents:
        self.features[label] = NGramModel(2, self.documents[label], docCount, len(self.documents))
        category = Label.objects.get(body=label).questions.all()[0].category
        self.features[label].addBackoff(categoryWords[category], categoryBins)
    print "Trained Wikipedia"
    return categoryCount
def __init__(self, n, text, docCount, numDoc):
    """Build context -> WordDist models for every n-gram order 1..n.

    self.model[i - 1] maps a context tuple (the first i - 1 tokens of an
    i-gram) to a WordDist over the tokens that follow it, filtered by
    utility.wordFilter against the document-frequency counts.
    """
    self.n = n
    self.model = []
    for order in xrange(1, n + 1):
        # Number of distinct i-grams seen across the corpus; every
        # WordDist of this order is sized with the same bin count.
        bins = len(docCount[order])
        contexts = defaultdict(functools.partial(WordDist, None, bins))
        # Group each n-gram's final token under its preceding context.
        for gram in utility.ngramFinder(text, order):
            contexts[tuple(gram[:-1])].add(gram[-1])
        # Filter each distribution in place; iterate over a snapshot of
        # the keys so wordFilter may safely drop entries.
        for context in contexts.keys():
            utility.wordFilter(docCount[order], numDoc, contexts[context], context)
        self.model.append(contexts)