def train(self): trainingData, allFeatures, docCount = [], set(), utility.Counter() paragraphCounter = 0 for label, text in self.documents.items(): for paragraph in text.split("\n"): d = defaultdict(bool) words = set(utility.wordParse(paragraph)) docCount.update(words) paragraphCounter += 1 for word in words: d[word] = True allFeatures.add(word) trainingData.append((label, d)) print len(allFeatures) self.tree = BoolDecisionTree(trainingData, allFeatures)
def train(self):
    """Train per-label n-gram models with a category-level backoff.

    Builds unigram/bigram document counts from ``self.documents``, builds
    per-category word distributions from a sample of Question objects,
    then fits an NGramModel per label with the matching category
    distribution as backoff (stored in ``self.features``).

    Returns the category-level Counter of word occurrences.
    """
    # Maybe some stemming later?
    # Document frequencies: for each n in {1, 2}, how many documents
    # contain each n-gram (set() dedupes within a document).
    docCount = {1: utility.Counter(), 2: utility.Counter()}
    for text in self.documents.itervalues():
        docCount[1].update(set(utility.ngramFinder(text, 1)))
        docCount[2].update(set(utility.ngramFinder(text, 2)))
    print "Got doc count"
    categoryCount = utility.Counter()
    categoryWords = defaultdict(list)
    # Sample every 100th question.  NOTE(review): slicing a Django
    # queryset with a step evaluates the whole queryset in memory —
    # presumably acceptable here; confirm against dataset size.
    for q in Question.objects.all()[::100]:
        words = utility.wordParse(q.body)
        categoryWords[q.category] += words
        # "" accumulates words from ALL categories; used only to size
        # the vocabulary (categoryBins), then removed below.
        categoryWords[""] += words
    categoryBins = len(set(categoryWords[""]))
    del categoryWords[""]
    for category in categoryWords:
        # Keys are 1-tuples (word,) — presumably to match n-gram tuple
        # keys used elsewhere; verify against NGramModel/wordFilter.
        categoryCount.update((word,) for word in set(categoryWords[category]))
    for category, words in categoryWords.items():
        categoryWords[category] = WordDist(words, categoryBins)
        # NOTE(review): wordFilter appears to prune/adjust categoryCount
        # using each category's distribution — semantics live in
        # utility.wordFilter; confirm it is intended per-category here.
        utility.wordFilter(categoryCount, len(categoryWords), categoryWords[category])
    print "Trained Category"
    for label in self.documents:
        self.features[label] = NGramModel(2, self.documents[label], docCount, len(self.documents))
        # Look up the label's category via its first associated question.
        # NOTE(review): assumes every Label has at least one question —
        # [0] raises IndexError otherwise; confirm with the data model.
        category = Label.objects.get(body=label).questions.all()[0].category
        self.features[label].addBackoff(categoryWords[category], categoryBins)
    print "Trained Wikipedia"
    return categoryCount