def train(self, corpus="", yesTagNames=None, noTagNames=None):
    """Train the classifier on *corpus*, filing it under the given tag names.

    If no Document exists yet for *corpus*, a new Document is created and the
    per-feature category counts, the per-document category counts, and the
    global category-document-count index are all incremented inside a single
    manually managed transaction.  If the document already exists, training is
    skipped (it has already been counted) and the call still succeeds.

    :param corpus: raw text to train on.
    :param yesTagNames: tag names to count as positive categories
                        (passed through to __getCategoriesFromNames).
    :param noTagNames: tag names to count as negative categories.
    :return: True on success (including the already-trained case),
             False if anything failed and the transaction was rolled back.
    """
    logger = logging.getLogger("Trainer.train")
    success = False
    categories = []
    try:
        document = Document.getDocumentByCorpus(corpus)
        if not document:
            features = self.__featuredExtractor.getFeatures(corpus)
            categories = self.__getCategoriesFromNames(yesTagNames, noTagNames)
            document = Document(corpus=corpus)
            document.save()
            # Per-document category counts, stored as a JSON blob.
            documentCounts = {}
            for category in categories:
                self.__incrementCategoryCount(documentCounts, category)
            DocumentCategoryCounts(document=document,
                                   countData=self._jsonEncoder.encode(documentCounts)).save()
            # Bump each extracted feature's count for every category.
            for feature in features:
                featureCount, _ = FeatureCounts.objects.get_or_create(featureName=feature)
                counts = self._jsonDecoder.decode(featureCount.countData) if featureCount.countData else {}
                for category in categories:
                    self.__incrementCategoryCount(counts, category)
                featureCount.countData = self._jsonEncoder.encode(counts)
                featureCount.save()
            # We keep an index of category document counts for faster classification later on.
            catDocCountIndex = CategoryDocumentCountIndex.getCountIndex()
            index = self._jsonDecoder.decode(catDocCountIndex.countData) if catDocCountIndex.countData else {}
            for category in categories:
                self.__incrementCategoryCount(index, category)
            catDocCountIndex.countData = self._jsonEncoder.encode(index)
            catDocCountIndex.save()
            # Commit BEFORE flagging success: if commit itself raises, the
            # handler below rolls back and success correctly stays False.
            transaction.commit()
            success = True
        else:
            logger.info("Document already exists: " + str(document.id) + " - " + document.corpusHash)
            success = True
    except Exception as ex:
        # Broad catch is deliberate: any failure mid-train must roll back the
        # whole transaction so the count tables stay consistent.
        logger.info("Bad data:%s" % corpus)
        logger.exception("Failed to save the trained data: " + str(ex))
        transaction.rollback()
    return success
def untrain(self, corpus=""):
    """Reverse a previous train() call for *corpus*.

    Looks up the Document for *corpus*; if found, deletes it and decrements
    the per-feature category counts and the global category-document-count
    index for every category the document was filed under, all inside a
    single manually managed transaction.  If no document exists there is
    nothing to undo and the call still succeeds.

    :param corpus: raw text previously passed to train().
    :return: True on success (including the not-trained case),
             False if anything failed and the transaction was rolled back.
    """
    logger = logging.getLogger("Trainer.untrain")
    success = False
    try:
        document = Document.getDocumentByCorpus(corpus)
        if document:
            # Fetch the document's categories before deleting it, then
            # re-extract the same features train() counted.
            categories = DocumentCategoryCounts.getCategoriesForDocument(document)
            features = self.__featuredExtractor.getFeatures(corpus)
            document.delete()
            for feature in features:
                featureCount, _ = FeatureCounts.objects.get_or_create(featureName=feature)
                counts = self._jsonDecoder.decode(featureCount.countData) if featureCount.countData else {}
                for category in categories:
                    self.__decrementCategoryCount(counts, category)
                featureCount.countData = self._jsonEncoder.encode(counts)
                featureCount.save()
            # We keep an index of category document counts for faster classification later on.
            catDocCountIndex = CategoryDocumentCountIndex.getCountIndex()
            index = self._jsonDecoder.decode(catDocCountIndex.countData) if catDocCountIndex.countData else {}
            for category in categories:
                self.__decrementCategoryCount(index, category)
            catDocCountIndex.countData = self._jsonEncoder.encode(index)
            catDocCountIndex.save()
            # Commit BEFORE flagging success: if commit itself raises, the
            # handler below rolls back and success correctly stays False.
            transaction.commit()
            success = True
        else:
            logger.info("Document doesn't exist")
            success = True
    except Exception as ex:
        # Broad catch is deliberate: any failure mid-untrain must roll back
        # the whole transaction so the count tables stay consistent.
        logger.exception("Failed to untrain the document: " + str(ex))
        transaction.rollback()
    return success