    def loadAllDocumentCounts(self):
        logger = logging.getLogger("ClassifierIndex.loadAllDocumentCounts")
        try:
            countIndex = CategoryDocumentCountIndex.getCountIndex()
            self.__documentCountHash = self.__jsonDecoder.decode(countIndex.countData) if countIndex.countData else {}
        except Exception as ex:
            logger.exception("Failed to load all document counts: " + str(ex))
            raise ClassifyIndexLoadFailure("Failed to load all document counts: " + str(ex))
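CategoryDocumentCountIndex.getCountIndex() is not part of this excerpt. A minimal sketch of what such a single-row Django model and accessor might look like, given only the usage above (the field type, default, and pk=1 single-row convention are assumptions):

# Hypothetical sketch -- field names and the single-row convention are assumptions.
from django.db import models

class CategoryDocumentCountIndex(models.Model):
    # JSON-encoded mapping of category name -> number of documents trained for it
    countData = models.TextField(blank=True, default="")

    @classmethod
    def getCountIndex(cls):
        # Keep one shared row that holds the per-category document counts
        index, _ = cls.objects.get_or_create(pk=1)
        return index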
Example #2
    def train(self, corpus="", yesTagNames=None, noTagNames=None):
        logger = logging.getLogger("Trainer.train")
        success = False

        categories = []
        try:
            document = Document.getDocumentByCorpus(corpus)
            if not document:
                features = self.__featuredExtractor.getFeatures(corpus)
                categories = self.__getCategoriesFromNames(yesTagNames, noTagNames)

                document = Document(corpus=corpus)
                document.save()
                documentCounts = {}
                for category in categories:
                    self.__incrementCategoryCount(documentCounts, category)
                DocumentCategoryCounts(document=document, countData=self._jsonEncoder.encode(documentCounts)).save()

                for feature in features:
                    featureCount, _ = FeatureCounts.objects.get_or_create(featureName=feature)
                    counts = self._jsonDecoder.decode(featureCount.countData) if featureCount.countData else {}

                    for category in categories:
                        self.__incrementCategoryCount(counts, category)

                    featureCount.countData = self._jsonEncoder.encode(counts)
                    featureCount.save()

                # We keep an index of category document counts for faster classification later on
                catDocCountIndex = CategoryDocumentCountIndex.getCountIndex()
                index = self._jsonDecoder.decode(catDocCountIndex.countData) if catDocCountIndex.countData else {}
                for category in categories:
                    self.__incrementCategoryCount(index, category)
                catDocCountIndex.countData = self._jsonEncoder.encode(index)
                catDocCountIndex.save()

                success = True

                transaction.commit()
            else:
                logger.info("Document already exists: " + str(document.id) + " - " + document.corpusHash)
                success = True

        except Exception as ex:
            logger.info("Bad data: %s" % corpus)
            logger.exception("Failed to save the trained data: " + str(ex))
            transaction.rollback()

        return success
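__incrementCategoryCount is referenced above but not shown. A rough sketch of the bookkeeping it presumably performs, assuming counts is a plain dict keyed by category name (whether category is a model object with a name attribute or a bare string is an assumption handled defensively here):

    # Hypothetical helper -- the real implementation is not in the excerpt.
    def __incrementCategoryCount(self, counts, category):
        key = getattr(category, "name", category)  # accept a Category object or a plain name
        counts[key] = counts.get(key, 0) + 1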
Example #3
    def untrain(self, corpus=""):
        logger = logging.getLogger("Trainer.untrain")
        success = False

        try:
            document = Document.getDocumentByCorpus(corpus)

            if document:
                categories = DocumentCategoryCounts.getCategoriesForDocument(document)
                features = self.__featuredExtractor.getFeatures(corpus)
                document.delete()

                for feature in features:
                    featureCount, _ = FeatureCounts.objects.get_or_create(featureName=feature)
                    counts = self._jsonDecoder.decode(featureCount.countData) if featureCount.countData else {}

                    for category in categories:
                        self.__decrementCategoryCount(counts, category)

                    featureCount.countData = self._jsonEncoder.encode(counts)
                    featureCount.save()

                # We keep an index of category document counts for faster classification later on
                catDocCountIndex = CategoryDocumentCountIndex.getCountIndex()
                index = self._jsonDecoder.decode(catDocCountIndex.countData) if catDocCountIndex.countData else {}
                for category in categories:
                    self.__decrementCategoryCount(index, category)
                catDocCountIndex.countData = self._jsonEncoder.encode(index)
                catDocCountIndex.save()

                success = True

                transaction.commit()

            else:
                logger.info("Document doesn't exist")
                success = True
        except Exception as ex:
            logger.exception("Failed to untrain the document: " + str(ex))
            transaction.rollback()

        return success
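__decrementCategoryCount, the counterpart used during untraining, is likewise not shown. A sketch under the same assumptions, flooring counts at zero so repeated untraining cannot drive them negative:

    # Hypothetical helper -- mirrors the __incrementCategoryCount sketch above.
    def __decrementCategoryCount(self, counts, category):
        key = getattr(category, "name", category)  # accept a Category object or a plain name
        if counts.get(key, 0) > 0:
            counts[key] -= 1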