def __init__(self):
    """Initialise an empty classifier state.

    No corpus is loaded yet: the corpus string and feature list are empty,
    no classifier index exists, no per-category thresholds are registered,
    and a default ``FeatureExtractor`` is created.
    """
    # NOTE(review): the double-underscore attribute below is only
    # name-mangled when this function sits inside a class body -- confirm
    # this copy is meant to live inside ``Classifier``.
    self.savedCategories = None
    self._corpus = ""
    self._features = []
    self._classifierIndex = None
    # Per-category minimum thresholds, keyed first by category name.
    self._mins = {}
    self.__featureExtractor = FeatureExtractor()
class Classifier(object):
    """Classify a text corpus into a list of probable tags.

    The classifier extracts features from a corpus with a pluggable feature
    extractor, builds a ``ClassifierIndex`` from those features and asks
    ``_getProbableTags`` for the resulting tags.  Per-category minimum
    thresholds can be registered with ``setMinThreshold``.

    NOTE(review): ``_getProbableTags`` is called by ``classify`` but is not
    defined in this part of the class -- presumably it is provided further
    down or by a subclass; confirm.
    """

    # Threshold returned by getMinThreshold() when none was registered for
    # the requested (categoryName, yes) pair.
    _DEFAULT_MIN_THRESHOLD = .6

    def __init__(self):
        """Initialise an empty classifier with a default FeatureExtractor."""
        self.savedCategories = None
        self._corpus = ""
        self._features = []
        self._classifierIndex = None
        # {categoryName: {yes: threshold}}
        self._mins = {}
        self.__featureExtractor = FeatureExtractor()

    def _getCorpusShort(self):
        """Return the first 50 characters of the corpus, or "" if it is empty/unset."""
        return self._corpus[:50] if self._corpus else ""

    def _getGroupedCategories(self, categories):
        """Group categories by name, then by their ``yes`` attribute.

        :param categories: iterable of objects exposing ``categoryName`` and
            ``yes`` attributes.
        :returns: ``{categoryName: {yes: category}}``; when several categories
            share the same name and ``yes`` value, the last one wins.
        """
        groupedCategories = {}
        for category in categories:
            byYes = groupedCategories.setdefault(category.categoryName, {})
            byYes[category.yes] = category
        return groupedCategories

    def setFeatureExtractor(self, featureExtractor):
        """Replace the feature extractor used by ``classify``."""
        self.__featureExtractor = featureExtractor

    def setMinThreshold(self, categoryName, yes, value):
        """Register the minimum threshold for a (categoryName, yes) pair."""
        self._mins.setdefault(categoryName, {})[yes] = value

    def getMinThreshold(self, categoryName, yes):
        """Return the registered threshold for (categoryName, yes).

        Falls back to ``_DEFAULT_MIN_THRESHOLD`` (0.6) when nothing was
        registered for that pair.
        """
        return self._mins.get(categoryName, {}).get(yes, self._DEFAULT_MIN_THRESHOLD)

    def classify(self, corpus):
        """Classification of text corpus.

        Stores the corpus, extracts its features, rebuilds the classifier
        index and returns whatever ``_getProbableTags`` yields for the
        categories grouped by name.

        :param corpus: the text to classify.
        :returns: the probable tags, or an empty list when
            ``_getProbableTags`` raises (the failure is logged with its
            traceback instead of being propagated).
        """
        self._corpus = corpus
        self._features = self.__featureExtractor.getFeatures(corpus)
        self._classifierIndex = ClassifierIndex(self._features)

        # Find the category with the highest probability
        logger = logging.getLogger("Classifier.classify")
        categories = ClassifierCategory.getAllCategories()
        groupedCategories = self._getGroupedCategories(categories)

        probableTags = []
        try:
            probableTags = self._getProbableTags(groupedCategories)
        except Exception as ex:
            # Best effort: a failed classification yields no tags.  Lazy
            # %-args keep message formatting inside the logging machinery,
            # so a badly-encodable exception message cannot raise from here.
            logger.exception("classification failure: %s", ex)
        return probableTags