def __init__(self, cache_dir="./.classifiers-data/naive-bayes", missing_value_prob=0.0001):
    self.cache_dir = cache_dir
    if not os.path.isdir(self.cache_dir):
        os.makedirs(self.cache_dir)
    # Normalizes raw HTML into stemmed, deduplicated tokens, with numerals and stop words removed.
    self.preprocessor = ChainedPreprocessor([HtmlPreprocessor(), WordSplitterPreprocessor(), StemmerPreprocessor(), WordDeduper(), NumeralRemover(), StopWordRemovePreprocessor()])
    self.labels_metadata_fname = self.cache_dir + "/labels.metadata"
    # labels_metadata maps each label to its training state; inferred shape:
    #   {label: {"doc_ids": [doc_id, ...], "word_doc_counts": {word: doc_count}}}
    self.labels_metadata = utils.read_cached(self.labels_metadata_fname, lambda: {})
    self.missing_prob = missing_value_prob
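# NOTE: utils.read_cached is not defined in this file. From its call sites it is
# assumed to behave roughly like the sketch below; the pickle serialization is a
# guess, and only the (fname, factory, *args) contract is actually relied upon:
#
#   def read_cached(fname, factory, *args):
#       if os.path.isfile(fname):
#           with open(fname, "rb") as f:
#               return pickle.load(f)
#       value = factory(*args)
#       with open(fname, "wb") as f:
#           pickle.dump(value, f)
#       return value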
def retrain_label(self, label):
    """
    Retrain this label from the cached word lists of its existing documents.
    """
    if label not in self.labels_metadata:
        return
    # Copy the id list: __clear_label is expected to reset this label's metadata.
    doc_ids = list(self.labels_metadata[label]["doc_ids"])
    self.__clear_label(label)
    for doc_id in doc_ids:
        # read_cached expects a factory callable, matching its other call sites.
        words = utils.read_cached(self.cache_dir + "/" + doc_id, lambda: [])
        self.__train_with_words(doc_id, words, label)
def classify(self, url, content):
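    """
    Pick the most likely label for the page via a Naive Bayes log-likelihood sum:
        score(label) = sum over words w of log P(w | label)
    where P(w | label) is estimated as the fraction of the label's training
    documents containing w, and words unseen for a label fall back to
    missing_prob. No class prior term is applied. Returns the winner as
    {"label": ..., "prob": <log score>}.
    """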
    words = utils.read_cached(self.cache_dir + "/" + utils.get_hash(url), self.preprocessor.process, content)
    argmax = {"label": None, "prob": None}
    probs = {}
    for label, label_data in self.labels_metadata.items():
        tot_doc_count = len(label_data["doc_ids"])
        prob_sum = 0.0
        for word in words:
            if word in label_data["word_doc_counts"]:
                # P(word | label): fraction of this label's documents containing the word.
                prob = math.log(float(label_data["word_doc_counts"][word]) / tot_doc_count)
                logger.debug("Found word: %s. Prob %f. Count in the %s data: %d" % (word, prob, label, label_data["word_doc_counts"][word]))
            else:
                # Unseen word: fall back to the fixed smoothing probability.
                prob = math.log(self.missing_prob)
            prob_sum += prob
        probs[label] = prob_sum
        # Compare against None explicitly: a log score of 0.0 (empty word list) is falsy.
        if argmax["prob"] is None or prob_sum > argmax["prob"]:
            argmax["prob"] = prob_sum
            argmax["label"] = label
    logger.debug("URL: %s, debuginfo: %s" % (url, str(probs)))
    return argmax
def train(self, source_url, content, label):
    """
    Preprocess (and cache) the document's words, then add them to this
    label's statistics via __train_with_words.
    """
    doc_id = utils.get_hash(source_url)
    words = utils.read_cached(self.cache_dir + "/" + doc_id, self.preprocessor.process, content)
    self.__train_with_words(doc_id, words, label)
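# A minimal usage sketch (hypothetical URLs and labels; "Classifier" stands in
# for whatever name the enclosing class actually has):
#
#   clf = Classifier()
#   clf.train("http://example.com/a", "<p>buy cheap pills now</p>", "spam")
#   clf.train("http://example.com/b", "<p>meeting notes for monday</p>", "ham")
#   result = clf.classify("http://example.com/c", "<p>cheap pills</p>")
#   # result looks like {"label": "spam", "prob": <summed log-probability>}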