def __init__(self, cache_dir="./.classifiers-data/naive-bayes", missing_value_prob=0.0001):
    self.cache_dir = cache_dir
    if not os.path.isdir(cache_dir):
        os.makedirs(self.cache_dir)
    self.preprocessor = ChainedPreprocessor([HtmlPreprocessor(),
                                             WordSplitterPreprocessor(),
                                             StemmerPreprocessor(),
                                             WordDeduper(),
                                             NumeralRemover(),
                                             StopWordRemovePreprocessor()])
    self.labels_metadata_fname = self.cache_dir + "/labels.metadata"
    self.labels_metadata = utils.read_cached(self.labels_metadata_fname, lambda: {})
    self.missing_prob = missing_value_prob
def retrain_label(self, label):
    """Retrain this label using its existing cached documents."""
    if label not in self.labels_metadata:
        return
    # Copy the doc ids before clearing the label's metadata, since clearing drops them.
    doc_ids = list(self.labels_metadata[label]["doc_ids"])
    self.__clear_label(label)
    for doc_id in doc_ids:
        words = utils.read_cached(self.cache_dir + "/" + doc_id, lambda: [])
        self.__train_with_words(doc_id, words, label)
def classify(self, url, content):
    words = utils.read_cached(self.cache_dir + "/" + utils.get_hash(url), self.preprocessor.process, content)
    argmax = {"label": None, "prob": None}
    probs = {}
    for label, label_data in self.labels_metadata.items():
        prob_sum = 0.0
        tot_doc_count = len(label_data["doc_ids"])
        for word in words:
            if word in label_data["word_doc_counts"]:
                # log P(word | label), estimated as the fraction of the label's documents containing the word.
                prob = math.log(float(label_data["word_doc_counts"][word]) / tot_doc_count)
                logger.debug("Found word: %s. Prob %f. Count in the %s data: %d" % (word, prob, label, label_data["word_doc_counts"][word]))
                prob_sum = prob_sum + prob
            else:
                # Unseen words fall back to a small constant probability.
                prob_sum = prob_sum + math.log(self.missing_prob)
        probs[label] = prob_sum
        if argmax["prob"] is None or prob_sum > argmax["prob"]:
            argmax["prob"] = prob_sum
            argmax["label"] = label
    logger.debug("URL: %s, debuginfo: %s" % (url, str(probs)))
    return argmax
def train(self, source_url, content, label):
    docid = utils.get_hash(source_url)
    words = utils.read_cached(self.cache_dir + "/" + docid, self.preprocessor.process, content)
    self.__train_with_words(docid, words, label)
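
# A minimal usage sketch (illustration only): "NaiveBayesClassifier" is an assumed
# name for the class these methods belong to, since the class statement is outside
# this excerpt, and the demo cache_dir is made up. Adjust both before running.
if __name__ == "__main__":
    clf = NaiveBayesClassifier(cache_dir="./.classifiers-data/naive-bayes-demo")
    # Train with a couple of labeled pages; content is raw HTML, so the
    # preprocessor pipeline handles tag stripping and tokenization.
    clf.train("http://example.com/spam-1", "<html>Buy cheap pills now</html>", "spam")
    clf.train("http://example.com/ham-1", "<html>Meeting notes for Tuesday</html>", "ham")
    # classify() returns the label with the highest summed log-probability.
    result = clf.classify("http://example.com/new-doc", "<html>cheap pills, limited offer</html>")
    print(result["label"], result["prob"])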