import math
import re

# NaiveBayesModel, NTF_Model, and get_files are defined elsewhere in the project.


class NB_NTF(NaiveBayesModel):

    def classify(self, example, cost_ratio):
        # Accumulate per-feature terms for the spam (index 0) and ham (index 1)
        # class parameters; base_param is the prior probability of spam.
        log_likelihood1 = math.log(self.model.base_param)
        log_likelihood2 = math.log(1 - self.model.base_param)
        for i, token in enumerate(self.model.attribute_params):
            log_likelihood1 += (math.log(self.model.attribute_params[i][0])
                                - example[i] / self.model.attribute_params[i][0])
            log_likelihood2 += (math.log(self.model.attribute_params[i][1])
                                - example[i] / self.model.attribute_params[i][1])
        # Label as spam only if the spam score beats the ham score by more than
        # the log of the misclassification cost ratio.
        return int(log_likelihood1 - math.log(cost_ratio) > log_likelihood2)

    def train(self, spam_dir, ham_dir):
        self.model = NTF_Model()
        self.model.set_features(self.features)
        N = 0
        loss = 0.
        # Feed every spam example (label 1), then every ham example (label 0),
        # into the model before building the network.
        for f in get_files(spam_dir):
            print N
            N += 1
            self.model.observe_example(self.munge(f), 1)
        for f in get_files(ham_dir):
            print N
            N += 1
            self.model.observe_example(self.munge(f), 0)
        self.model.build_network()
        print "finished training"

    def munge(self, email_file):
        # Convert an email into a normalized-term-frequency (NTF) vector:
        # each feature token's count divided by the email's total word count.
        with open(email_file, 'rb') as f:
            text = f.read()
        word_list = re.split(r'\W+', text)
        num_words = len(word_list)
        ntf_vector = [float(word_list.count(token)) / num_words
                      for token in self.features]
        return ntf_vector
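# A minimal usage sketch (an assumption, not part of the original module): the
# feature tokens, directory paths, and test-email path below are hypothetical
# placeholders, and self.features is assumed to be settable directly on the
# classifier instance.
if __name__ == '__main__':
    classifier = NB_NTF()
    classifier.features = ['free', 'viagra', 'meeting', 'lunch']  # hypothetical feature tokens
    classifier.train('data/spam', 'data/ham')                     # hypothetical training directories
    ntf_vector = classifier.munge('data/test/email_001.txt')      # hypothetical test email
    # cost_ratio > 1 raises the evidence needed before labeling mail as spam
    print classifier.classify(ntf_vector, cost_ratio=1.0)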