Example #1
def train(self, spam_dir, ham_dir):
    self.model = NTF_Model()
    self.model.set_features(self.features)
    N = 0
    # Feed every spam message to the model with label 1.
    for f in get_files(spam_dir):
        print(N)  # simple progress counter
        N += 1
        self.model.observe_example(self.munge(f), 1)
    # Feed every ham message to the model with label 0.
    for f in get_files(ham_dir):
        print(N)
        N += 1
        self.model.observe_example(self.munge(f), 0)
    self.model.build_network()
    print("finished training")
Example #2
import math
import re


class NB_NTF(NaiveBayesModel):

    def classify(self, example, cost_ratio):
        # Log-likelihood of the example under the spam (1) and ham (2) models.
        log_likelihood1 = math.log(self.model.base_param)
        log_likelihood2 = math.log(1 - self.model.base_param)
        for i, params in enumerate(self.model.attribute_params):
            log_likelihood1 += (math.log(params[0]) -
                                example[i] / params[0])
            log_likelihood2 += (math.log(params[1]) -
                                example[i] / params[1])
        # Label as spam (1) only if the spam likelihood beats the ham
        # likelihood by at least the cost ratio.
        return int(log_likelihood1 - math.log(cost_ratio) > log_likelihood2)

    def train(self, spam_dir, ham_dir):
        self.model = NTF_Model()
        self.model.set_features(self.features)
        N = 0
        # Feed every spam message to the model with label 1.
        for f in get_files(spam_dir):
            print(N)  # simple progress counter
            N += 1
            self.model.observe_example(self.munge(f), 1)
        # Feed every ham message to the model with label 0.
        for f in get_files(ham_dir):
            print(N)
            N += 1
            self.model.observe_example(self.munge(f), 0)
        self.model.build_network()
        print("finished training")

    def munge(self, email_file):
        # Convert an email file into a normalized-term-frequency vector
        # over the configured feature tokens.
        with open(email_file) as f:
            text = f.read()
        word_list = re.split(r'\W+', text)
        num_words = len(word_list)
        ntf_vector = [float(word_list.count(token)) / num_words
                      for token in self.features]
        return ntf_vector
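
A hedged usage sketch for Example #2: it assumes NB_NTF can be constructed with no arguments, that its features attribute (the token list consumed by munge and set_features) is assigned before training, and that spam/, ham/, and new_message.txt are placeholder paths; none of these details come from the original example.

# Illustrative usage only; the feature list, directories, and cost ratio
# below are assumptions, not part of the original example.
clf = NB_NTF()
clf.features = ["viagra", "free", "meeting", "lunch"]  # assumed feature tokens
clf.train("spam/", "ham/")

# cost_ratio > 1 raises the evidence needed before labeling a message as spam.
vector = clf.munge("new_message.txt")
print("spam" if clf.classify(vector, cost_ratio=9.0) else "ham")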