Example #1
0
class Feature_Chooser:
    """Build a word-feature list from spam/ham emails and prune it.

    Typical flow as suggested by the methods: ``choose`` collects every
    distinct word, ``train`` fits the boolean model on a sample of the files
    and keeps only features whose two attribute parameters differ by more
    than ``threshold``, and ``pickle`` writes the surviving list to disk.

    NOTE: this code targets Python 2 (``munge`` reads bytes and splits them
    with a text regex, which only works where ``str`` is a byte string).
    """

    def __init__(self):
        # Ordered list of candidate feature words; its order defines the
        # positions of the boolean vectors produced by ``munge``.
        self.features = []
        self.model = Boolean_Model()
        # Minimum absolute gap between an attribute's two parameters for the
        # corresponding feature to survive ``train``.
        self.threshold = .03

    def choose(self, spam_dir, ham_dir):
        """Collect every distinct word found in both directories.

        Words are appended to ``self.features`` in first-seen order (spam
        files first, then ham files) and the resulting list is handed to the
        model via ``set_features``.
        """
        # A set mirrors the list so the "already seen?" test is O(1) instead
        # of scanning the growing list for every word.
        # Assumes words are hashable (presumably strings) - TODO confirm
        # against munge_All_Words.
        seen = set(self.features)
        for directory in (spam_dir, ham_dir):
            for f in get_files(directory):
                for word in munge_All_Words(f):
                    if word not in seen:
                        seen.add(word)
                        self.features.append(word)
        print(len(self.features))
        self.model.set_features(self.features)
        print("finished choosing features")

    def train(self, spam_dir, ham_dir):
        """Fit the model on a sample of the files and prune weak features.

        Spam examples are observed with label 1 and ham examples with
        label 0. After ``build_network`` the feature list is replaced by the
        features whose attribute parameters differ by more than
        ``self.threshold``.
        """
        # NOTE(review): only every 23rd file (counted across both
        # directories) is observed, and the ham loop prints N for every file
        # and again for sampled ones. This looks like deliberate subsampling
        # plus leftover debug output; both are preserved as-is - confirm.
        N = 0
        for f in get_files(spam_dir):
            N += 1
            if N % 23 == 0:
                print(N)
                self.model.observe_example(self.munge(f), 1)
        for f in get_files(ham_dir):
            N += 1
            print(N)
            if N % 23 == 0:
                print(N)
                self.model.observe_example(self.munge(f), 0)
        self.model.build_network()
        new_features = []
        for i, attribute in enumerate(self.model.attribute_params):
            difference = attribute[1] - attribute[0]
            # Same text as printing the two values separated by a space.
            print("%s %s" % (attribute, difference))
            if abs(difference) > self.threshold:
                new_features.append(self.features[i])
        print(new_features)
        self.features = new_features
        print("finished training")

    def munge(self, email_file):
        """Return a 0/1 list marking which features occur in ``email_file``.

        The file content is split on runs of non-word characters; element
        ``k`` of the result is 1 when ``self.features[k]`` is one of the
        resulting words, else 0.
        """
        # The context manager closes the handle even if reading fails.
        with open(email_file, 'rb') as f:
            text = f.read()
        # Set membership avoids rescanning the word list for every feature.
        words = set(re.split(r'\W+', text))
        return [int(token in words) for token in self.features]

    def pickle(self, features_file):
        """Serialize ``self.features`` to ``features_file`` with pickle."""
        # Closing the file explicitly guarantees the data is flushed to disk
        # before "pickled" is reported.
        with open(features_file, 'wb') as output:
            pickle.dump(self.features, output)
        print(self.features)
        print("pickled")
Example #2
0
 def train(self, spam_dir, ham_dir):
     """Rebuild the model from every file in the two directories.

     A fresh ``Boolean_Model`` is created and given ``self.features``; each
     spam file is observed with label 1, then each ham file with label 0,
     and finally ``build_network`` is called. A running 0-based file count
     is printed before each example is observed.
     """
     model = Boolean_Model()
     self.model = model
     model.set_features(self.features)
     seen = 0
     # Spam first, then ham, sharing a single running counter.
     for directory, label in ((spam_dir, 1), (ham_dir, 0)):
         for path in get_files(directory):
             print(seen)
             seen += 1
             model.observe_example(self.munge(path), label)
     model.build_network()
     print("finished training")
Example #3
0
class NB_Boolean(NaiveBayesModel):
    """Naive Bayes classifier over boolean word-presence features.

    ``self.features`` is read but not set in this class; presumably it is
    provided by ``NaiveBayesModel`` or the caller - TODO confirm.

    NOTE: this code targets Python 2 (``munge`` reads bytes and splits them
    with a text regex, which only works where ``str`` is a byte string).
    """

    def classify(self, example, cost_ratio):
        """Return 1 or 0 by comparing two log scores for ``example``.

        The first score starts at ``log(base_param)`` and uses element 0 of
        each attribute parameter pair; the second starts at
        ``log(1 - base_param)`` and uses element 1. For a feature whose
        entry in ``example`` equals 1 the parameter itself is used,
        otherwise its complement. The result is 1 when the first score minus
        ``log(cost_ratio)`` is strictly greater than the second, else 0.

        ``math.log`` raises ``ValueError`` if any of these quantities is not
        positive (for example a parameter that is exactly 0 or 1).
        """
        model = self.model
        log_likelihood1 = math.log(model.base_param)
        log_likelihood2 = math.log(1 - model.base_param)
        for i, params in enumerate(model.attribute_params):
            if example[i] == 1:
                log_likelihood1 += math.log(params[0])
                log_likelihood2 += math.log(params[1])
            else:
                log_likelihood1 += math.log(1 - params[0])
                log_likelihood2 += math.log(1 - params[1])
        return int(log_likelihood1 - math.log(cost_ratio) > log_likelihood2)

    def train(self, spam_dir, ham_dir):
        """Rebuild the model from every file in the two directories.

        Spam files are observed with label 1, then ham files with label 0,
        after which ``build_network`` is called. A running 0-based file
        count is printed before each example is observed.
        """
        self.model = Boolean_Model()
        self.model.set_features(self.features)
        N = 0
        for f in get_files(spam_dir):
            print(N)
            N += 1
            self.model.observe_example(self.munge(f), 1)
        for f in get_files(ham_dir):
            print(N)
            N += 1
            self.model.observe_example(self.munge(f), 0)
        self.model.build_network()
        print("finished training")

    def munge(self, email_file):
        """Return a 0/1 list marking which features occur in ``email_file``.

        The file content is split on runs of non-word characters; element
        ``k`` of the result is 1 when ``self.features[k]`` is one of the
        resulting words, else 0.
        """
        # The context manager closes the handle even if reading fails.
        with open(email_file, 'rb') as f:
            text = f.read()
        # Set membership avoids rescanning the word list for every feature.
        words = set(re.split(r'\W+', text))
        return [int(token in words) for token in self.features]
Example #4
0
 def __init__(self):
     """Start with no features, a new ``Boolean_Model`` and a 0.03 threshold."""
     # NOTE(review): how the threshold is used is not visible in this
     # fragment; presumably a feature-selection cutoff - confirm.
     self.threshold = 0.03
     self.features = list()
     self.model = Boolean_Model()