Exemple #1
0
def compute_quality_for_corpus(corpus_dir):
    """Score the predictions stored in a corpus directory.

    Loads ``!truth.txt`` and ``!prediction.txt`` from ``corpus_dir``, builds a
    SPAM/OK confusion matrix from the two classifications and returns the
    result of ``quality_score`` for its tp/tn/fp/fn counts.
    """
    # Truth is read first, prediction second.
    classifications = [
        utils.read_classification_from_file(os.path.join(corpus_dir, name))
        for name in ('!truth.txt', '!prediction.txt')
    ]
    matrix = confmat.BinaryConfusionMatrix('SPAM', 'OK')
    matrix.compute_from_dicts(*classifications)
    counts = matrix.as_dict()
    return quality_score(counts['tp'], counts['tn'], counts['fp'], counts['fn'])
Exemple #2
0
def compute_quality_for_corpus(corpus_dir):
    """Return the quality score of the predictions stored in ``corpus_dir``.

    The truth and prediction classifications are read from ``!truth.txt`` and
    ``!prediction.txt`` inside ``corpus_dir``; a SPAM/OK confusion matrix is
    computed from them and its ``as_dict()`` counts are passed to
    ``quality_score`` as keyword arguments.

    The files are addressed by their joined path instead of temporarily
    switching the working directory: ``os.chdir`` changes process-wide state,
    and a failed read would otherwise leave the process inside ``corpus_dir``.
    """
    truth_dict = read_classification_from_file(
        os.path.join(corpus_dir, '!truth.txt'))
    pred_dict = read_classification_from_file(
        os.path.join(corpus_dir, '!prediction.txt'))
    cm = BinaryConfusionMatrix(pos_tag='SPAM', neg_tag='OK')
    cm.compute_from_dicts(truth_dict, pred_dict)
    return quality_score(**cm.as_dict())
Exemple #3
0
def compute_quality_for_corpus(corpus_dir):
    """Compute ``quality_score`` for the predictions stored in ``corpus_dir``.

    Reads ``!truth.txt`` and ``!prediction.txt`` from the directory, builds a
    SPAM/OK confusion matrix from them and passes its tp, tn, fp and fn counts
    to ``quality_score``.
    """
    def load(file_name):
        # Produces the same string as corpus_dir + os.path.sep + file_name.
        return utils.read_classification_from_file(
            os.path.sep.join((corpus_dir, file_name)))

    expected = load("!truth.txt")
    observed = load("!prediction.txt")
    matrix = confmat.BinaryConfusionMatrix("SPAM", "OK")
    matrix.compute_from_dicts(expected, observed)
    counts = matrix.as_dict()
    return quality_score(counts["tp"], counts["tn"], counts["fp"], counts["fn"])
def compute_quality_for_corpus(corpus_dir):
    """Compute quality_score() for the predictions stored in a corpus.

    The classifications read from ``!truth.txt`` and ``!prediction.txt`` are
    copied into plain dicts, fed to a SPAM/OK confusion matrix, and the
    matrix's ``as_dict()`` counts are forwarded to ``quality_score`` as
    keyword arguments.
    """
    truth_path = os.path.join(corpus_dir, "!truth.txt")
    prediction_path = os.path.join(corpus_dir, "!prediction.txt")

    confusion = BinaryConfusionMatrix(pos_tag="SPAM", neg_tag="OK")
    truth = dict(read_classification_from_file(truth_path))
    predictions = dict(read_classification_from_file(prediction_path))
    confusion.compute_from_dicts(truth, predictions)

    return quality_score(**confusion.as_dict())
Exemple #5
0
def compute_quality_for_corpus(corpus_dir):
    """Compute ``quality_score`` for the predictions stored in ``corpus_dir``.

    Reads ``!truth.txt`` and ``!prediction.txt`` from the directory and
    compares them through a ``BinaryConfusionMatrix``; the matrix's
    ``as_dict()`` counts are passed to ``quality_score`` as keyword arguments.

    NOTE(review): ``pos_tag`` and ``neg_tag`` are not defined in this function;
    they are resolved from the enclosing scope - confirm they exist there.
    """
    def classification_from(file_name):
        return utils.read_classification_from_file(
            os.path.join(corpus_dir, file_name))

    truth = classification_from('!truth.txt')
    predicted = classification_from('!prediction.txt')
    matrix = BinaryConfusionMatrix(pos_tag, neg_tag)
    matrix.compute_from_dicts(truth, predicted)
    return quality_score(**matrix.as_dict())
Exemple #6
0
def compute_quality_for_corpus(corpus_dir):
    """Compute ``quality_score`` for the predictions stored in ``corpus_dir``.

    Reads ``corpus_dir + '/!truth.txt'`` and ``corpus_dir + '/!prediction.txt'``,
    obtains a confusion matrix from ``compute_confusion_matrix`` and passes its
    ``tp``, ``tn``, ``fp`` and ``fn`` attributes to ``quality_score``.
    """
    truth = utils.read_classification_from_file(corpus_dir + '/!truth.txt')
    predicted = utils.read_classification_from_file(
        corpus_dir + '/!prediction.txt')
    matrix = compute_confusion_matrix(truth, predicted)
    return quality_score(matrix.tp, matrix.tn, matrix.fp, matrix.fn)
Exemple #7
0
def compute_quality_for_corpus(corpus_dir):
    """
    Compute the quality of a filter from the predictions it stored in a corpus.

    :param corpus_dir: directory containing ``!truth.txt`` and ``!prediction.txt``
    :return: the value ``quality_score`` returns for the confusion-matrix counts
    """
    def load(file_name):
        return utils.read_classification_from_file(
            os.path.join(corpus_dir, file_name))

    expected = load('!truth.txt')
    predicted = load('!prediction.txt')

    # "SPAM" is the positive tag, "OK" the negative one.
    matrix = confmat.BinaryConfusionMatrix("SPAM", "OK")
    matrix.compute_from_dicts(expected, predicted)

    counts = matrix.as_dict()
    tp, tn, fp, fn = counts['tp'], counts['tn'], counts['fp'], counts['fn']
    return quality_score(tp, tn, fp, fn)
Exemple #8
0
def compute_quality_for_corpus(corpus_dir,
                               fn_weight=1,
                               fp_weight=10,
                               pos_tag="SPAM",
                               neg_tag="OK"):
    """Compute a weighted quality ratio for the predictions in ``corpus_dir``.

    Reads ``corpus_dir + "/!truth.txt"`` and ``corpus_dir + "/!prediction.txt"``,
    fills a confusion matrix for ``pos_tag``/``neg_tag``, prints the four counts
    and returns ``(tp + tn) / (fn_weight * fn + fp_weight * fp + tp + tn)``.

    Raises ZeroDivisionError when that denominator is zero.
    """
    matrix = confmat.BinaryConfusionMatrix(pos_tag, neg_tag)
    expected = read_classification_from_file(corpus_dir + "/!truth.txt")
    predicted = read_classification_from_file(corpus_dir + "/!prediction.txt")
    matrix.compute_from_dicts(expected, predicted)

    report = f"Fp: {matrix.fp}\nFn: {matrix.fn}\nTp: {matrix.tp}\nTn: {matrix.tn}"
    print(report)

    correct = matrix.tp + matrix.tn
    weighted_total = (
        fn_weight * matrix.fn + fp_weight * matrix.fp + matrix.tp + matrix.tn)
    return correct / weighted_total
Exemple #9
0
def compute_quality_for_corpus(corpus_dir):
    """Return quality_score() for the predictions stored in ``corpus_dir``.

    Both ``!truth.txt`` and ``!prediction.txt`` are read from the directory and
    copied into plain dicts before being compared in a SPAM/OK confusion
    matrix; the matrix's ``as_dict()`` counts become the keyword arguments of
    ``quality_score``.
    """
    confusion = BinaryConfusionMatrix(pos_tag="SPAM", neg_tag="OK")

    # Truth first, prediction second - the argument order of compute_from_dicts.
    loaded = [
        dict(read_classification_from_file(os.path.join(corpus_dir, name)))
        for name in ("!truth.txt", "!prediction.txt")
    ]
    confusion.compute_from_dicts(*loaded)

    counts = confusion.as_dict()
    return quality_score(**counts)
Exemple #10
0
def compute_quality_for_corpus(corpus_dir):
    '''
    Calculate the quality score of the predictions stored in a corpus.

    :param corpus_dir: path to the corpus directory holding TRUTH_FILE and
        PREDICTION_FILE
    :return: the value returned by quality_score for the confusion-matrix
        counts (presumably a number in 0 - 1; confirm against quality_score)
    '''
    expected = utils.read_classification_from_file(
        corpus_dir + "/" + TRUTH_FILE)
    predicted = utils.read_classification_from_file(
        corpus_dir + "/" + PREDICTION_FILE)

    matrix = confmat.BinaryConfusionMatrix(pos_tag=SPAM_TAG, neg_tag=HAM_TAG)
    matrix.compute_from_dicts(truth_dict=expected, pred_dict=predicted)

    counts = (matrix.tp, matrix.tn, matrix.fp, matrix.fn)
    return quality_score(*counts)
Exemple #11
0
    def train(self, dir):
        """Estimate a per-word spamicity from a labelled training directory.

        Labels are read from ``dir + "/!truth.txt"``. Every file in ``dir`` whose
        name does not start with ``"!"`` is read as UTF-8 text and tokenised
        with ``self.get_tokens``; each distinct token of an email increments
        ``self.spams[word]`` (label ``"SPAM"``) or ``self.hams[word]`` (any
        other label). ``self.spamicity[word]`` is then set for every word seen
        in either table.

        Assumes ``self.spams`` and ``self.hams`` yield 0 for unseen words
        (e.g. ``Counter``/``defaultdict(int)``) - TODO confirm.

        :param dir: path of the training directory
        :raises KeyError: if a file in ``dir`` has no entry in the truth file
        :raises ZeroDivisionError: if there are no emails at all, or if words
            were collected but one of the two classes has no emails
        """
        classification = utils.read_classification_from_file(dir +
                                                             "/!truth.txt")
        spam_total = 0
        ham_total = 0
        file_name_with_data = dict()

        for filename in os.listdir(dir):
            if filename.startswith("!"):
                continue
            # The context manager closes each handle right after reading;
            # a bare open() would leak one descriptor per email.
            with open(dir + "/" + filename, 'r', encoding="utf8") as f:
                file_name_with_data[filename] = f.read()

        for file_name, email_content in file_name_with_data.items():
            cls = classification[file_name]

            if cls == "SPAM":
                spam_total += 1
            else:
                ham_total += 1

            # set(): a word counts once per email, however often it occurs.
            for word in set(self.get_tokens(email_content)):
                if cls == "SPAM":
                    self.spams[word] += 1
                else:
                    self.hams[word] += 1

        spam_probability = spam_total / (spam_total + ham_total)
        ham_probability = 1 - spam_probability

        for word in (set(self.spams.keys()) | set(self.hams.keys())):
            spam_part = self.spams[word] / spam_total * spam_probability
            ham_part = self.hams[word] / ham_total * ham_probability
            self.spamicity[word] = spam_part / (spam_part + ham_part)
 def is_tag(self, ename, whichtag):
     """Return True when the truth file tags ``ename`` with ``whichtag``.

     The classification is re-read from ``self.path_to_mails + '/' + TRUTHFILE``
     on every call; a name missing from it raises KeyError.
     """
     truth_path = self.path_to_mails + '/' + TRUTHFILE
     tags = read_classification_from_file(truth_path)
     return bool(tags[ename] == whichtag)
Exemple #13
0
def test_atom_filter(initialized_filter, train_dir, test_dir):
    """Train a filter on ``train_dir`` and score its predictions on ``test_dir``.

    Each email of the test corpus is passed to ``filter.test``. A result of -1
    leaves the email without a prediction; a result above
    POSITIVITY_THRESHOLD is recorded as POSITIVE, anything else as NEGATIVE.
    The predictions are compared with the truth file of ``test_dir`` and the
    resulting tp/tn/fp/fn counts are printed and passed to ``quality_score``,
    whose result is returned.
    """
    train_corp = TrainingCorpus(train_dir)
    test_corp = Corpus(test_dir)

    atom_filter = initialized_filter
    atom_filter.train(train_corp)

    prediction = {}
    for name, mail in test_corp.emails():
        result = atom_filter.test(mail)
        if result == -1:
            continue
        prediction[name] = (
            POSITIVE if result > POSITIVITY_THRESHOLD else NEGATIVE)

    truth = read_classification_from_file(test_dir + '/' + TRUTHFILE)
    conf_matrix = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    conf_matrix.compute_from_dicts(truth, prediction)

    counts = conf_matrix.as_dict()
    # For testing purposes
    print(counts)

    return quality_score(counts['tp'], counts['tn'], counts['fp'],
                         counts['fn'])
Exemple #14
0
    def train(
        self,
        file_path,
        batch_size=10,
        learning_rate=0.1,
        lr_decay=0.05,
        epochs=1000,
        momentum=0.0,
        tuning=False
    ):
        """Train on the corpus in ``file_path`` using mini-batches.

        Analogous to PLR_filter. Emails come from ``Corpus(file_path).emails()``
        and labels from ``file_path + "/!truth.txt"``; an email is labelled 1
        when its truth entry equals ``self.pos_tag`` and 0 otherwise.

        :param file_path: corpus directory
        :param batch_size: maximum number of emails per batch
        :param learning_rate: initial rate passed to ``self.gradient_descent``
        :param lr_decay: after epoch ``e`` the rate is multiplied by
            ``1 / (1 + lr_decay * e)``
        :param epochs: number of passes over all batches
        :param momentum: forwarded unchanged to ``self.gradient_descent``
        :param tuning: when true, plot the mean loss per epoch with ``plt``
            after training
        """
        if tuning:
            og_lr = learning_rate  # original learning rate
            x_plt = []  # x-axis (epochs)
            y_plt = []  # y-axis (mean loss)
        corpus = Corpus(file_path)
        truth_dict = utils.read_classification_from_file(file_path +
                                                         "/!truth.txt")
        got_data = True
        mails_getter = corpus.emails()
        batches = []
        # Drain the email generator into batches of at most batch_size items.
        while got_data:
            batch = []
            for i in range(batch_size):
                try:
                    email = next(mails_getter)
                    # email[0] is used as the truth-dict key; email[1] is
                    # stored with its 0/1 label.
                    batch.append(
                        (email[1],
                         1 if truth_dict[email[0]] == self.pos_tag else 0))
                except StopIteration:
                    got_data = False
                    break
            # The batch being filled when the generator runs out is appended
            # too, so the last batch may be short or even empty.
            batches.append(batch)

        for e in range(epochs):
            if tuning:
                steps = 0
            print(learning_rate)
            self.init_momentums()
            loss = 0
            for batch in batches:
                batch_vectors = [(m[0].get_feature_vector_lr()) for m in batch]
                y = [m[1] for m in batch]
                # The return value of gradient_descent is accumulated as loss.
                loss += self.gradient_descent(y, batch_vectors, learning_rate,
                                              momentum)
                if tuning:
                    steps += 1
                # NOTE(review): this print sits inside the batch loop, so it
                # fires once per batch, not once per epoch.
                print(f"trained on epoch #{e +1}")
            # The decay compounds: the already-decayed rate is scaled again.
            learning_rate *= 1 / (1 + lr_decay * e)
            if tuning:
                y_plt.append(loss / steps)
                x_plt.append(e)
        if tuning:
            plt.plot(x_plt, y_plt)
            plt.title(
                f"lr:{og_lr} lrd:{lr_decay} bs:{batch_size} m: {momentum} e:{epochs}"
            )
            plt.xlabel("epochs")
            plt.ylabel("mean loss")
            plt.show()
Exemple #15
0
 def count_spams(email_adress):
     """Count the entries tagged 'SPAM' in a corpus truth file.

     ``email_adress`` is the corpus directory; the classification is read from
     ``email_adress + '/!truth.txt'``.
     """
     truth_path = str(email_adress + '/!truth.txt')
     truth = read_classification_from_file(truth_path)
     return sum(1 for name in truth if truth[name] == 'SPAM')
Exemple #16
0
 def is_ham(self, email_soubor):
     """Tell whether the truth file labels ``email_soubor`` as ham.

     The classification is read from ``self.email_adress + '/!truth.txt'`` on
     every call and ``email_soubor`` is looked up in it directly instead of
     scanning every entry.

     :param email_soubor: name of the email file to look up
     :return: True when the file is tagged 'OK', False when it is tagged
         'SPAM', None when it is not listed or carries any other tag
     """
     truth = read_classification_from_file(
         str(self.email_adress + '/!truth.txt'))
     tag = truth.get(email_soubor)
     if tag == 'SPAM':
         return False
     if tag == 'OK':
         return True
     # Unlisted file or unknown tag: no verdict.
     return None
Exemple #17
0
def compute_quality_for_corpus(corpus_dir):
    """Compute ``quality_score`` for the predictions stored in ``corpus_dir``.

    Reads TRUTHFILE and PREDFILE from the directory, compares them in a
    POSITIVE/NEGATIVE confusion matrix, prints the matrix counts and returns
    the result of ``quality_score`` for tp, tn, fp and fn.
    """
    def load(file_name):
        return read_classification_from_file(corpus_dir + '/' + file_name)

    expected = load(TRUTHFILE)
    predicted = load(PREDFILE)

    conf_matrix = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    conf_matrix.compute_from_dicts(expected, predicted)

    counts = conf_matrix.as_dict()
    # Testing purposes
    print(counts)

    return quality_score(counts['tp'], counts['tn'], counts['fp'],
                         counts['fn'])
 def assertPredictionFileExistsAndContainsClassificationFor(self, expected):
     """Assert that the prediction file exists and covers the expected files.

     Checks, in order: the file CORPUS_DIR/PREDICTION_FILENAME exists; its
     keys equal the keys of ``expected``; every value in it is SPAM_TAG or
     HAM_TAG.
     """
     prediction_path = os.path.join(CORPUS_DIR, PREDICTION_FILENAME)
     self.assertTrue(
         os.path.isfile(prediction_path),
         "The test() method did not create the !prediction.txt file.")

     observed = read_classification_from_file(prediction_path)
     expected_names = sorted(expected.keys())
     observed_names = sorted(observed.keys())
     self.assertEqual(
         expected_names, observed_names,
         'The !prediction.txt file does not contain decisions for the files it should.')

     allowed_tags = (SPAM_TAG, HAM_TAG)
     self.assertTrue(all(tag in allowed_tags for tag in observed.values()))
Exemple #19
0
 def assertPredictionFileExistsAndContainsClassificationFor(self, expected):
     """Custom assertion for the prediction file written to CORPUS_DIR.

     Fails unless the file exists, lists exactly the file names that are the
     keys of ``expected``, and uses only SPAM_TAG or HAM_TAG as values.
     """
     fpath = os.path.join(CORPUS_DIR, PREDICTION_FILENAME)
     file_exists = os.path.isfile(fpath)
     self.assertTrue(
         file_exists,
         "The test() method did not create the !prediction.txt file.")

     observed = read_classification_from_file(fpath)
     self.assertEqual(
         sorted(expected.keys()),
         sorted(observed.keys()),
         'The !prediction.txt file does not contain decisions for the files it should.')

     only_known_tags = all(
         decision in (SPAM_TAG, HAM_TAG) for decision in observed.values())
     self.assertTrue(only_known_tags)
    def test_correctlyFormattedFile(self):
        """A classification saved to FILENAME is read back unchanged."""
        # Arrange: write a known classification to disk.
        reference = create_classification()
        save_classification_to_file(reference, FILENAME)

        # Act: read it back with the function under test.
        loaded = read_classification_from_file(FILENAME)

        # Assert: both dictionaries match.
        self.assertDictEqual(
            reference,
            loaded,
            'The read file contents are not equal to the expected contents.')
    def test_returnEmptyDict_forEmptyFile(self):
        """Reading a file saved from an empty classification yields an empty dict."""
        # Arrange: save an empty classification.
        empty = {}
        save_classification_to_file(empty, FILENAME)

        # Act: read it back with the function under test.
        loaded = read_classification_from_file(FILENAME)

        # Assert: nothing was read.
        self.assertDictEqual(
            empty,
            loaded,
            'The read dictionary shall be empty for empty file.')
Exemple #22
0
    def test_correctlyFormattedFile(self):
        """A saved classification is read back unchanged under ``replaced_open``."""
        # Arrange: write a known classification to disk.
        reference = create_classification()
        save_classification_to_file(reference, FILENAME)

        # Act: only the read happens inside the replaced_open() context.
        with replaced_open():
            loaded = read_classification_from_file(FILENAME)

        # Assert: both dictionaries match.
        self.assertDictEqual(
            reference,
            loaded,
            'The read file contents are not equal to the expected contents.')
Exemple #23
0
def compute_quality_for_corpus(corpus_dir):
    """Compute ``quality_score`` for the predictions stored in ``corpus_dir``.

    Reads ``!truth.txt`` and ``!prediction.txt`` straight from the directory,
    compares them in a SPAM/OK confusion matrix and passes the tp, tn, fp and
    fn counts to ``quality_score``.

    The two files are opened by name rather than discovered by scanning the
    directory listing. The scan made the result depend on listing order: any
    other file with ``!`` in its name reset both classifications to None, and
    a missing file left a variable unbound. A missing file now surfaces as
    whatever error ``read_classification_from_file`` raises for it.

    :param corpus_dir: directory holding the two classification files
    :return: the value returned by ``quality_score``
    """
    truth_dict = read_classification_from_file(
        os.path.join(corpus_dir, "!truth.txt"))
    pred_dict = read_classification_from_file(
        os.path.join(corpus_dir, "!prediction.txt"))

    cm1 = BinaryConfusionMatrix(pos_tag="SPAM", neg_tag="OK")
    cm1.compute_from_dicts(truth_dict, pred_dict)
    final_dict = cm1.as_dict()
    return quality_score(final_dict['tp'], final_dict['tn'], final_dict['fp'],
                         final_dict['fn'])
Exemple #24
0
 def train(self, train_corpus_dir):
     """
     Train the filter on the given emails dataset.

     Stores the classification read for ``train_corpus_dir`` in ``self.truth``,
     then runs ``classify_part`` for the lower-cased name of every entry in
     ``email_keys`` and finally ``classify_payload``.

     :param train_corpus_dir: passed both to
         ``utils.read_classification_from_file`` and to ``Corpus``
     """
     # Load the truth.
     self.truth = utils.read_classification_from_file(train_corpus_dir)
     corpus = Corpus(train_corpus_dir)
     # Not in use now: the return value is discarded.
     self.get_SPAM_percentage(corpus)
     # Values for the parts of the email header.
     for header_key in email_keys:
         self.classify_part(corpus, header_key.lower())
     # Values for the email payload/body.
     self.classify_payload(corpus)
Exemple #25
0
    def test_returnEmptyDict_forEmptyFile(self):
        """An empty saved classification reads back as an empty dict."""
        # Arrange: save an empty classification.
        empty = {}
        save_classification_to_file(empty, FILENAME)

        # Act: read inside replaced_open() to insist on explicit use of encoding.
        with replaced_open():
            loaded = read_classification_from_file(FILENAME)

        # Assert: nothing was read.
        self.assertDictEqual(
            empty,
            loaded,
            'The read dictionary shall be empty for empty file.')
Exemple #26
0
 def train(self,
           file_path,
           batch_size=10,
           learning_rate=0.1,
           lr_decay=0.05,
           epochs=1000,
           momentum=0.0):
     """Train the per-subvector weights on the corpus in ``file_path``.

     Emails come from ``Corpus(file_path).emails()`` and labels from
     ``file_path + "/!truth.txt"``; an email is labelled 1 when its truth
     entry equals ``self.pos_tag`` and 0 otherwise.

     :param file_path: corpus directory
     :param batch_size: maximum number of emails per batch
     :param learning_rate: initial rate passed to ``self.gradient_descent``
     :param lr_decay: after epoch ``e`` the rate is multiplied by
         ``1 / (1 + lr_decay * e)``
     :param epochs: number of passes over all batches
     :param momentum: forwarded unchanged to ``self.gradient_descent``
     """
     corpus = Corpus(file_path)
     truth_dict = utils.read_classification_from_file(file_path +
                                                      "/!truth.txt")
     got_data = True
     mails_getter = corpus.emails()
     batches = []
     # loads all data from directory in batches of given size
     while got_data:
         batch = []
         # loads a batch of given size, a smaller one if out of data
         for i in range(batch_size):
             try:
                 email = next(mails_getter)
                 # email[0] is used as the truth-dict key; email[1] is stored
                 # with its 0/1 label.
                 batch.append(
                     (email[1],
                      1 if truth_dict[email[0]] == self.pos_tag else 0))
             except StopIteration:
                 got_data = False
                 break
         # The batch being filled when the generator runs out is appended
         # as well, so the last batch may be short or even empty.
         batches.append(batch)
     for e in range(epochs):  # trains multiple times on all batches
         self.init_momentums()
         for batch in batches:  # performs gradient descent on each batch
             # gets feature vectors for batch
             feature_vectors = [
                 (m[0].get_feature_vector_plr()) for m in batch
             ]  # gets feature vectors of the batch
             y = [m[1] for m in batch]  # gets the truth vector of the batch
             for i in range(
                     self.subvector_count
             ):  # weights for each subvector are trained separately
                 subvector_batch = [
                     v[i] for v in feature_vectors
                 ]  # isolates a subvector from all vectors
                 self.gradient_descent(i, y, subvector_batch, learning_rate,
                                       momentum)
             # NOTE(review): this print sits inside the batch loop, so it
             # fires once per batch, not once per epoch.
             print(f"trained on epoch #{e +1}")
         # The decay compounds: the already-decayed rate is scaled again.
         learning_rate *= 1 / (1 + lr_decay * e)
Exemple #27
0
    def train(self, path):
        """
        Collect word statistics from the emails dataset in ``path``.

        Stores the truth classification, the share of spam emails, a Counter
        over the joined spam and ham words, and the lengths of the spam-word,
        ham-word and all-word collections on the instance.

        :param path: directory with emails and ``!truth.txt``
        """
        self.truth_dict = read_classification_from_file(path + "/!truth.txt")
        corpus = TrainingCorpus(path)

        # Second argument selects spam (True) or ham (False) emails.
        spam_words, spam_email_count = self.list_spam_ham_words(corpus, True)
        ham_words, ham_email_count = self.list_spam_ham_words(corpus, False)

        email_count = spam_email_count + ham_email_count
        self.portion_of_spam_emails = spam_email_count / email_count

        joined_words = join_spam_and_ham_words(spam_words, ham_words)
        self.all_words = Counter(joined_words)

        self.num_of_spam_words = len(spam_words)
        self.num_of_ham_words = len(ham_words)
        self.num_of_all_words = len(self.all_words)
Exemple #28
0
    def train(self, file_path):
        """Build per-word spam and ham frequencies from the corpus in ``file_path``.

        Labels come from ``file_path + '/!truth.txt'``. For every email the
        words of ``content_no_html`` (split by ``self.string_to_words``) are
        counted into a spam or a ham Counter, depending on whether the label
        equals ``self.pos_tag``. Every word seen then gets 1 added to both
        counters and its frequency is stored as count / total words of that
        class. Finally the class probabilities are stored and ``self.trained``
        is set.

        Raises ZeroDivisionError when words were seen but one class
        contributed no words, or when the corpus yields no emails.
        """
        self.content_spam_dict = {}
        self.content_ham_dict = {}
        labels = utils.read_classification_from_file(file_path +
                                                     '/!truth.txt')
        corpus = Corpus(file_path)
        spam_counts = Counter()
        ham_counts = Counter()
        spam_word_total = 0
        ham_word_total = 0

        spam_count = 0
        ham_count = 0
        vocabulary = set()

        for mail in corpus.emails():
            words = self.string_to_words(mail[1].content_no_html)
            mail_counts = Counter(words)
            for word in words:
                vocabulary.add(word)

            if labels[mail[0]] == self.pos_tag:
                spam_count += 1
                spam_counts += mail_counts
                spam_word_total += len(words)
            else:
                ham_count += 1
                ham_counts += mail_counts
                ham_word_total += len(words)

        for word in vocabulary:
            # Add-one on the counts only; the totals are left as they are.
            ham_counts[word] += 1
            spam_counts[word] += 1
            self.content_spam_dict[word] = spam_counts[word] / spam_word_total
            self.content_ham_dict[word] = ham_counts[word] / ham_word_total

        email_total = spam_count + ham_count
        self.spam_probability = spam_count / email_total
        self.ham_probability = ham_count / email_total
        self.trained = True
Exemple #29
0
    def train(self, train_dir):
        """Compute a spaminess value for every word of the training emails.

        Labels are read from ``train_dir + '/!truth.txt'``. Each listed file is
        loaded with ``self.get_email`` and split with
        ``self.get_email_message``; its words update the spam or the ham
        counter, depending on whether the label equals
        ``self.decision_table[1]``, as well as the overall counter and the
        vocabulary. ``self.word_spaminess`` is then filled for the vocabulary.
        """
        self.train_files_dict = read_classification_from_file(train_dir +
                                                              '/!truth.txt')
        total_emails = len(self.train_files_dict)

        for file_name, tag in self.train_files_dict.items():
            mail = self.get_email(train_dir + '/' + file_name)
            mail_words = self.get_email_message(mail)
            unique_words = set(mail_words)
            # Counting spam and ham word appearances
            if tag == self.decision_table[1]:
                self.spam_words_counter.update(mail_words)
                self.total_spam_emails += 1
            else:
                self.ham_words_counter.update(mail_words)

            self.words_counter.update(mail_words)
            self.vocabulary.update(unique_words)

        self.total_ham_emails = total_emails - self.total_spam_emails
        # Computing the probability that a message containing a given word is spam.
        for word in self.vocabulary:
            spam_hits = self.spam_words_counter.get(word, 0)
            ham_hits = self.ham_words_counter.get(word, 0)
            if ham_hits == 0 and spam_hits > 0:
                # Seen only in spam.
                spaminess = 0.99
            elif ham_hits > 0 and spam_hits == 0:
                # Seen only in ham.
                spaminess = 0.01
            else:
                spam_likelihood = spam_hits / self.total_spam_emails
                ham_likelihood = ham_hits / self.total_ham_emails
                spaminess = max(
                    spam_likelihood / (spam_likelihood + ham_likelihood), 0.01)
            self.word_spaminess[word] = spaminess
Exemple #30
0
 def get_class(self, filename):
     """Return the truth-file entry for ``filename``.

     The classification is re-read from ``self.path + TRUTH`` on every call;
     a name missing from it raises KeyError.
     """
     truth_path = self.path + TRUTH
     classification = read_classification_from_file(truth_path)
     return classification[filename]
 def __init__(self, path_to_train):
     """Remember the training directory and load its ``!truth.txt``.

     Sets ``path_to_train``, ``path_to_truth`` (the joined path of the truth
     file) and ``truth_dic`` (the classification read from that file).
     """
     truth_path = os.path.join(path_to_train, '!truth.txt')
     self.path_to_train = path_to_train
     self.path_to_truth = truth_path
     self.truth_dic = utils.read_classification_from_file(truth_path)
 def train(self, training_corpus_path):
     """Label every file of the training truth file with ``self.table[1]``.

     The file names are taken from
     ``training_corpus_path + '/!truth.txt'`` and stored in
     ``self.dictionary``, each mapped to the constant ``self.table[1]``.

     :param training_corpus_path: directory containing ``!truth.txt``
     """
     truth = read_classification_from_file(training_corpus_path +
                                           '/!truth.txt')
     # dict.fromkeys returns a NEW dict and leaves its receiver untouched, so
     # the result has to be assigned; calling it as a bare statement does
     # nothing.
     self.dictionary = dict.fromkeys(truth, self.table[1])
 def train(self, training_corpus_path):
     """Assign a randomly chosen entry of ``self.table`` to every listed file.

     The file names are the keys of the classification read from
     ``training_corpus_path + '/!truth.txt'``; the result is stored in
     ``self.dictionary``.
     """
     truth_path = training_corpus_path + '/!truth.txt'
     self.dictionary = read_classification_from_file(truth_path)
     random_labels = {}
     for file_name in self.dictionary:
         random_labels[file_name] = choice(self.table)
     self.dictionary = random_labels
Exemple #34
0
 def __init__(self, path):
     """Initialise the base ``Corpus`` and load the truth file of ``path``.

     Stores ``path`` and the classification read from
     ``path + "/!truth.txt"`` (as ``truth_dict``) on the instance.
     """
     Corpus.__init__(self, path)
     self.path = path
     truth_path = self.path + "/!truth.txt"
     self.truth_dict = read_classification_from_file(truth_path)
 def get_class(self, filename):
     """Look up ``filename`` in the truth file at ``self.path + TRUTH``.

     The file is read again on each call; an unknown name raises KeyError.
     """
     labels = read_classification_from_file(self.path + TRUTH)
     label = labels[filename]
     return label
Exemple #36
0
 def train(self, path_to_training_corpus):
     """Store the classification read for ``path_to_training_corpus``.

     The value returned by ``utils.read_classification_from_file`` is kept in
     ``self.trained_data_dict``.
     """
     loaded = utils.read_classification_from_file(path_to_training_corpus)
     self.trained_data_dict = loaded
Exemple #37
0
 def get_truth_class(self):
     """Yield ``(body, truth_class)`` for every email of the corpus.

     The truth file is expected at ``os.path.join(self.path, TRUTH)``. Because
     this is a generator, the existence check runs when iteration starts, not
     when the method is called.

     NOTE(review): a missing file raises FileExistsError; FileNotFoundError
     looks like the intended type, but callers may already catch the current
     one - confirm before changing.
     """
     truth_path = os.path.join(self.path, TRUTH)
     if not os.path.isfile(truth_path):
         raise FileExistsError("File " + truth_path + " does not exists!")
     emails_class = utils.read_classification_from_file(truth_path)
     for name, body in self.emails():
         yield (body, emails_class[name])
Exemple #38
0
                        f.write(part + " " + key + " " +
                                str(self.classification[part][key]) + "\n")


def convert(file, out):
    """
    Swap the two fields of every line of a classification file.

    Used to convert truth files found on the internet, where the SPAM or HAM
    tag comes before the file name: each ``<tag> <name>`` line of ``file`` is
    written to ``out`` as ``<name> <tag>``. When a name occurs more than once,
    it is written once, with the tag of its last occurrence. Blank lines
    (including a trailing empty line) are skipped instead of aborting the
    conversion.

    :param file: path of the input file, read as UTF-8
    :param out: path of the output file, written as UTF-8
    :raises ValueError: if a non-blank line does not split into exactly two
        whitespace-separated fields
    """
    dic = {}
    with open(file, 'r', encoding="utf-8") as f:
        # Iterate the file lazily; there is no need to load every line first.
        for line in f:
            if not line.strip():
                continue
            key, val = line.split()
            dic[val] = key
    with open(out, 'w', encoding="utf-8") as f:
        f.writelines(name + " " + tag + "\n" for name, tag in dic.items())


if __name__ == "__main__":
    # Manual run used for testing and debugging: classify one data set,
    # print the quality of the predictions, then clean up.
    data_dir = "SPAM-data/2/"

    spam_filter = MyFilter()
    spam_filter.test(data_dir)

    matrix = BinaryConfusionMatrix('SPAM', 'OK')
    truth = utils.read_classification_from_file(data_dir)
    matrix.compute_from_dicts(truth, spam_filter.predictions)

    quality_percent = matrix.quality_score() * 100
    print("Quality: %.2f%%" % (quality_percent))
    utils.clean_up(data_dir)  # clean !truth