Example #1
def test_atom_filter(initialized_filter, train_dir, test_dir):
    train_corp = TrainingCorpus(train_dir)
    test_corp = Corpus(test_dir)

    filter = initialized_filter
    filter.train(train_corp)
    prediction = dict()

    for name, mail in test_corp.emails():
        result = filter.test(mail)
        if result == -1:
            continue
        elif result > POSITIVITY_THRESHOLD:
            prediction[name] = POSITIVE
        else:
            prediction[name] = NEGATIVE

    truth = read_classification_from_file(test_dir + '/' + TRUTHFILE)
    conf_matrix = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    conf_matrix.compute_from_dicts(truth, prediction)

    matrix_dict = conf_matrix.as_dict()
    # For testing purposes
    print(matrix_dict)

    score = quality_score(matrix_dict['tp'],
                          matrix_dict['tn'],
                          matrix_dict['fp'],
                          matrix_dict['fn'])
    return score
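Every example in this listing calls quality_score(tp, tn, fp, fn) without showing its definition. A minimal sketch, assuming the score is plain accuracy over the four counters (the actual grading function may weight false positives differently):

def quality_score(tp, tn, fp, fn):
    # Assumed definition: plain accuracy over all four counters.
    total = tp + tn + fp + fn
    return (tp + tn) / total if total else 0.0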
Example #2
def compute_quality_for_corpus(corpus_dir):
    path = os.getcwd()
    os.chdir(corpus_dir)
    truth_dict = read_classification_from_file('!truth.txt')
    pred_dict = read_classification_from_file('!prediction.txt')
    os.chdir(path)
    cm = BinaryConfusionMatrix(pos_tag='SPAM', neg_tag='OK')
    cm.compute_from_dicts(truth_dict, pred_dict)
    return quality_score(**cm.as_dict())
Example #3
def compute_quality_for_corpus(corpus_dir):
	""" Compute quality_score() for predictions in corpus """
	matrix = BinaryConfusionMatrix(pos_tag="SPAM", neg_tag="OK")

	matrix.compute_from_dicts(
		dict(read_classification_from_file(os.path.join(corpus_dir, "!truth.txt"))),
		dict(read_classification_from_file(os.path.join(corpus_dir, "!prediction.txt")))
	)

	return quality_score(**matrix.as_dict())
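Most of these examples rely on read_classification_from_file to turn !truth.txt or !prediction.txt into a dict mapping e-mail file names to labels. A minimal sketch, assuming each line holds "<file name> <label>" (the format also produced by convert() in Example #14):

def read_classification_from_file(path):
    # Assumed file format: one "<email file name> <label>" pair per line.
    classification = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                name, label = line.split()
                classification[name] = label
    return classification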
Example #4
def compute_quality_for_corpus(corpus_dir):
    truth_dict = utils.read_classification_from_file(
        os.path.join(corpus_dir, '!truth.txt'))
    prediction_dict = utils.read_classification_from_file(
        os.path.join(corpus_dir, '!prediction.txt'))
    # pos_tag and neg_tag are presumably module-level constants (e.g. 'SPAM' and 'OK')
    confusion_matrix = BinaryConfusionMatrix(pos_tag, neg_tag)
    confusion_matrix.compute_from_dicts(truth_dict, prediction_dict)

    conf_dict = confusion_matrix.as_dict()
    return quality_score(**conf_dict)
Example #5
def compute_quality_for_corpus(corpus_dir):
    truth_dic = methods.read_classification_from_file(methods.add_slash(corpus_dir) + "!truth.txt")
    pred_dic = methods.read_classification_from_file(methods.add_slash(corpus_dir) + "!prediction.txt")
    bc1 = BinaryConfusionMatrix('SPAM', 'OK')
    bc1.compute_from_dicts(truth_dic, pred_dic)
    dict_score = bc1.as_dict()
    fn = dict_score['fn']
    tn = dict_score['tn']
    fp = dict_score['fp']
    tp = dict_score['tp']
    return quality_score(tp, tn, fp, fn), tp, tn, fp, fn
Example #6
def compute_quality_for_corpus(corpus_dir):
    """ Compute quality_score() for predictions in corpus """
    matrix = BinaryConfusionMatrix(pos_tag="SPAM", neg_tag="OK")

    matrix.compute_from_dicts(
        dict(
            read_classification_from_file(
                os.path.join(corpus_dir, "!truth.txt"))),
        dict(
            read_classification_from_file(
                os.path.join(corpus_dir, "!prediction.txt"))))

    return quality_score(**matrix.as_dict())
Example #7
def compute_quality_for_corpus(corpus_dir):
    truth_clasf = read_classification_from_file(corpus_dir + '/' + TRUTHFILE)
    pred_clasf = read_classification_from_file(corpus_dir + '/' + PREDFILE)

    conf_matrix = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    conf_matrix.compute_from_dicts(truth_clasf, pred_clasf)

    matrix_dict = conf_matrix.as_dict()
    # Testing purposes
    print(matrix_dict)

    score = quality_score(matrix_dict['tp'],
                          matrix_dict['tn'],
                          matrix_dict['fp'],
                          matrix_dict['fn'])

    return score
Example #8
def compute_quality_for_corpus(corpus_dir):
    dirs = os.listdir(corpus_dir)
    truth_dict = None
    pred_dict = None

    # Only the two classification files are of interest; any other entry in
    # the directory is ignored (the original code reset both dicts to None
    # whenever it met another file containing '!').
    for file in dirs:
        if file == "!truth.txt":
            truth_dict = read_classification_from_file(corpus_dir + '/' + file)
        elif file == "!prediction.txt":
            pred_dict = read_classification_from_file(corpus_dir + '/' + file)

    cm1 = BinaryConfusionMatrix(pos_tag="SPAM", neg_tag="OK")
    cm1.compute_from_dicts(truth_dict, pred_dict)
    final_dict = cm1.as_dict()
    return quality_score(final_dict['tp'], final_dict['tn'], final_dict['fp'],
                         final_dict['fn'])
Example #9
def compute_quality_for_corpus(corpus_dir):
    '''Return the quality score for tested corpus (with truth and prediction files).'''
    from utils import read_classification_from_file as load_as_dict
    truth_file = '!truth.txt'
    pred_file = '!prediction.txt'
    truth_dict = load_as_dict(os.path.join(corpus_dir, truth_file))
    pred_dict = load_as_dict(os.path.join(corpus_dir, pred_file))
    
    from confmat import BinaryConfusionMatrix
    pos_tag = 'SPAM'
    neg_tag = 'OK'
    cm = BinaryConfusionMatrix(pos_tag, neg_tag)
    
    cm.compute_from_dicts(truth_dict, pred_dict)
    
    confusion_dict = cm.as_dict()
    tp = confusion_dict['tp']
    tn = confusion_dict['tn']
    fp = confusion_dict['fp']
    fn = confusion_dict['fn']
    
    return quality_score(tp, tn, fp, fn)
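Any of the compute_quality_for_corpus variants above is driven the same way; a hypothetical call, with the corpus directory name made up for illustration:

# 'spam-corpus/1' is a hypothetical directory containing e-mails plus
# !truth.txt and !prediction.txt.
score = compute_quality_for_corpus('spam-corpus/1')
print('quality: %.3f' % score)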
Example #10
def setUp(self):
    # Prepare fixture
    self.cm = BinaryConfusionMatrix(pos_tag=INI_SPAM_TAG,
                                    neg_tag=INI_HAM_TAG)
Example #11
class BinaryConfusionMatrixTest(unittest.TestCase):
    def setUp(self):
        # Prepare fixture
        self.cm = BinaryConfusionMatrix(pos_tag=INI_SPAM_TAG,
                                        neg_tag=INI_HAM_TAG)

    def test_countersAreZero_afterCreation(self):
        # Exercise the SUT
        cmdict = self.cm.as_dict()
        # Assert
        self.assertDictEqual(cmdict, {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0})

    def test_updatesTPcorrectly(self):
        # Exercise the SUT
        self.cm.update(SPAM_TAG, SPAM_TAG)
        # Assert
        self.assertDictEqual(self.cm.as_dict(), {
            'tp': 1,
            'tn': 0,
            'fp': 0,
            'fn': 0
        })

    def test_updatesTNcorrectly(self):
        # Exercise the SUT
        self.cm.update(HAM_TAG, HAM_TAG)
        # Assert
        self.assertDictEqual(self.cm.as_dict(), {
            'tp': 0,
            'tn': 1,
            'fp': 0,
            'fn': 0
        })

    def test_updatesFPcorrectly(self):
        # Exercise the SUT
        self.cm.update(HAM_TAG, SPAM_TAG)
        # Assert
        self.assertDictEqual(self.cm.as_dict(), {
            'tp': 0,
            'tn': 0,
            'fp': 1,
            'fn': 0
        })

    def test_updatesFNcorrectly(self):
        # Exercise the SUT
        self.cm.update(SPAM_TAG, HAM_TAG)
        # Assert
        self.assertDictEqual(self.cm.as_dict(), {
            'tp': 0,
            'tn': 0,
            'fp': 0,
            'fn': 1
        })

    def test_update_raisesValueError_forWrongTruthValue(self):
        # This test may be ignored (deleted).
        # It tests an additional feature of the BCF class.

        # Assert and exercise the SUT
        with self.assertRaises(ValueError):
            self.cm.update('a bad value', SPAM_TAG)

    def test_update_raisesValueError_forWrongPredictionValue(self):
        # This test may be ignored (deleted).
        # It tests an additional feature of the BCF class.

        # Assert and exercise the SUT
        with self.assertRaises(ValueError):
            self.cm.update(SPAM_TAG, 'a bad value')

    def test_computeFromDicts_allCasesOnce(self):
        # Prepare fixture
        truth = {1: SPAM_TAG, 2: SPAM_TAG, 3: HAM_TAG, 4: HAM_TAG}
        prediction = {1: SPAM_TAG, 2: HAM_TAG, 3: SPAM_TAG, 4: HAM_TAG}
        # Exercise the SUT
        self.cm.compute_from_dicts(truth, prediction)
        # Assert
        self.assertDictEqual(self.cm.as_dict(), {
            'tp': 1,
            'tn': 1,
            'fp': 1,
            'fn': 1
        })
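The tests above pin down the expected behaviour of BinaryConfusionMatrix: counters start at zero, update(truth, prediction) increments exactly one of tp/tn/fp/fn, unknown tags raise ValueError, and compute_from_dicts replays update over paired dictionaries. A minimal sketch that satisfies these tests (an assumption, not the reference implementation):

class BinaryConfusionMatrix:
    """Minimal confusion matrix for one positive and one negative tag."""

    def __init__(self, pos_tag, neg_tag):
        self.pos_tag = pos_tag
        self.neg_tag = neg_tag
        self.tp = self.tn = self.fp = self.fn = 0

    def as_dict(self):
        return {'tp': self.tp, 'tn': self.tn, 'fp': self.fp, 'fn': self.fn}

    def update(self, truth, prediction):
        # Reject anything other than the two configured tags.
        if truth not in (self.pos_tag, self.neg_tag):
            raise ValueError('unknown truth value: %r' % (truth,))
        if prediction not in (self.pos_tag, self.neg_tag):
            raise ValueError('unknown prediction value: %r' % (prediction,))
        if truth == self.pos_tag:
            if prediction == self.pos_tag:
                self.tp += 1
            else:
                self.fn += 1
        else:
            if prediction == self.pos_tag:
                self.fp += 1
            else:
                self.tn += 1

    def compute_from_dicts(self, truth_dict, prediction_dict):
        # Every item in the truth dictionary must have a prediction.
        for name, truth in truth_dict.items():
            self.update(truth, prediction_dict[name])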
Example #12
def setUp(self):
    # Prepare fixture
    self.cm = BinaryConfusionMatrix(pos_tag=SPAM_TAG, neg_tag=HAM_TAG)
Example #13
class BinaryConfusionMatrixTest(unittest.TestCase):
 
    def setUp(self):
        # Prepare fixture
        self.cm = BinaryConfusionMatrix(pos_tag=SPAM_TAG, neg_tag=HAM_TAG)
 
    def test_countersAreZero_afterCreation(self):
        # Exercise the SUT
        cmdict = self.cm.as_dict()
        # Assert
        self.assertDictEqual(cmdict, {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0})

    def test_updatesTPcorrectly(self):
        # Exercise the SUT
        self.cm.update(SPAM_TAG, SPAM_TAG)
        # Assert
        self.assertDictEqual(self.cm.as_dict(),
                             {'tp': 1, 'tn': 0, 'fp': 0, 'fn': 0})
        
    def test_updatesTNcorrectly(self):
        # Exercise the SUT
        self.cm.update(HAM_TAG, HAM_TAG)
        # Assert
        self.assertDictEqual(self.cm.as_dict(),
                             {'tp': 0, 'tn': 1, 'fp': 0, 'fn': 0})  
        
    def test_updatesFPcorrectly(self):
        # Exercise the SUT
        self.cm.update(HAM_TAG, SPAM_TAG)
        # Assert
        self.assertDictEqual(self.cm.as_dict(),
                             {'tp': 0, 'tn': 0, 'fp': 1, 'fn': 0})  
        
    def test_updatesFNcorrectly(self):
        # Exercise the SUT
        self.cm.update(SPAM_TAG, HAM_TAG)
        # Assert
        self.assertDictEqual(self.cm.as_dict(),
                             {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 1})
        
    #def test_update_raisesValueError_forWrongTruthValue(self):
        # This test may be ignored (deleted). 
        # It tests an additional feature of the BCF class.
        
        # Assert and exercise the SUT
    #    with self.assertRaises(ValueError):
    #        self.cm.update('a bad value', SPAM_TAG)
 
    #def test_update_raisesValueError_forWrongPredictionValue(self):
        # This test may be ignored (deleted). 
        # It tests an additional feature of the BCF class.
        
        # Assert and exercise the SUT
    #    with self.assertRaises(ValueError):
    #        self.cm.update(SPAM_TAG, 'a bad value')
            
    def test_computeFromDicts_allCasesOnce(self):
        # Prepare fixture
        truth = {1: SPAM_TAG,
                 2: SPAM_TAG,
                 3: HAM_TAG,
                 4: HAM_TAG}
        prediction = {1: SPAM_TAG,
                      2: HAM_TAG,
                      3: SPAM_TAG,
                      4: HAM_TAG}
        # Exercise the SUT
        self.cm.compute_from_dicts(truth, prediction)
        # Assert
        self.assertDictEqual(self.cm.as_dict(),
                             {'tp': 1, 'tn': 1, 'fp': 1, 'fn': 1})    
Example #14
                        # Tail of a larger method that writes each entry of
                        # self.classification as "<part> <key> <value>" lines.
                        f.write(part + " " + key + " " +
                                str(self.classification[part][key]) + "\n")


def convert(file, out):
    """
    I used this to convert the truth file for emails I found on internet, where the SPAM or HAM was before file name
    :param file:
    :param out:
    """
    dic = {}
    with open(file, 'r', encoding="utf-8") as f:
        for line in f.readlines():
            key, val = line.split()
            dic[val] = key
    with open(out, 'w', encoding="utf-8") as f:
        for key in dic:
            f.write(key + " " + dic[key] + "\n")


if __name__ == "__main__":
    # used for testing and debugging
    filter = MyFilter()
    filter.test("SPAM-data/2/")
    confusion_matrix = BinaryConfusionMatrix('SPAM', 'OK')
    confusion_matrix.compute_from_dicts(
        utils.read_classification_from_file("SPAM-data/2/"),
        filter.predictions)
    print("Quality: %.2f%%" % (confusion_matrix.quality_score() * 100))
    utils.clean_up("SPAM-data/2/")  # clean !truth
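For reference, a hypothetical call to convert() above; both file names are made up for illustration:

# The downloaded truth file has the label first, e.g. "SPAM 0001.eml";
# convert() rewrites each line as "0001.eml SPAM", the format the other
# examples expect.  Both file names here are hypothetical.
convert("downloaded_truth.txt", "!truth.txt")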