Ejemplo n.º 1
0
def main():

    data_sets_dir = "C:\\Users\\Alex\\Downloads\\Data Sets"
    set_dirs = ["DictionarySets-1.1", "DictionarySets-1.2", "DictionarySets-2.1", "DictionarySets-2.2",
                "DictionarySets-3.1", "Mislabeled-Big", "Mislabeled-Both-1.1", "Mislabeled-Both-1.2",
                "Mislabeled-Both-2.1", "Mislabeled-Both-2.2", "Mislabeled-Both-3.1", "Mislabeled-HtoS-1.1",
                "Mislabeled-HtoS-1.2", "Mislabeled-HtoS-1.3", "Mislabeled-HtoS-1.4", "Mislabeled-HtoS-1.5",
                "Mislabeled-StoH-1.1", "Mislabeled-StoH-1.2", "Mislabeled-StoH-1.3", "Mislabeled-StoH-2.1",
                "Mislabeled-StoH-2.2"]

    hams = [seterize(data_sets_dir, set_dir, False, 3) for set_dir in set_dirs]
    spams = [seterize(data_sets_dir, set_dir, True, 3) for set_dir in set_dirs]

    assert(len(hams) == len(spams))
    sets = [0]

    for i in sets:
        ham = hams[i]
        spam = spams[i]

        au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham[1], [ham[1]]),
                                                  msgs.HamStream(ham[2], [ham[2]])],        # Training Ham
                                                 [msgs.SpamStream(spam[1], [spam[1]]),
                                                  msgs.SpamStream(spam[2], [spam[2]])],     # Training Spam
                                                 msgs.HamStream(ham[0], [ham[0]]),          # Testing Ham
                                                 msgs.SpamStream(spam[0], [spam[0]]),       # Testing Spam
                                                 )

        print "Cluster list:\n"
        outfile = open("C:\\Users\\Alex\\Desktop\\cluster_au.txt", 'w')
        cluster_list = ActiveUnlearnDriver.cluster_au(au, gold=True, test=True)
        print cluster_list
        outfile.write(cluster_list)
        outfile.close()
Ejemplo n.º 2
0
def main():
    num_data_sets = len(hams)
    assert(len(hams) == len(spams))
    sets = [0]

    for i in sets:
        ham = hams[i]
        spam = spams[i]

        ham_test = ham[0]
        spam_test = spam[0]

        ham_train = ham[1]
        spam_train = spam[1]

        ham_p = ham[2]
        spam_p = spam[2]

        try:
            au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham_train, [ham_train]),
                                                      msgs.HamStream(ham_p, [ham_p])],        # Training Ham
                                                     [msgs.SpamStream(spam_train, [spam_train]),
                                                      msgs.SpamStream(spam_p, [spam_p])],     # Training Spam
                                                     msgs.HamStream(ham_test, [ham_test]),          # Testing Ham
                                                     msgs.SpamStream(spam_test, [spam_test]),       # Testing Spam
                                                     )

            print "Unlearning..."
            cluster = ProxyCluster(au.driver.tester.train_examples[2])
            au.unlearn(cluster)
            """
            time_1 = time.time()
            for i in range(10):
                au.init_ground(update=False)
            time_2 = time.time()
            avg_no_update = float(time_2 - time_1) / 10
            no_update_rate = au.driver.tester.correct_classification_rate()
            time_3 = time.time()
            for i in range(10):
                au.init_ground(update=True)
            time_4 = time.time()
            avg_update = float(time_4 - time_3) / 10
            update_rate = au.driver.tester.correct_classification_rate()
            print "Average test without update: " + str(avg_no_update)
            print "Average test with update: " + str(avg_update)
            print "Detection rate without update: " + str(no_update_rate)
            print "Detection rate with update: " + str(update_rate)
            """
            au.init_ground(update=False)
            au.init_ground(update=True)
            au.driver.tester.correct_classification_rate()
        except KeyboardInterrupt:
            sys.exit()
Ejemplo n.º 3
0
def main():
    """Run the polluted vs. vanilla unlearning comparison for data set 10 and
    write per-cluster statistics and a feature summary to disk.

    Relies on module-level data/helpers: hams, spams, set_dirs,
    dir_enumerate, ActiveUnlearnDriver, msgs, stats, seconds_to_english,
    print_cluster_pollution, time.
    """
    sets = [10]
    dest = "C:/Users/bzpru/Desktop/spambayes-1.1a6/unpollute_stats/Yang_Data_Sets (cluster features)/"

    for i in sets:
        ham = hams[i]
        spam = spams[i]
        data_set = set_dirs[i]

        # For data sets past index 10 the role of set 1 / set 2 is swapped:
        # set 2 ([1]) is the test partition and set 1 ([0]) the training one.
        if i > 10:
            ham_test = ham[1]
            spam_test = spam[1]

            ham_train = ham[0]
            spam_train = spam[0]

        else:
            ham_test = ham[0]
            spam_test = spam[0]

            ham_train = ham[1]
            spam_train = spam[1]

        # Pollution partition (set 3).
        ham_p = ham[2]
        spam_p = spam[2]

        # Email counts per partition, used for the stats report below.
        ham_polluted = dir_enumerate(ham_p)
        spam_polluted = dir_enumerate(spam_p)
        train_ham = dir_enumerate(ham_train)
        train_spam = dir_enumerate(spam_train)
        test_ham = dir_enumerate(ham_test)
        test_spam = dir_enumerate(spam_test)
        total_polluted = ham_polluted + spam_polluted
        total_unpolluted = train_ham + train_spam

        time_1 = time.time()
        # Polluted unlearner: trains on train + pollution partitions.
        p_au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham_train, [ham_train]),
                                                   msgs.HamStream(ham_p, [ham_p])],        # Training Ham
                                                   [msgs.SpamStream(spam_train, [spam_train]),
                                                   msgs.SpamStream(spam_p, [spam_p])],     # Training Spam
                                                   msgs.HamStream(ham_test, [ham_test]),          # Testing Ham
                                                   msgs.SpamStream(spam_test, [spam_test]),       # Testing Spam
                                                   distance_opt="inv-match", all_opt=True,
                                                   update_opt="hybrid", greedy_opt=False)

        # Vanilla unlearner: same training data but no pollution partition;
        # serves as the clean baseline for the comparison.
        v_au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham_train, [ham_train]), []],
                                                   [msgs.SpamStream(spam_train, [spam_train]), []],
                                                   msgs.HamStream(ham_test, [ham_test]),
                                                   msgs.SpamStream(spam_test, [spam_test]))

        vanilla_detection_rate = v_au.current_detection_rate
        time_2 = time.time()
        train_time = seconds_to_english(time_2 - time_1)
        print "Train time:", train_time, "\n"

        # stats() both writes the unlearn report and returns the cluster list
        # that the second report below summarizes.
        with open(dest + data_set + " (unlearn_stats).txt", 'w') as outfile:
            cluster_list = stats(p_au, outfile, data_set, [train_ham, train_spam], [test_ham, test_spam],
                                 [ham_polluted, spam_polluted], total_polluted, total_unpolluted, train_time,
                                 vanilla=[vanilla_detection_rate, v_au], clusters=True)

        with open(dest + data_set + " (Separate Features).txt", 'w') as outfile:
            outfile.write("---------------------------\n")
            outfile.write("Data Set: " + data_set + "\n")
            outfile.write("Vanilla Training: " + str(train_ham) + " ham and " + str(train_spam) + " spam.\n")
            outfile.write("Testing: " + str(test_ham) + " ham and " + str(test_spam) + " spam.\n")
            outfile.write("Pollution Training: " + str(ham_polluted) + " ham and " + str(spam_polluted) +
                          " spam.\n")
            outfile.write("---------------------------\n")
            outfile.write("\n\n")
            print_cluster_pollution(outfile, cluster_list)

        # In the hopes of keeping RAM down between iterations
        del p_au
        del v_au
Ejemplo n.º 4
0
def main():

    data_sets_dir = "C:\\Users\\Alex\\Downloads\\Data Sets"
    set_dirs = ["Mislabeled-Big"]

    hams = [seterize(data_sets_dir, set_dir, False, 3) for set_dir in set_dirs]
    spams = [seterize(data_sets_dir, set_dir, True, 3) for set_dir in set_dirs]

    num_data_sets = len(hams)
    assert (len(hams) == len(spams))

    for i in range(num_data_sets):
        ham = hams[i]
        spam = spams[i]

        ham_polluted = dir_enumerate(ham[2])
        spam_polluted = dir_enumerate(spam[2])
        train_ham = dir_enumerate(ham[1])
        train_spam = dir_enumerate(spam[1])
        test_ham = dir_enumerate(ham[0])
        test_spam = dir_enumerate(spam[0])
        total_polluted = ham_polluted + spam_polluted

        try:
            time_1 = time.time()
            au = ActiveUnlearnDriver.ActiveUnlearner(
                [
                    msgs.HamStream(ham[1], [ham[1]]),
                    msgs.HamStream(ham[2], [ham[2]])
                ],  # Training Ham
                [
                    msgs.SpamStream(spam[1], [spam[1]]),
                    msgs.SpamStream(spam[2], [spam[2]])
                ],  # Training Spam
                msgs.HamStream(ham[0], [ham[0]]),  # Testing Ham
                msgs.SpamStream(spam[0], [spam[0]]),  # Testing Spam
            )

            time_2 = time.time()
            train_time = time_2 - time_1
            print "Train time:", train_time, "\n"

            with open("C:\\Users\\Alex\\Desktop\\unpollute_stats\\big_yang_" + str(i + 1)
                      + ".txt", 'w') \
                    as outfile:
                try:
                    outfile.write("---------------------------\n")
                    outfile.write("Data Set: " + set_dirs[i] + "\n")
                    outfile.write("Vanilla Training: " + str(train_ham) +
                                  " ham and " + str(train_spam) + " spam.\n")
                    outfile.write("Testing: " + str(test_ham) + " ham and " +
                                  str(test_spam) + " spam.\n")
                    outfile.write("Pollution Training: " + str(ham_polluted) +
                                  " ham and " + str(spam_polluted) +
                                  " spam.\n")
                    outfile.write("---------------------------\n")
                    outfile.write("\n\n")
                    outfile.write("CLUSTER AND RATE COUNTS:\n")
                    outfile.write("---------------------------\n")

                    original_detection_rate = au.driver.tester.correct_classification_rate(
                    )

                    outfile.write("0: " + str(original_detection_rate) + "\n")

                    time_start = time.time()
                    cluster_list = au.greatest_impact_active_unlearn(
                        outfile, test=True, pollution_set3=True, gold=True)
                    time_end = time.time()
                    unlearn_time = time_end - time_start
                    total_polluted_unlearned = 0
                    total_unlearned = 0
                    total_unpolluted_unlearned = 0
                    final_detection_rate = au.current_detection_rate

                    print "\nTallying up final counts...\n"
                    for cluster in cluster_list:
                        cluster = cluster[1]
                        total_unlearned += cluster.size
                        total_polluted_unlearned += cluster.target_set3()
                        total_unpolluted_unlearned += (cluster.size -
                                                       cluster.target_set3())

                    outfile.write("\nSTATS\n")
                    outfile.write("---------------------------\n")
                    outfile.write("Initial Detection Rate: " +
                                  str(original_detection_rate) + "\n")
                    outfile.write("Final Detection Rate: " +
                                  str(final_detection_rate) + "\n")
                    outfile.write("Total Unlearned:\n")
                    outfile.write(str(total_unlearned) + "\n")
                    outfile.write("Polluted Percentage of Unlearned:\n")
                    outfile.write(
                        str(
                            float(total_polluted_unlearned) /
                            float(total_unlearned)) + "\n")
                    outfile.write("Unpolluted Percentage of Unlearned:\n")
                    outfile.write(
                        str(
                            float(total_unpolluted_unlearned) /
                            float(total_unlearned)) + "\n")
                    outfile.write("Percentage of Polluted Unlearned:\n")
                    outfile.write(
                        str(
                            float(total_polluted_unlearned) /
                            float(total_polluted)) + "\n")
                    outfile.write("Time for training:\n")
                    outfile.write(str(train_time) + "\n")
                    outfile.write("Time for unlearning:\n")
                    outfile.write(str(unlearn_time))

                except KeyboardInterrupt:
                    outfile.flush()
                    os.fsync(outfile)
                    sys.exit()

        except KeyboardInterrupt:
            sys.exit()
Ejemplo n.º 5
0
def main():
    import os
    import sys
    from random import choice

    sys.path.insert(-1, os.getcwd())
    sys.path.insert(-1, os.path.dirname(os.getcwd()))

    from spambayes import ActiveUnlearnDriver
    from spambayes.Options import get_pathname_option
    from spambayes import msgs
    """
    from dictionarywriter import DictionaryWriter
    """

    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]
    """
    DictionaryWriter(600).write()
    """

    keep_going = True
    trial_number = 1

    au_v = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[1], [ham[1]]),
         msgs.HamStream(ham[2], [ham[2]])],
        [
            msgs.SpamStream(spam[1], [spam[1]]),
            msgs.SpamStream(spam[3], [spam[3]])
        ],
        msgs.HamStream(ham[0], [ham[0]]),
        msgs.SpamStream(spam[0], [spam[0]]),
    )
    while keep_going:
        msg = choice(au_v.driver.tester.train_examples[0])
        try:
            test_cl, counter = au_v.determine_cluster(msg)
            test_size = test_cl.size
            au_v.learn(test_cl)

        except TypeError:
            counter = 1
            test_size = "100, but fail"

        cluster_detection_rates_v = []
        cluster_spam_rates_v = []
        cluster_sizes = []

        au_v.init_ground()
        original_rate_v = au_v.driver.tester.correct_classification_rate()
        cluster_size = 100
        cluster_sizes.append(100)

        print "Clustering with size", cluster_size, "..."

        cl_v = ActiveUnlearnDriver.Cluster(msg, cluster_size, au_v, "extreme")
        cluster_spam_rates_v.append(
            float(cl_v.target_spam()) / float(cluster_size))
        cluster_detection_rates_v.append(au_v.start_detect_rate(cl_v))

        for i in range(1, counter + 2):
            cluster_size += 100
            cluster_sizes.append(cluster_size)

            print "Clustering with size", cluster_size, "..."

            cluster_detection_rates_v.append(
                au_v.continue_detect_rate(cl_v, 100))
            cluster_spam_rates_v.append(
                float(cl_v.target_spam()) / float(cluster_size))

        with open(
                "C:\Users\Alex\Desktop\det_cluster_stats_v" +
                str(trial_number) + ".txt", 'w') as outfile:
            outfile.write("VANILLA MACHINE\n")

            outfile.write("--------------------------\n")

            outfile.write("Clustered around: " + msg.tag + "\n")

            outfile.write("--------------------------\n")

            outfile.write("Detection Rates:\n")
            outfile.write(str(original_rate_v) + "\n")

            for item in cluster_detection_rates_v:
                outfile.write(str(item) + "\n")

            outfile.write("--------------------------\n")

            outfile.write("Spam Rate:\n")
            for item in cluster_spam_rates_v:
                outfile.write(str(item) + "\n")

            outfile.write("Test Cluster Size:\n")
            outfile.write(str(test_size))

        answer = raw_input("Keep going (y/n)? You have performed " +
                           str(trial_number) + " trials so far. ")

        if answer == "n":
            keep_going = False

        else:
            au_v.learn(cl_v)
            au_v.init_ground()
            trial_number += 1
Ejemplo n.º 6
0
def main():

    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]
    keep_going = True
    trial_number = 1

    try:
        time_1 = time.time()
        au = ActiveUnlearnDriver.ActiveUnlearner(
            [
                msgs.HamStream(ham[1], [ham[1]]),
                msgs.HamStream(ham[2], [ham[2]])
            ],  # Training Ham
            [
                msgs.SpamStream(spam[1], [spam[1]]),
                msgs.SpamStream(spam[2], [spam[2]])
            ],  # Training Spam
            msgs.HamStream(ham[0], [ham[0]]),  # Testing Ham
            msgs.SpamStream(spam[0], [spam[0]]),  # Testing Spam
        )

        time_2 = time.time()
        train_time = time_2 - time_1
        print "Train time:", train_time, "\n"
        while keep_going:
            with open("C:\\Users\\Alex\\Desktop\\unpollute_stats\\unlearn_stats" + str(trial_number) + ".txt", 'w') \
                    as outfile:
                try:
                    outfile.write("CLUSTER AND RATE COUNTS:\n")
                    outfile.write("---------------------------\n")

                    original_detection_rate = au.driver.tester.correct_classification_rate(
                    )

                    outfile.write("0: " + str(original_detection_rate) + "\n")

                    time_start = time.time()
                    cluster_list = au.brute_force_active_unlearn(
                        outfile,
                        test=True,
                        center_iteration=False,
                        pollution_set3=True,
                        gold=True)
                    time_end = time.time()
                    unlearn_time = time_end - time_start
                    total_polluted_unlearned = 0
                    total_unlearned = 0
                    total_unpolluted_unlearned = 0
                    final_detection_rate = au.current_detection_rate

                    print "\nTallying up final counts...\n"
                    for cluster in cluster_list:
                        total_unlearned += cluster.size
                        total_polluted_unlearned += cluster.target_set3()
                        total_unpolluted_unlearned += (cluster.size -
                                                       cluster.target_set3())

                    outfile.write("\nSTATS\n")
                    outfile.write("---------------------------\n")
                    outfile.write("Initial Detection Rate: " +
                                  str(original_detection_rate) + "\n")
                    outfile.write("Final Detection Rate: " +
                                  str(final_detection_rate) + "\n")
                    outfile.write("Total Unlearned:\n")
                    outfile.write(str(total_unlearned) + "\n")
                    outfile.write("Polluted Percentage of Unlearned:\n")
                    outfile.write(
                        str(
                            float(total_polluted_unlearned) /
                            float(total_unlearned)) + "\n")
                    outfile.write("Unpolluted Percentage of Unlearned:\n")
                    outfile.write(
                        str(
                            float(total_unpolluted_unlearned) /
                            float(total_unlearned)) + "\n")
                    outfile.write("Percentage of Polluted Unlearned:\n")
                    outfile.write(
                        str(float(total_polluted_unlearned) / 1200) + "\n")
                    outfile.write("Time for training:\n")
                    outfile.write(str(train_time) + "\n")
                    outfile.write("Time for unlearning:\n")
                    outfile.write(str(unlearn_time))

                except KeyboardInterrupt:
                    outfile.flush()
                    os.fsync(outfile)
                    """
                    m.reset()
                    """
                    sys.exit()

            answer = raw_input("\nKeep going (y/n)? You have performed " +
                               str(trial_number) + " trial(s) so far. ")
            valid_input = False

            while not valid_input:
                if answer == "n":
                    keep_going = False
                    valid_input = True

                elif answer == "y":
                    for cluster in cluster_list:
                        au.learn(cluster)
                    au.init_ground()
                    trial_number += 1
                    valid_input = True

                else:
                    answer = raw_input("Please enter either y or n. ")

    except KeyboardInterrupt:
        """
        m.reset()
        """
        sys.exit()
Ejemplo n.º 7
0
def main():

    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 4)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 4)
    ]
    injected = get_pathname_option("TestDriver", "spam_directories") % 3

    au = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[0], [ham[0]]),
         msgs.HamStream(ham[2], [ham[2]])], [
             msgs.SpamStream(spam[0], [spam[0]]),
             msgs.SpamStream(spam[2], [spam[2]])
         ], msgs.HamStream(ham[1], [ham[1]]),
        msgs.SpamStream(spam[1], [spam[1]]))

    msg = choice(
        au.driver.tester.train_examples[2])  # Randomly chosen from Ham Set3

    original_rate = au.driver.tester.correct_classification_rate()
    cluster_sizes = []
    detection_rates = []
    target_cluster_rates = []

    sizes = []
    for i in range(150, 1050, 50):
        sizes.append(i)
    for i in range(1000, 15000, 1000):
        sizes.append(i)

    for size in sizes:
        cluster = ActiveUnlearnDriver.Cluster(msg, size, au, "extreme")
        print "Clustering with size " + str(cluster.size) + "..."
        cluster_sizes.append(size)
        detection_rates.append(au.detect_rate(cluster))
        target_cluster_rates.append(
            float(cluster.target_set3()) / float(cluster.size))

    file = open("/Users/AlexYang/Desktop/clues.txt", 'w')

    features = au.driver.classifier._getclues(msg)
    i = 1
    for feature in features:
        file.write(str(i) + ") ")
        file.write(str(feature) + "\n")
        i += 1

    with open("/Users/AlexYang/Desktop/clusterstats.txt", 'w') as outfile:

        outfile.write("Clustered around: " + msg.tag)
        outfile.write("\nOriginal Rate: " + str(original_rate) + "\n")

        outfile.write(
            tabulate(
                {
                    "Cluster Sizes": cluster_sizes,
                    "Detection Rates": detection_rates,
                    "% of Targets Clustered": target_cluster_rates
                },
                headers="keys",
                tablefmt="plain"))
Ejemplo n.º 8
0
def main():
    # sets = [11,12,13,14,15] # mislabeled_both_small
    sets = [17, 20]
    # sets = [16,17,18,19,20,21] # mislabeled_both_big
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-cv',
        '--cross',
        type=str,
        help="partition test set into T1 and T2 for cross-validation",
        choices=['random', 'features', 'mislabeled'],
        default=None)
    parser.add_argument('-f',
                        '--features',
                        nargs='*',
                        help="what features to split into T2",
                        default=None)
    parser.add_argument('-d',
                        '--dest',
                        type=str,
                        help="choose alternate destination for output file")
    parser.add_argument('-dist',
                        '--distance',
                        type=str,
                        default='frequency5',
                        choices=['frequency5', 'frequency3'],
                        help="choose a distance method")
    parser.add_argument('-hc',
                        '--ham_cutoff',
                        type=float,
                        default=.2,
                        help="choose a ham cutoff probability")
    parser.add_argument('-sc',
                        '--spam_cutoff',
                        type=float,
                        default=.8,
                        help="choose a spam cutoff probability")
    parser.add_argument('-cp',
                        '--copies',
                        type=int,
                        default=1,
                        help="number of times to copy T1")
    parser.add_argument(
        '-mc',
        '--misclassified',
        dest='misclassified',
        action='store_true',
        help="When partitioning T1, do we include only misclassified emails?")
    parser.set_defaults(misclassified=False)

    args = parser.parse_args()
    print args

    if args.dest:
        global dest
        dest += args.dest

    print "path selected: ", dest

    options['Categorization', 'ham_cutoff'] = args.ham_cutoff
    options['Categorization', 'spam_cutoff'] = args.spam_cutoff

    for i in sets:
        ham = hams[i]
        spam = spams[i]
        data_set = set_dirs[i]

        print "beginning tests on ", data_set

        if i > 10:  #Set2 is test and Set1 is training for all mislabeled datasets
            ham_test = ham[1]  # approx 20,000 test and 12,000 train
            spam_test = spam[1]

            ham_train = ham[0]
            spam_train = spam[0]

        else:
            ham_test = ham[0]  # approx 12,000 test and 20,000 train
            spam_test = spam[0]

            ham_train = ham[1]
            spam_train = spam[1]

        # the polluted data sets
        ham_p = ham[2]
        spam_p = spam[2]

        # Calculate the number of emails for polluted, train, test, and total data sets
        ham_polluted = dir_enumerate(ham_p)
        spam_polluted = dir_enumerate(spam_p)
        train_ham = dir_enumerate(ham_train)
        train_spam = dir_enumerate(spam_train)
        test_ham = dir_enumerate(ham_test)
        test_spam = dir_enumerate(spam_test)
        total_polluted = ham_polluted + spam_polluted
        total_unpolluted = train_ham + train_spam

        try:
            time_1 = time.time()  # begin timer
            # Instantiate ActiveUnlearner object
            if args.cross is not None:
                au_temp = None

                if args.cross == 'mislabeled' or args.misclassified:  # find mislabeled emails
                    print '------Gathering Mislabeled Emails------'
                    au_temp = ActiveUnlearnDriver.ActiveUnlearner(
                        [
                            msgs.HamStream(ham_train, [ham_train]),
                            msgs.HamStream(ham_p, [ham_p])
                        ],  # Training Ham 
                        [
                            msgs.SpamStream(spam_train, [spam_train]),
                            msgs.SpamStream(spam_p, [spam_p])
                        ],  # Training Spam
                        msgs.HamStream(ham_test, [ham_test]),  # Testing Ham
                        msgs.SpamStream(spam_test,
                                        [spam_test]),  # Testing Spam
                        distance_opt=args.distance,
                        all_opt=True,
                        update_opt="hybrid",
                        greedy_opt=True,
                        include_unsures=False)  # Don't unclude unsure emails
                    print '------Mislabeled Emails Gathered------'

                t1_ham, t1_spam, t2_ham, t2_spam = partitioner.partition(
                    test_ham,
                    ham_test,
                    test_spam,
                    spam_test,
                    args.cross,
                    args.features,
                    args.copies,
                    mis_only=args.misclassified,
                    au=au_temp)

                au = ActiveUnlearnDriver.ActiveUnlearner(
                    [
                        msgs.HamStream(ham_train, [ham_train]),
                        msgs.HamStream(ham_p, [ham_p])
                    ],  # Training Ham 
                    [
                        msgs.SpamStream(spam_train, [spam_train]),
                        msgs.SpamStream(spam_p, [spam_p])
                    ],  # Training Spam
                    msgs.HamStream(ham_test, [ham_test],
                                   indices=t1_ham),  # Testing Ham
                    msgs.SpamStream(spam_test, [spam_test],
                                    indices=t1_spam),  # Testing Spam
                    cv_ham=msgs.HamStream(ham_test, [ham_test],
                                          indices=t2_ham),  # T2 testing Ham
                    cv_spam=msgs.SpamStream(
                        spam_test, [spam_test],
                        indices=t2_spam),  # T2 testing Spam
                    distance_opt=args.distance,
                    all_opt=True,
                    update_opt="hybrid",
                    greedy_opt=True,
                    include_unsures=False,
                    partition_method=args.cross)  # Don't unclude unsure emails

            else:
                au = ActiveUnlearnDriver.ActiveUnlearner(
                    [
                        msgs.HamStream(ham_train, [ham_train]),
                        msgs.HamStream(ham_p, [ham_p])
                    ],  # Training Ham 
                    [
                        msgs.SpamStream(spam_train, [spam_train]),
                        msgs.SpamStream(spam_p, [spam_p])
                    ],  # Training Spam
                    msgs.HamStream(ham_test, [ham_test]),  # Testing Ham
                    msgs.SpamStream(spam_test, [spam_test]),  # Testing Spam
                    distance_opt=args.distance,
                    all_opt=True,
                    update_opt="hybrid",
                    greedy_opt=True,
                    include_unsures=False)  # Don't unclude unsure emails

            # vanilla active unlearner
            v_au = ActiveUnlearnDriver.ActiveUnlearner(
                [msgs.HamStream(ham_train, [ham_train]), []],
                [msgs.SpamStream(spam_train, [spam_train]), []],
                msgs.HamStream(ham_test, [ham_test]),
                msgs.SpamStream(spam_test, [spam_test]))

            vanilla_detection_rate = v_au.current_detection_rate

            time_2 = time.time()
            train_time = seconds_to_english(time_2 - time_1)
            print "Train time:", train_time, "\n"

            with open(dest + data_set + " (unlearn_stats).txt",
                      'w+') as outfile:
                try:
                    if args.cross == 'features' or args.cross == 'mislabeled':
                        t1_total = len(t1_ham) + len(t1_spam)
                        t2_total = len(t2_ham) + len(t2_spam)
                        print '----------------------T1/T2 TOTALS----------------------'
                        print 'Size of T1 Ham: ' + str(len(t1_ham))
                        print 'Size of T1 Spam: ' + str(len(t1_spam))
                        print 'Size of T2 Ham: ' + str(len(t2_ham))
                        print 'Size of T2 Spam: ' + str(len(t2_spam))
                        if args.cross == 'features':
                            outfile.write('Features used to distinguish T2: ' +
                                          ', '.join(args.features) + "\n")
                        if args.cross == 'mislabeled':
                            outfile.write('Ham cutoff : ' +
                                          str(args.ham_cutoff) + "\n")
                            outfile.write('Spam cutoff : ' +
                                          str(args.spam_cutoff) + "\n")
                        outfile.write('Size of T1 Ham: ' + str(len(t1_ham)) +
                                      "\n")
                        outfile.write('Size of T1 Spam: ' + str(len(t1_spam)) +
                                      "\n")
                        outfile.write('Size of T2 Ham: ' + str(len(t2_ham)) +
                                      "\n")
                        outfile.write('Size of T2 Spam: ' + str(len(t2_spam)) +
                                      "\n")
                        outfile.flush()
                        os.fsync(outfile)
                    unlearn_stats(au,
                                  args,
                                  outfile,
                                  data_set, [train_ham, train_spam],
                                  [test_ham, test_spam],
                                  [ham_polluted, spam_polluted],
                                  total_polluted,
                                  total_unpolluted,
                                  train_time, [ham_p, spam_p],
                                  vanilla=[vanilla_detection_rate, v_au],
                                  noisy_clusters=True)
                    # unlearn_stats(au, outfile, data_set, [train_ham, train_spam], [test_ham, test_spam],
                    #               [ham_polluted, spam_polluted], total_polluted, total_unpolluted,
                    #               train_time, vanilla=None, noisy_clusters=True)

                except KeyboardInterrupt:
                    outfile.flush()
                    sys.exit()

            # In the hopes of keeping RAM down between iterations
            del au
            del v_au

        except KeyboardInterrupt:
            sys.exit()
def main():
    """Run noisy-cluster unlearning experiments on data sets 11-15.

    For each selected data set this builds a polluted ActiveUnlearner
    (inverse distance, hybrid update, non-greedy) plus a vanilla one
    trained only on the clean streams, then writes unlearn_stats output
    for the pair to a per-data-set text file.  A KeyboardInterrupt at
    either level flushes/exits cleanly instead of leaving a torn file.
    """
    # Indices into the module-level hams/spams/set_dirs tables.
    sets = [11, 12, 13, 14, 15]

    for i in sets:
        ham = hams[i]
        spam = spams[i]
        data_set = set_dirs[i]

        # Data sets past index 10 swap which section is train vs. test;
        # all sets selected above are > 10, so this branch always runs here.
        if i > 10:
            ham_test = ham[1]
            spam_test = spam[1]

            ham_train = ham[0]
            spam_train = spam[0]

        else:
            ham_test = ham[0]
            spam_test = spam[0]

            ham_train = ham[1]
            spam_train = spam[1]

        # Third section holds the polluting messages for this data set.
        ham_p = ham[2]
        spam_p = spam[2]

        # Message counts used in the stats report headers.
        ham_polluted = dir_enumerate(ham_p)
        spam_polluted = dir_enumerate(spam_p)
        train_ham = dir_enumerate(ham_train)
        train_spam = dir_enumerate(spam_train)
        test_ham = dir_enumerate(ham_test)
        test_spam = dir_enumerate(spam_test)
        total_polluted = ham_polluted + spam_polluted
        total_unpolluted = train_ham + train_spam

        try:
            time_1 = time.time()
            # Polluted learner: clean + polluted training streams.
            au = ActiveUnlearnDriver.ActiveUnlearner(
                [
                    msgs.HamStream(ham_train, [ham_train]),
                    msgs.HamStream(ham_p, [ham_p])
                ],  # Training Ham
                [
                    msgs.SpamStream(spam_train, [spam_train]),
                    msgs.SpamStream(spam_p, [spam_p])
                ],  # Training Spam
                msgs.HamStream(ham_test, [ham_test]),  # Testing Ham
                msgs.SpamStream(spam_test, [spam_test]),  # Testing Spam
                distance_opt="inverse",
                all_opt=True,
                update_opt="hybrid",
                greedy_opt=False)

            # Vanilla learner: same clean training data, empty pollution
            # streams — serves as the unpolluted baseline.
            v_au = ActiveUnlearnDriver.ActiveUnlearner(
                [msgs.HamStream(ham_train, [ham_train]), []],
                [msgs.SpamStream(spam_train, [spam_train]), []],
                msgs.HamStream(ham_test, [ham_test]),
                msgs.SpamStream(spam_test, [spam_test]))

            vanilla_detection_rate = v_au.current_detection_rate

            time_2 = time.time()
            train_time = seconds_to_english(time_2 - time_1)
            print "Train time:", train_time, "\n"

            dest = "C:/Users/bzpru/Desktop/spambayes-1.1a6/unpollute_stats/Yang_Data_Sets (inverse)/" \
                   "Hybrid Update - Nongreedy/Noisy/"

            with open(dest + data_set + " (unlearn_stats).txt",
                      'w') as outfile:
                try:
                    unlearn_stats(au,
                                  outfile,
                                  data_set, [train_ham, train_spam],
                                  [test_ham, test_spam],
                                  [ham_polluted, spam_polluted],
                                  total_polluted,
                                  total_unpolluted,
                                  train_time,
                                  vanilla=[vanilla_detection_rate, v_au],
                                  noisy_clusters=True)

                except KeyboardInterrupt:
                    # Push any partially written stats to disk before dying.
                    outfile.flush()
                    sys.exit()

            # In the hopes of keeping RAM down between iterations
            del au
            del v_au

        except KeyboardInterrupt:
            sys.exit()
Ejemplo n.º 10
0
def drive():
    print options.display()

    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]
    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]

    d = dictionarywriter.DictionaryWriter(150, 4)
    d.write()

    keep_going = True
    trial_number = 1

    au = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[1], [ham[1]]),
         msgs.HamStream(ham[2], [ham[2]])],
        [
            msgs.SpamStream(spam[1], [spam[1]]),
            msgs.SpamStream(spam[3], [spam[3]])
        ],
        msgs.HamStream(ham[0], [ham[0]]),
        msgs.SpamStream(spam[0], [spam[0]]),
    )
    with open("C:\Users\Alex\Desktop\dict_correlation_stats.txt",
              'w') as outfile:

        while keep_going:
            chosen = set()
            current = au.select_initial()
            cluster = au.determine_cluster(current)
            chosen.add(current)
            au.driver.test(au.testing_ham, au.testing_spam)

            while not cluster:
                current = au.select_initial(chosen)
                cluster = au.determine_cluster(current)
                chosen.add(current)
                au.driver.test(au.testing_ham, au.testing_spam)

            cluster_list = list(cluster.cluster_set)

            dicts = au.driver.tester.train_examples[2]

            data = v_correlation(cluster_list, dicts)

            outfile.write("Trial " + str(trial_number) +
                          " Percentage Overlap (Correlation): " + str(data))
            answer = raw_input("Keep going (y/n)? You have performed " +
                               str(trial_number) + " trial(s) so far. ")

            valid_input = False

            while not valid_input:
                if answer == "n":
                    keep_going = False
                    valid_input = True

                elif answer == "y":
                    au.learn(cluster)
                    au.init_ground()
                    trial_number += 1
                    valid_input = True

                else:
                    print "Please enter either y or n."
Ejemplo n.º 11
0
def main():
    sets = [1, 2, 3, 4]
    dest = "C:/Users/bzpru/Desktop/spambayes-1.1a6/unpollute_stats/Yang_Data_Sets (inverse)/Hybrid Update - Nongreedy/"

    for i in sets:
        ham = hams[i]
        spam = spams[i]
        data_set = set_dirs[i]

        if i > 10:
            ham_test = ham[1]
            spam_test = spam[1]

            ham_train = ham[0]
            spam_train = spam[0]

        else:
            ham_test = ham[0]
            spam_test = spam[0]

            ham_train = ham[1]
            spam_train = spam[1]

        ham_p = ham[2]
        spam_p = spam[2]

        ham_polluted = dir_enumerate(ham_p)
        spam_polluted = dir_enumerate(spam_p)
        train_ham = dir_enumerate(ham_train)
        train_spam = dir_enumerate(spam_train)
        test_ham = dir_enumerate(ham_test)
        test_spam = dir_enumerate(spam_test)
        total_polluted = ham_polluted + spam_polluted
        total_unpolluted = train_ham + train_spam

        time_1 = time.time()
        p_au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham_train, [ham_train]),
                                                   msgs.HamStream(ham_p, [ham_p])],        # Training Ham
                                                   [msgs.SpamStream(spam_train, [spam_train]),
                                                   msgs.SpamStream(spam_p, [spam_p])],     # Training Spam
                                                   msgs.HamStream(ham_test, [ham_test]),          # Testing Ham
                                                   msgs.SpamStream(spam_test, [spam_test]),       # Testing Spam
                                                   distance_opt="inv-match", all_opt=True,
                                                   update_opt="hybrid", greedy_opt=False)
        time_2 = time.time()
        train_time = seconds_to_english(time_2 - time_1)
        print "Train time:", train_time, "\n"

        v_au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham_train, [ham_train]),
                                                    []],
                                                   [msgs.SpamStream(spam_train, [spam_train]),
                                                    []],
                                                   msgs.HamStream(ham_test, [ham_test]),
                                                   msgs.SpamStream(spam_test, [spam_test]))

        p_c = p_au.driver.tester.classifier
        v_c = p_au.driver.tester.classifier
        words = set().union(set(p_c.wordinfo.keys()), set(v_c.wordinfo.keys()))
        p_pair = au_sig_words(p_au, words)
        v_pair = au_sig_words(v_au, words)

        with open(dest + data_set + " (unlearn_stats).txt", 'w') as outfile:
            stats(p_au, outfile, data_set, [train_ham, train_spam], [test_ham, test_spam],
                  [ham_polluted, spam_polluted], total_polluted, total_unpolluted, train_time)

        words = words.union(set(p_c.wordinfo.keys()))
        u_pair = au_sig_words(p_au, words)

        features, sigs = extract_features([v_pair, p_pair, u_pair])
        feature_matrix = feature_lists(sigs, 1)

        combined_matrix = [["", "Unpolluted", "Polluted", "Unlearned 1"]] + [[str(column) for column in feature]
                                                                             for feature in features]

        feature_col_width = max(len(row[1]) for row in feature_matrix) + 2
        combined_col_width = max(len(item) for row in combined_matrix for item in row) + 2
        feature_num_col_width = max(len(row[0]) for row in feature_matrix) + 2

        with open(dest + data_set + " (Separate Features).txt", 'w') as outfile:
            outfile.write("---------------------------\n")
            outfile.write("Data Set: " + data_set + "\n")
            outfile.write("Vanilla Training: " + str(train_ham) + " ham and " + str(train_spam) + " spam.\n")
            outfile.write("Testing: " + str(test_ham) + " ham and " + str(test_spam) + " spam.\n")
            outfile.write("Pollution Training: " + str(ham_polluted) + " ham and " + str(spam_polluted) +
                          " spam.\n")
            outfile.write("---------------------------\n")
            outfile.write("\n\n")
            outfile.write("Unpolluted and Polluted Most Significant Features:\n")
            outfile.write("---------------------------\n")
            for row in feature_matrix:
                justify = [row[0].ljust(feature_num_col_width)]
                for j in range(1, len(row)):
                    justify.append(row[j].strip().ljust(feature_col_width))
                outfile.write("".join(justify) + "\n")

        with open(dest + data_set + " (Combined Features).txt", 'w') as outfile:
            outfile.write("---------------------------\n")
            outfile.write("Data Set: " + data_set + "\n")
            outfile.write("Vanilla Training: " + str(train_ham) + " ham and " + str(train_spam) + " spam.\n")
            outfile.write("Testing: " + str(test_ham) + " ham and " + str(test_spam) + " spam.\n")
            outfile.write("Pollution Training: " + str(ham_polluted) + " ham and " + str(spam_polluted) +
                          " spam.\n")
            outfile.write("---------------------------\n")
            outfile.write("\n\n")
            outfile.write("Feature Comparison:\n")
            outfile.write("---------------------------\n")

            for row in combined_matrix:
                outfile.write("".join(word.strip().ljust(combined_col_width) for word in row) + "\n")
Ejemplo n.º 12
0
def main():
    """One-off clustering experiment.

    Trains on three corpus sections, untrains two of them, then clusters
    around a chosen test message and dumps the cluster's messages
    (labelled SPAM/HAM by filename suffix) plus timing to a text file.
    """
    import os
    import sys
    import shutil

    # Make the spambayes package importable when run from its own tree.
    sys.path.insert(-1, os.getcwd())
    sys.path.insert(-1, os.path.dirname(os.getcwd()))

    from spambayes import ActiveUnlearnDriver
    from spambayes.Options import get_pathname_option
    from spambayes import msgs
    import time

    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]

    for i in range(1):
        # NOTE(review): other examples in this file construct
        # ActiveUnlearnDriver.ActiveUnlearner; confirm that
        # ActiveUnlearnDriver.ActiveUnlearnDriver is the intended class here.
        au = ActiveUnlearnDriver.ActiveUnlearnDriver([
            msgs.HamStream(ham[0], [ham[0]]),
            msgs.HamStream(ham[2], [ham[2]]),
            msgs.HamStream(ham[3], [ham[3]])
        ], [
            msgs.SpamStream(spam[0], [spam[0]]),
            msgs.SpamStream(spam[2], [spam[2]]),
            msgs.SpamStream(spam[3], [spam[3]])
        ], msgs.HamStream(ham[2],
                          [ham[2]]), msgs.SpamStream(spam[2], [spam[2]]),
                                                     "ac-extreme")

        # Test, drop sections 2 and 3 from training, then re-test.
        au.driver.test(msgs.HamStream(ham[0], [ham[0]]),
                       msgs.SpamStream(spam[0], [spam[0]]))
        au.driver.untrain(msgs.HamStream(ham[2], [ham[2]]),
                          msgs.SpamStream(spam[2], [spam[2]]))
        au.driver.untrain(msgs.HamStream(ham[3], [ham[3]]),
                          msgs.SpamStream(spam[3], [spam[3]]))
        au.driver.test(msgs.HamStream(ham[0], [ham[0]]),
                       msgs.SpamStream(spam[0], [spam[0]]))
        # Arbitrarily chosen test message to seed the cluster.
        msg = au.driver.tester.test_examples[5]

        shutil.copy(msg.tag, "C:\Users\Alex\Desktop\clustera")
        print msg.prob

        # Time the clustering step (size parameter 10).
        start_time = time.time()
        cluster = (au.cluster(msg, 10))
        end_time = time.time()
        print cluster

        # Collect the (word, probability) clues the classifier used.
        clueslist = []
        for clue in msg.clues:
            clueslist.append((clue[0], clue[1]))
        print clueslist

        with open("C:\Users\Alex\Desktop\clustera\cluster7.txt",
                  'w') as outfile:
            spamcounter = 0
            for sim in cluster:
                with open(sim.tag) as infile:
                    # Label each dumped message by its filename suffix.
                    if sim.tag.endswith(".spam.txt"):
                        outfile.write("SPAMSPAMSPAMSPAMSPAM" + "\n\n")
                    if sim.tag.endswith(".ham.txt"):
                        outfile.write("HAMHAMHAMHAMHAM" + "\n\n")

                    outfile.write(infile.read())
                    outfile.write("\n\n" +
                                  "----------------------------------------" +
                                  "\n\n")

                if sim.tag.endswith(".spam.txt"):
                    spamcounter += 1

            print spamcounter

        print end_time - start_time