def weighted_initial(self, working_set, mislabeled):
        print "Total Cluster Centroids Chosen: ", len(self.mislabeled_chosen)

        print len(mislabeled), " mislabeled emails remaining as possible cluster centroids" 
        if len(mislabeled) == 0: #No more centers to select
            return (None, None, 'NO_CENTROIDS')
        else:
            prob, mislabeled_point = mislabeled.pop(0) # Choose most potent mislabeled email 
            self.mislabeled_chosen.append(mislabeled_point)

            print "Chose the mislabeled point with z = ", prob

            data_y, data_x = h.compose_set(working_set)
            vec_data_x = vectorize_set(data_x)

            init_email = None
            init_pos = None
            label = None
            if "frequency" in self.distance_opt:
                min_distance = sys.maxint
                for i,email_indices in enumerate(vec_data_x):
                    if None not in email_indices: # actual data
                        current_distance = distance(email_indices, mislabeled_point, self.distance_opt)
                        if current_distance < min_distance:
                            init_email = data_x[i]
                            init_pos = i
                            min_distance = current_distance

            if init_email is None:
                print "Training emails remaining: ", len(data_x)
            else:
                label = data_y[init_pos]
                print "-> selected cluster centroid with label: ", label, " and distance: ", min_distance, " from mislabeled point"
            return (label, init_pos, init_email)
Esempio n. 2
0
    def __init__(self, msg, size, active_unlearner, label, distance_opt, 
                working_set=None, separate=True):
        # Clustroid specs
        self.clustroid = msg[1] # index of msg
        self.label = label
        self.common_features = []
        self.separate = separate
        self.size = size # arbitrarily set to 100
        self.active_unlearner = active_unlearner # point to calling au instance
        self.opt = distance_opt

        # The data
        self.working_set = working_set
        self.train_y = self.working_set[0]
        self.train_x = self.working_set[1]
        self.pol_y = self.working_set[2]
        self.pol_x = self.working_set[3]
        self.data_y, self.data_x = h.compose_set(self.working_set)
        time_1 = time.time()
        self.vec_data_x = vectorize_set(self.data_x)
        print 'Vectorizing data_x took: ', h.sec_to_english(time.time() - time_1)

        self.ham = set()
        self.spam = set()

        if 'frequency' in self.opt:
            self.cluster_word_frequency = msg[0] # actual vector representation of msg
            self.added = [] # keeps track of order emails are added

        self.dist_list = self.distance_array(self.separate) # returns list containing dist from all emails in phantom space to center clustroid
        self.cluster_set = self.make_cluster() # adds closest emails to cluster
        self.divide() # adds cluster emails to ham and spam
Esempio n. 3
0
    def __init__(self,
                 msg,
                 size,
                 active_unlearner,
                 label,
                 distance_opt,
                 working_set=None,
                 separate=True):
        # Clustroid specs
        self.clustroid = msg[1]  # index of msg
        self.label = label
        self.common_features = []
        self.separate = separate
        self.size = size  # arbitrarily set to 100
        self.active_unlearner = active_unlearner  # point to calling au instance
        self.opt = distance_opt

        # The data
        self.working_set = working_set
        self.train_y = self.working_set[0]
        self.train_x = self.working_set[1]
        self.pol_y = self.working_set[2]
        self.pol_x = self.working_set[3]
        self.data_y, self.data_x = h.compose_set(self.working_set)
        time_1 = time.time()
        self.vec_data_x = vectorize_set(self.data_x)
        print 'Vectorizing data_x took: ', h.sec_to_english(time.time() -
                                                            time_1)

        self.ham = set()
        self.spam = set()

        if 'frequency' in self.opt:
            self.cluster_word_frequency = msg[
                0]  # actual vector representation of msg
            self.added = []  # keeps track of order emails are added

        self.dist_list = self.distance_array(
            self.separate
        )  # returns list containing dist from all emails in phantom space to center clustroid
        self.cluster_set = self.make_cluster(
        )  # adds closest emails to cluster
        self.divide()  # adds cluster emails to ham and spam
Esempio n. 4
0
    def weighted_initial(self, working_set, mislabeled):
        print "Total Cluster Centroids Chosen: ", len(self.mislabeled_chosen)

        print len(
            mislabeled
        ), " mislabeled emails remaining as possible cluster centroids"
        if len(mislabeled) == 0:  #No more centers to select
            return (None, None, 'NO_CENTROIDS')
        else:
            prob, mislabeled_point = mislabeled.pop(
                0)  # Choose most potent mislabeled email
            self.mislabeled_chosen.append(mislabeled_point)

            print "Chose the mislabeled point with z = ", prob

            data_y, data_x = h.compose_set(working_set)
            vec_data_x = vectorize_set(data_x)

            init_email = None
            init_pos = None
            label = None
            if "frequency" in self.distance_opt:
                min_distance = sys.maxint
                for i, email_indices in enumerate(vec_data_x):
                    if None not in email_indices:  # actual data
                        current_distance = distance(email_indices,
                                                    mislabeled_point,
                                                    self.distance_opt)
                        if current_distance < min_distance:
                            init_email = data_x[i]
                            init_pos = i
                            min_distance = current_distance

            if init_email is None:
                print "Training emails remaining: ", len(data_x)
            else:
                label = data_y[init_pos]
                print "-> selected cluster centroid with label: ", label, " and distance: ", min_distance, " from mislabeled point"
            return (label, init_pos, init_email)
Esempio n. 5
0
def cluster_remaining(au, working_set):
    """ This function is called if weighted_initial returns NO_CENTROIDS, meaning there are no more misabeled emails to use as centers.
    The remaining emails in the working set are then returned as one cluster.
    """

    print "No more cluster centroids, grouping all remaining emails into one cluster"

    first_state_rate = au.current_detection_rate

    size = len(h.strip(working_set[0] + working_set[2])) # get number of remaining emails
    init_email = None
    init_pos = None
    label = None
    data_y, data_x = h.compose_set(working_set)
    for i,l in enumerate(data_y): # loop to find first email that is not none 
        if l is not None:
            label = l
            init_pos = i
            init_email = data_x[i]
    center = (init_email, init_pos)

    cluster = Cluster(center, size, au, label, au.distance_opt, working_set=working_set)

    au.unlearn(cluster)
    au.init_ground()
    new_detection_rate = au.current_detection_rate

    au.learn(cluster) # relearn cluster in real training space so deltas of future cluster are not influenced
    second_state_rate = au.current_detection_rate
    
    net_rate_change = second_state_rate - first_state_rate
    au.current_detection_rate = first_state_rate

    assert(au.current_detection_rate == first_state_rate), str(au.current_detection_rate) + " " + str(first_state_rate)
    print "clustered remaining with a net rate change of ", second_state_rate, " - ", first_state_rate, " = ", net_rate_change
    
    return net_rate_change, cluster