def weighted_initial(self, working_set, mislabeled):
        print "Total Cluster Centroids Chosen: ", len(self.mislabeled_chosen)

        print len(mislabeled), " mislabeled emails remaining as possible cluster centroids" 
        if len(mislabeled) == 0: #No more centers to select
            return (None, None, 'NO_CENTROIDS')
        else:
            prob, mislabeled_point = mislabeled.pop(0) # Choose most potent mislabeled email 
            self.mislabeled_chosen.append(mislabeled_point)

            print "Chose the mislabeled point with z = ", prob

            data_y, data_x = h.compose_set(working_set)
            vec_data_x = vectorize_set(data_x)

            init_email = None
            init_pos = None
            label = None
            if "frequency" in self.distance_opt:
                min_distance = sys.maxint
                for i,email_indices in enumerate(vec_data_x):
                    if None not in email_indices: # actual data
                        current_distance = distance(email_indices, mislabeled_point, self.distance_opt)
                        if current_distance < min_distance:
                            init_email = data_x[i]
                            init_pos = i
                            min_distance = current_distance

            if init_email is None:
                print "Training emails remaining: ", len(data_x)
            else:
                label = data_y[init_pos]
                print "-> selected cluster centroid with label: ", label, " and distance: ", min_distance, " from mislabeled point"
            return (label, init_pos, init_email)
Ejemplo n.º 2
0
    def __init__(self, msg, size, active_unlearner, label, distance_opt, 
                working_set=None, separate=True):
        # Clustroid specs
        self.clustroid = msg[1] # index of msg
        self.label = label
        self.common_features = []
        self.separate = separate
        self.size = size # arbitrarily set to 100
        self.active_unlearner = active_unlearner # point to calling au instance
        self.opt = distance_opt

        # The data
        self.working_set = working_set
        self.train_y = self.working_set[0]
        self.train_x = self.working_set[1]
        self.pol_y = self.working_set[2]
        self.pol_x = self.working_set[3]
        self.data_y, self.data_x = h.compose_set(self.working_set)
        time_1 = time.time()
        self.vec_data_x = vectorize_set(self.data_x)
        print 'Vectorizing data_x took: ', h.sec_to_english(time.time() - time_1)

        self.ham = set()
        self.spam = set()

        if 'frequency' in self.opt:
            self.cluster_word_frequency = msg[0] # actual vector representation of msg
            self.added = [] # keeps track of order emails are added

        self.dist_list = self.distance_array(self.separate) # returns list containing dist from all emails in phantom space to center clustroid
        self.cluster_set = self.make_cluster() # adds closest emails to cluster
        self.divide() # adds cluster emails to ham and spam
Ejemplo n.º 3
0
    def __init__(self,
                 msg,
                 size,
                 active_unlearner,
                 label,
                 distance_opt,
                 working_set=None,
                 separate=True):
        # Clustroid specs
        self.clustroid = msg[1]  # index of msg
        self.label = label
        self.common_features = []
        self.separate = separate
        self.size = size  # arbitrarily set to 100
        self.active_unlearner = active_unlearner  # point to calling au instance
        self.opt = distance_opt

        # The data
        self.working_set = working_set
        self.train_y = self.working_set[0]
        self.train_x = self.working_set[1]
        self.pol_y = self.working_set[2]
        self.pol_x = self.working_set[3]
        self.data_y, self.data_x = h.compose_set(self.working_set)
        time_1 = time.time()
        self.vec_data_x = vectorize_set(self.data_x)
        print 'Vectorizing data_x took: ', h.sec_to_english(time.time() -
                                                            time_1)

        self.ham = set()
        self.spam = set()

        if 'frequency' in self.opt:
            self.cluster_word_frequency = msg[
                0]  # actual vector representation of msg
            self.added = []  # keeps track of order emails are added

        self.dist_list = self.distance_array(
            self.separate
        )  # returns list containing dist from all emails in phantom space to center clustroid
        self.cluster_set = self.make_cluster(
        )  # adds closest emails to cluster
        self.divide()  # adds cluster emails to ham and spam
Ejemplo n.º 4
0
    def weighted_initial(self, working_set, mislabeled):
        print "Total Cluster Centroids Chosen: ", len(self.mislabeled_chosen)

        print len(
            mislabeled
        ), " mislabeled emails remaining as possible cluster centroids"
        if len(mislabeled) == 0:  #No more centers to select
            return (None, None, 'NO_CENTROIDS')
        else:
            prob, mislabeled_point = mislabeled.pop(
                0)  # Choose most potent mislabeled email
            self.mislabeled_chosen.append(mislabeled_point)

            print "Chose the mislabeled point with z = ", prob

            data_y, data_x = h.compose_set(working_set)
            vec_data_x = vectorize_set(data_x)

            init_email = None
            init_pos = None
            label = None
            if "frequency" in self.distance_opt:
                min_distance = sys.maxint
                for i, email_indices in enumerate(vec_data_x):
                    if None not in email_indices:  # actual data
                        current_distance = distance(email_indices,
                                                    mislabeled_point,
                                                    self.distance_opt)
                        if current_distance < min_distance:
                            init_email = data_x[i]
                            init_pos = i
                            min_distance = current_distance

            if init_email is None:
                print "Training emails remaining: ", len(data_x)
            else:
                label = data_y[init_pos]
                print "-> selected cluster centroid with label: ", label, " and distance: ", min_distance, " from mislabeled point"
            return (label, init_pos, init_email)