def weighted_initial(self, working_set, mislabeled): print "Total Cluster Centroids Chosen: ", len(self.mislabeled_chosen) print len(mislabeled), " mislabeled emails remaining as possible cluster centroids" if len(mislabeled) == 0: #No more centers to select return (None, None, 'NO_CENTROIDS') else: prob, mislabeled_point = mislabeled.pop(0) # Choose most potent mislabeled email self.mislabeled_chosen.append(mislabeled_point) print "Chose the mislabeled point with z = ", prob data_y, data_x = h.compose_set(working_set) vec_data_x = vectorize_set(data_x) init_email = None init_pos = None label = None if "frequency" in self.distance_opt: min_distance = sys.maxint for i,email_indices in enumerate(vec_data_x): if None not in email_indices: # actual data current_distance = distance(email_indices, mislabeled_point, self.distance_opt) if current_distance < min_distance: init_email = data_x[i] init_pos = i min_distance = current_distance if init_email is None: print "Training emails remaining: ", len(data_x) else: label = data_y[init_pos] print "-> selected cluster centroid with label: ", label, " and distance: ", min_distance, " from mislabeled point" return (label, init_pos, init_email)
def __init__(self, msg, size, active_unlearner, label, distance_opt, working_set=None, separate=True):
    """Build a cluster of the *size* emails nearest the given centroid.

    msg              -- (vector, index) pair for the centroid email:
                        msg[1] is its position in the composed data set;
                        msg[0] is its word-frequency vector (only read when
                        the distance option is frequency-based)
    size             -- number of emails to pull into the cluster
    active_unlearner -- the calling active-unlearner instance (kept as a
                        back-reference for later callbacks)
    label            -- ham/spam label assigned to the centroid
    distance_opt     -- name of the distance metric to use
    working_set      -- 4-tuple (train_y, train_x, pol_y, pol_x) describing
                        the current phantom space.
                        NOTE(review): the default of None is subscripted
                        immediately below, so callers must always pass a
                        real working set — confirm and consider making it
                        a required argument.
    separate         -- forwarded to distance_array(); semantics defined
                        there — TODO confirm
    """
    # Clustroid specs
    self.clustroid = msg[1]  # index of the centroid email in the composed data set
    self.label = label
    self.common_features = []
    self.separate = separate
    self.size = size  # arbitrarily set to 100
    self.active_unlearner = active_unlearner  # point to calling au instance
    self.opt = distance_opt

    # The data: unpack the phantom-space working set into its four parts.
    self.working_set = working_set
    self.train_y = self.working_set[0]
    self.train_x = self.working_set[1]
    self.pol_y = self.working_set[2]
    self.pol_x = self.working_set[3]
    self.data_y, self.data_x = h.compose_set(self.working_set)

    # Vectorizing is the expensive step, so it is timed and reported.
    time_1 = time.time()
    self.vec_data_x = vectorize_set(self.data_x)
    print 'Vectorizing data_x took: ', h.sec_to_english(time.time() - time_1)

    self.ham = set()
    self.spam = set()
    if 'frequency' in self.opt:
        self.cluster_word_frequency = msg[0]  # actual vector representation of msg
    self.added = []  # keeps track of order emails are added

    # Order matters below: distances must exist before the cluster is cut,
    # and the cluster must exist before it is divided into ham/spam.
    self.dist_list = self.distance_array(self.separate)  # returns list containing dist from all emails in phantom space to center clustroid
    self.cluster_set = self.make_cluster()  # adds closest emails to cluster
    self.divide()  # adds cluster emails to ham and spam
def __init__(self, msg, size, active_unlearner, label, distance_opt, working_set=None, separate=True):
    """Build a cluster of the *size* emails nearest the given centroid.

    msg              -- (vector, index) pair for the centroid email:
                        msg[1] is its position in the composed data set;
                        msg[0] is its word-frequency vector (only read when
                        the distance option is frequency-based)
    size             -- number of emails to pull into the cluster
    active_unlearner -- the calling active-unlearner instance (kept as a
                        back-reference for later callbacks)
    label            -- ham/spam label assigned to the centroid
    distance_opt     -- name of the distance metric to use
    working_set      -- 4-tuple (train_y, train_x, pol_y, pol_x) describing
                        the current phantom space.
                        NOTE(review): the default of None is subscripted
                        immediately below, so callers must always pass a
                        real working set — confirm and consider making it
                        a required argument.
    separate         -- forwarded to distance_array(); semantics defined
                        there — TODO confirm
    """
    # Clustroid specs
    self.clustroid = msg[1]  # index of the centroid email in the composed data set
    self.label = label
    self.common_features = []
    self.separate = separate
    self.size = size  # arbitrarily set to 100
    self.active_unlearner = active_unlearner  # point to calling au instance
    self.opt = distance_opt

    # The data: unpack the phantom-space working set into its four parts.
    self.working_set = working_set
    self.train_y = self.working_set[0]
    self.train_x = self.working_set[1]
    self.pol_y = self.working_set[2]
    self.pol_x = self.working_set[3]
    self.data_y, self.data_x = h.compose_set(self.working_set)

    # Vectorizing is the expensive step, so it is timed and reported.
    time_1 = time.time()
    self.vec_data_x = vectorize_set(self.data_x)
    print 'Vectorizing data_x took: ', h.sec_to_english(time.time() - time_1)

    self.ham = set()
    self.spam = set()
    if 'frequency' in self.opt:
        self.cluster_word_frequency = msg[0]  # actual vector representation of msg
    self.added = []  # keeps track of order emails are added

    # Order matters below: distances must exist before the cluster is cut,
    # and the cluster must exist before it is divided into ham/spam.
    self.dist_list = self.distance_array(self.separate)  # returns list containing dist from all emails in phantom space to center clustroid
    self.cluster_set = self.make_cluster()  # adds closest emails to cluster
    self.divide()  # adds cluster emails to ham and spam
def weighted_initial(self, working_set, mislabeled): print "Total Cluster Centroids Chosen: ", len(self.mislabeled_chosen) print len( mislabeled ), " mislabeled emails remaining as possible cluster centroids" if len(mislabeled) == 0: #No more centers to select return (None, None, 'NO_CENTROIDS') else: prob, mislabeled_point = mislabeled.pop( 0) # Choose most potent mislabeled email self.mislabeled_chosen.append(mislabeled_point) print "Chose the mislabeled point with z = ", prob data_y, data_x = h.compose_set(working_set) vec_data_x = vectorize_set(data_x) init_email = None init_pos = None label = None if "frequency" in self.distance_opt: min_distance = sys.maxint for i, email_indices in enumerate(vec_data_x): if None not in email_indices: # actual data current_distance = distance(email_indices, mislabeled_point, self.distance_opt) if current_distance < min_distance: init_email = data_x[i] init_pos = i min_distance = current_distance if init_email is None: print "Training emails remaining: ", len(data_x) else: label = data_y[init_pos] print "-> selected cluster centroid with label: ", label, " and distance: ", min_distance, " from mislabeled point" return (label, init_pos, init_email)
def cluster_remaining(au, working_set): """ This function is called if weighted_initial returns NO_CENTROIDS, meaning there are no more misabeled emails to use as centers. The remaining emails in the working set are then returned as one cluster. """ print "No more cluster centroids, grouping all remaining emails into one cluster" first_state_rate = au.current_detection_rate size = len(h.strip(working_set[0] + working_set[2])) # get number of remaining emails init_email = None init_pos = None label = None data_y, data_x = h.compose_set(working_set) for i,l in enumerate(data_y): # loop to find first email that is not none if l is not None: label = l init_pos = i init_email = data_x[i] center = (init_email, init_pos) cluster = Cluster(center, size, au, label, au.distance_opt, working_set=working_set) au.unlearn(cluster) au.init_ground() new_detection_rate = au.current_detection_rate au.learn(cluster) # relearn cluster in real training space so deltas of future cluster are not influenced second_state_rate = au.current_detection_rate net_rate_change = second_state_rate - first_state_rate au.current_detection_rate = first_state_rate assert(au.current_detection_rate == first_state_rate), str(au.current_detection_rate) + " " + str(first_state_rate) print "clustered remaining with a net rate change of ", second_state_rate, " - ", first_state_rate, " = ", net_rate_change return net_rate_change, cluster