def __init__(self, msg, size, active_unlearner, label, distance_opt, working_set=None, separate=True):
    """Build a cluster around a clustroid message and populate it.

    Args:
        msg: Indexable pair. ``msg[1]`` is stored as the clustroid index;
            ``msg[0]`` is stored as the cluster word frequency when
            ``'frequency'`` is in ``distance_opt``.
        size: Stored as ``self.size`` (the original comment notes it is
            arbitrarily set to 100 by callers).
        active_unlearner: The calling active-unlearner instance; stored as
            ``self.active_unlearner``.
        label: Stored as ``self.label``.
        distance_opt: Distance option, stored as ``self.opt``; tested here
            only for containing ``'frequency'``.
        working_set: Indexable with at least four entries, read in order as
            train_y, train_x, pol_y, pol_x, and also handed to
            ``h.compose_set``. Required in practice even though the
            signature defaults it to ``None``.
        separate: Stored as ``self.separate`` and forwarded to
            ``self.distance_array``.

    Raises:
        TypeError: If ``working_set`` is ``None``.
    """
    # The signature allows omitting working_set, but every step below
    # indexes it. Fail up front with a readable message; TypeError is what
    # subscripting None raised before, so the exception type is unchanged.
    if working_set is None:
        raise TypeError(
            "working_set is required: expected an indexable of "
            "(train_y, train_x, pol_y, pol_x), got None"
        )

    # Clustroid specs
    self.clustroid = msg[1]  # index of msg
    self.label = label
    self.common_features = []
    self.separate = separate
    self.size = size
    self.active_unlearner = active_unlearner
    self.opt = distance_opt

    # The data
    self.working_set = working_set
    self.train_y = working_set[0]
    self.train_x = working_set[1]
    self.pol_y = working_set[2]
    self.pol_x = working_set[3]
    self.data_y, self.data_x = h.compose_set(self.working_set)

    started = time.time()
    self.vec_data_x = vectorize_set(self.data_x)
    # A single parenthesised argument is valid both as a Python 2 print
    # statement and as a Python 3 call; '%s %s' reproduces the separating
    # space that the comma-separated print statement emitted.
    print('%s %s' % ('Vectorizing data_x took: ',
                     h.sec_to_english(time.time() - started)))

    self.ham = set()
    self.spam = set()

    if 'frequency' in self.opt:
        self.cluster_word_frequency = msg[0]  # actual vector representation of msg

    # Keeps track of the order emails are added.
    # NOTE(review): it is unclear whether this assignment originally sat
    # under the 'frequency' branch above; it is set unconditionally so the
    # attribute always exists -- confirm against callers.
    self.added = []

    # Distances from the emails to the center clustroid.
    self.dist_list = self.distance_array(self.separate)
    # Adds the closest emails to the cluster.
    self.cluster_set = self.make_cluster()
    # Adds the cluster emails to ham and spam.
    self.divide()
def update_dist_list(self, t=False):
    """Recompute and re-sort ``self.dist_list`` for the frequency method.

    The indices already present in ``self.dist_list`` (second element of
    each entry) are kept. Each one gets a fresh distance computed by
    ``distance(self.vec_data_x[i], self.cluster_word_frequency, self.opt)``,
    and the resulting ``(distance, index)`` tuples are stored in ascending
    order.

    Args:
        t: When true, print how long the update took.
    """
    if t:
        started = time.time()

    indices = [entry[1] for entry in self.dist_list]
    # sorted() over the freshly built pairs is equivalent to building the
    # list and sorting it in place.
    self.dist_list = sorted(
        (distance(self.vec_data_x[i], self.cluster_word_frequency, self.opt), i)
        for i in indices
    )

    if t:
        finished = time.time()
        # A single parenthesised argument is valid both as a Python 2 print
        # statement and as a Python 3 call; '%s %s' reproduces the
        # separating space of the comma-separated print statement.
        print('%s %s' % ('update_dist_list took: ',
                         h.sec_to_english(finished - started)))
def __init__(self, msg, size, active_unlearner, label, distance_opt, working_set=None, separate=True):
    """Build a cluster around a clustroid message and populate it.

    Args:
        msg: Indexable pair. ``msg[1]`` is stored as the clustroid index;
            ``msg[0]`` is stored as the cluster word frequency when
            ``'frequency'`` is in ``distance_opt``.
        size: Stored as ``self.size`` (the original comment notes it is
            arbitrarily set to 100 by callers).
        active_unlearner: The calling active-unlearner instance; stored as
            ``self.active_unlearner``.
        label: Stored as ``self.label``.
        distance_opt: Distance option, stored as ``self.opt``; tested here
            only for containing ``'frequency'``.
        working_set: Indexable with at least four entries, read in order as
            train_y, train_x, pol_y, pol_x, and also handed to
            ``h.compose_set``. Required in practice even though the
            signature defaults it to ``None``.
        separate: Stored as ``self.separate`` and forwarded to
            ``self.distance_array``.

    Raises:
        TypeError: If ``working_set`` is ``None``.
    """
    # The signature allows omitting working_set, but every step below
    # indexes it. Fail up front with a readable message; TypeError is what
    # subscripting None raised before, so the exception type is unchanged.
    if working_set is None:
        raise TypeError(
            "working_set is required: expected an indexable of "
            "(train_y, train_x, pol_y, pol_x), got None"
        )

    # Clustroid specs
    self.clustroid = msg[1]  # index of msg
    self.label = label
    self.common_features = []
    self.separate = separate
    self.size = size
    self.active_unlearner = active_unlearner
    self.opt = distance_opt

    # The data
    self.working_set = working_set
    self.train_y = working_set[0]
    self.train_x = working_set[1]
    self.pol_y = working_set[2]
    self.pol_x = working_set[3]
    self.data_y, self.data_x = h.compose_set(self.working_set)

    started = time.time()
    self.vec_data_x = vectorize_set(self.data_x)
    # A single parenthesised argument is valid both as a Python 2 print
    # statement and as a Python 3 call; '%s %s' reproduces the separating
    # space that the comma-separated print statement emitted.
    print('%s %s' % ('Vectorizing data_x took: ',
                     h.sec_to_english(time.time() - started)))

    self.ham = set()
    self.spam = set()

    if 'frequency' in self.opt:
        self.cluster_word_frequency = msg[0]  # actual vector representation of msg

    # Keeps track of the order emails are added.
    # NOTE(review): it is unclear whether this assignment originally sat
    # under the 'frequency' branch above; it is set unconditionally so the
    # attribute always exists -- confirm against callers.
    self.added = []

    # Distances from the emails to the center clustroid.
    self.dist_list = self.distance_array(self.separate)
    # Adds the closest emails to the cluster.
    self.cluster_set = self.make_cluster()
    # Adds the cluster emails to ham and spam.
    self.divide()