def __init__(self, msg, size, active_unlearner, distance_opt,
             working_set=None, sort_first=True, separate=True):
    """Build a cluster seeded at ``msg`` and populate it immediately.

    Args:
        msg: Seed email (the clustroid). Its ``train`` attribute selects
            which label group the cluster belongs to.
        size: Requested cluster size, stored as-is (an earlier note said
            it is arbitrarily set to 100 by the caller).
        active_unlearner: The calling active-unlearner instance.
        distance_opt: Distance option; when it contains ``'frequency'``
            the seed's word frequencies are cached up front.
        working_set: Optional working set, stored unchanged.
        sort_first: Stored flag; consumed by methods not shown here.
        separate: Stored flag, also forwarded to ``distance_array``.
    """
    self.clustroid = msg  # seed of the cluster

    # Labels 1 and 3 are the ham sets, 0 and 2 the spam sets. Any other
    # label leaves ``self.train`` unset.
    label = msg.train
    if label in (1, 3):
        self.train = [1, 3]
    elif label in (0, 2):
        self.train = [0, 2]

    # Configuration handed in by the caller.
    self.size = size
    self.active_unlearner = active_unlearner
    self.opt = distance_opt
    self.working_set = working_set
    self.sort_first = sort_first
    self.separate = separate

    # Bookkeeping containers, filled in by the calls below.
    self.common_features = []
    self.msg_index = {}
    self.ham = set()
    self.spam = set()

    if 'frequency' in self.opt:
        self.cluster_word_frequency = helpers.get_word_frequencies(
            self.clustroid)

    self.added = []  # keeps track of the order emails are added

    # Order matters from here on: distances first, then cluster membership,
    # then the ham/spam split of the members.
    self.dist_list = self.distance_array(self.separate)
    self.cluster_set = self.make_cluster()
    self.divide()
def weighted_initial(self, working_set, mislabeled):
    """Pick the training email closest to the strongest unused mislabeled email.

    The mislabeled email whose probability lies furthest from 0.50 (and
    that has not been used before) is recorded in
    ``self.mislabeled_chosen``; the training email nearest to it under
    ``self.distance_opt`` is returned as the next cluster centroid.

    Args:
        working_set: Iterable of training emails to search, or ``None`` to
            search all four groups of
            ``self.driver.tester.train_examples``.
        mislabeled: Collection of mislabeled emails supporting ``-`` with
            ``self.mislabeled_chosen``, or ``None`` to use
            ``self.get_mislabeled()``.

    Returns:
        ``NO_CENTROIDS`` when every mislabeled email was already used;
        otherwise the selected training email, or ``None`` when there were
        no training emails to compare against.
    """
    if mislabeled is None:
        mislabeled = self.get_mislabeled()
    t_e = self.driver.tester.train_examples

    # Single-argument print(...) emits the same bytes as the old
    # comma-separated print statements and runs on Python 2 and 3.
    print("%s %s" % ("Total Cluster Centroids Chosen: ",
                     len(self.mislabeled_chosen)))

    possible_centroids = list(mislabeled - self.mislabeled_chosen)
    print("%s %s" % (
        len(possible_centroids),
        " mislabeled emails remaining as possible cluster centroids"))
    if not possible_centroids:  # No more centers to select
        return NO_CENTROIDS

    # Choose the most potent mislabeled email. max() returns the first
    # maximal element, which is what a stable descending sort puts first.
    mislabeled_point = max(possible_centroids,
                           key=lambda x: fabs(.50 - x.prob))
    self.mislabeled_chosen.add(mislabeled_point)

    print("%s %s" % ("Chose the mislabeled point: ", mislabeled_point.tag))
    print("%s %s" % ("Probability: ", mislabeled_point.prob))

    if working_set is None:
        training = chain(t_e[0], t_e[1], t_e[2], t_e[3])
    else:
        training = working_set

    # The three distance modes differ only in what the training emails are
    # compared against and in whether a larger value means "closer".
    if "frequency" in self.distance_opt:
        reference = helpers.get_word_frequencies(mislabeled_point)
        larger_is_closer = False
    elif self.distance_opt == "intersection":
        reference = mislabeled_point
        larger_is_closer = True
    else:
        reference = mislabeled_point
        larger_is_closer = False

    init_email = None
    min_distance = -1 if larger_is_closer else float("inf")
    for email in training:
        current_distance = distance(email, reference, self.distance_opt)
        if larger_is_closer:
            improved = current_distance > min_distance
        else:
            improved = current_distance < min_distance
        if improved:
            init_email = email
            min_distance = current_distance

    print(type(init_email))
    if init_email is None:
        print("%s %s" % ("Training emails remaining: ", training))
    else:
        print("%s %s %s %s %s" % (
            "-> selected ", init_email.tag,
            " as cluster centroid with distance of ", min_distance,
            " from mislabeled point"))
    return init_email
def __init__(self, msg, size, active_unlearner, distance_opt,
             working_set=None, sort_first=True, separate=True):
    """Create a cluster around the seed email ``msg`` and fill it.

    Args:
        msg: Seed email (the clustroid); ``msg.train`` decides the label
            group stored in ``self.train``.
        size: Requested cluster size, stored unchanged (an earlier note
            said the caller arbitrarily uses 100).
        active_unlearner: The active-unlearner instance that created this
            cluster.
        distance_opt: Distance option; if it contains ``'frequency'`` the
            seed's word frequencies are computed once here.
        working_set: Optional working set, stored unchanged.
        sort_first: Stored flag; used by methods outside this block.
        separate: Stored flag, also passed to ``distance_array``.
    """
    self.clustroid = msg  # seed of the cluster

    # Ham lives in sets 1 and 3, spam in sets 0 and 2. A label outside
    # those four values leaves ``self.train`` unset.
    seed_label = msg.train
    if seed_label in (1, 3):
        self.train = [1, 3]
    elif seed_label in (0, 2):
        self.train = [0, 2]

    # Caller-supplied settings.
    self.separate = separate
    self.sort_first = sort_first
    self.size = size
    self.opt = distance_opt
    self.working_set = working_set
    self.active_unlearner = active_unlearner

    # Empty containers populated by the construction steps below.
    self.common_features = []
    self.msg_index = {}
    self.ham = set()
    self.spam = set()

    if 'frequency' in self.opt:
        self.cluster_word_frequency = helpers.get_word_frequencies(
            self.clustroid)

    self.added = []  # keeps track of the order emails are added

    # Construction pipeline, in this order: distances to the seed, then
    # the cluster members, then the split of members into ham and spam.
    self.dist_list = self.distance_array(self.separate)
    self.cluster_set = self.make_cluster()
    self.divide()
def weighted_initial(self, working_set, mislabeled):
    """Select the next cluster centroid from the training emails.

    Takes the not-yet-used mislabeled email whose probability is furthest
    from 0.50, records it in ``self.mislabeled_chosen``, and returns the
    training email nearest to it under ``self.distance_opt``.

    Args:
        working_set: Iterable of training emails to search, or ``None`` to
            search all four groups of
            ``self.driver.tester.train_examples``.
        mislabeled: Collection of mislabeled emails supporting ``-`` with
            ``self.mislabeled_chosen``, or ``None`` to use
            ``self.get_mislabeled()``.

    Returns:
        ``NO_CENTROIDS`` when no unused mislabeled email remains;
        otherwise the chosen training email, or ``None`` when the training
        iterable was empty.
    """
    if mislabeled is None:
        mislabeled = self.get_mislabeled()
    t_e = self.driver.tester.train_examples

    # print(...) with one pre-formatted argument produces the same output
    # as the former print statements and is valid on Python 2 and 3.
    print("%s %s" % ("Total Cluster Centroids Chosen: ",
                     len(self.mislabeled_chosen)))

    possible_centroids = list(mislabeled - self.mislabeled_chosen)
    print("%s %s" % (
        len(possible_centroids),
        " mislabeled emails remaining as possible cluster centroids"))
    if not possible_centroids:  # No more centers to select
        return NO_CENTROIDS

    # Most potent mislabeled email: max() yields the first maximal element,
    # the same one a stable descending sort would place at index 0.
    mislabeled_point = max(possible_centroids,
                           key=lambda x: fabs(.50 - x.prob))
    self.mislabeled_chosen.add(mislabeled_point)

    print("%s %s" % ("Chose the mislabeled point: ", mislabeled_point.tag))
    print("%s %s" % ("Probability: ", mislabeled_point.prob))

    if working_set is None:
        training = chain(t_e[0], t_e[1], t_e[2], t_e[3])
    else:
        training = working_set

    # Only two things vary between distance modes: the object compared
    # against, and whether the best match has the largest or smallest value.
    if "frequency" in self.distance_opt:
        reference = helpers.get_word_frequencies(mislabeled_point)
        larger_is_closer = False
    elif self.distance_opt == "intersection":
        reference = mislabeled_point
        larger_is_closer = True
    else:
        reference = mislabeled_point
        larger_is_closer = False

    init_email = None
    min_distance = -1 if larger_is_closer else float("inf")
    for email in training:
        current_distance = distance(email, reference, self.distance_opt)
        if larger_is_closer:
            improved = current_distance > min_distance
        else:
            improved = current_distance < min_distance
        if improved:
            init_email = email
            min_distance = current_distance

    print(type(init_email))
    if init_email is None:
        print("%s %s" % ("Training emails remaining: ", training))
    else:
        print("%s %s %s %s %s" % (
            "-> selected ", init_email.tag,
            " as cluster centroid with distance of ", min_distance,
            " from mislabeled point"))
    return init_email