def unlearn(self, cluster):
    """Remove every email of ``cluster`` from this unlearner's data.

    When the cluster's ``ham`` and ``spam`` collections do not add up to
    ``cluster.size``, ``cluster.divide()`` is invoked first. Afterwards
    ``cluster.cluster_set`` is handed to ``h.unlearn`` together with the
    four arrays ``train_y``, ``train_x``, ``pol_y`` and ``pol_x`` held on
    ``self``.
    """
    # NOTE(review): a second, identical definition of ``unlearn`` appears
    # later in this file -- confirm whether both are intended.
    size_text = str(len(cluster.cluster_set))
    # Joined with single spaces to reproduce the spacing of a comma-separated
    # Python 2 print statement.
    print(" ".join(["unlearning cluster of size ", size_text, " from au"]))

    labelled_total = len(cluster.ham) + len(cluster.spam)
    if labelled_total != cluster.size:
        print("\nUpdating cluster ham and spam sets...\n")
        cluster.divide()

    datasets = [self.train_y, self.train_x, self.pol_y, self.pol_x]
    h.unlearn(datasets, cluster.cluster_set)
def divide_new_elements(self, messages, unlearn, original=None):
    """Unlearn or relearn a batch of emails against this unlearner's data.

    Param:
        messages: forwarded unchanged to ``h.unlearn`` / ``h.relearn``; the
            previous docstring describes it as the indices of the emails
            to learn/unlearn.
        unlearn: when truthy the batch is passed to ``h.unlearn``;
            otherwise ``h.relearn`` is called.
        original: only consulted on the relearn path, where it is passed to
            ``h.relearn`` ahead of ``messages``.
    """
    datasets = [self.train_y, self.train_x, self.pol_y, self.pol_x]
    if not unlearn:
        h.relearn(datasets, original, messages)
        return
    h.unlearn(datasets, messages)
def unlearn(self, cluster):
    """Strip the members of ``cluster`` out of this unlearner's data.

    The cluster's ``ham``/``spam`` collections are refreshed through
    ``cluster.divide()`` whenever their combined length differs from
    ``cluster.size``; then ``h.unlearn`` is called with the arrays
    ``train_y``, ``train_x``, ``pol_y``, ``pol_x`` of ``self`` and with
    ``cluster.cluster_set``.
    """
    # NOTE(review): an identical definition of ``unlearn`` appears earlier
    # in this file -- confirm which one is meant to be kept.
    member_count = len(cluster.cluster_set)
    # A single pre-joined string keeps the output identical to the original
    # comma-separated Python 2 print statement.
    banner_parts = ["unlearning cluster of size ", str(member_count), " from au"]
    print(" ".join(banner_parts))

    if cluster.size != len(cluster.ham) + len(cluster.spam):
        print("\nUpdating cluster ham and spam sets...\n")
        cluster.divide()

    h.unlearn(
        [self.train_y, self.train_x, self.pol_y, self.pol_x],
        cluster.cluster_set,
    )
def cluster_au(au, gold=True):
    """Cluster the training space of an ActiveUnlearner.

    Deep copies of ``au.train_y``, ``au.train_x``, ``au.pol_y`` and
    ``au.pol_x`` form a working set. Clusters are carved out of that working
    set one at a time until ``h.strip`` reports no remaining emails in the
    copied ``pol_y`` and ``train_y``.

    Param:
        au: the ActiveUnlearner to cluster. Its mislabeled set is refreshed
            via ``au.get_mislabeled(update=True)`` and its
            ``mislabeled_chosen`` attribute is reset to an empty list.
        gold: forwarded to ``determine_cluster`` as its ``gold`` keyword.

    Returns:
        A list of ``[net_rate_change, cluster]`` pairs sorted in ascending
        order of ``net_rate_change``.

    Exits the process with status 1 if a clustering step yields ``None``.
    """
    print("\n----------------------Beginning the Clustering Process-----------------------\n")
    cluster_list = []  # list of [net_rate_change, cluster] pairs

    # Build the working set from deep copies, so the h.unlearn calls below act
    # on these copies rather than on the arrays held by ``au``.
    train_y = copy.deepcopy(au.train_y)
    train_x = copy.deepcopy(au.train_x)
    pol_y = copy.deepcopy(au.pol_y)
    pol_x = copy.deepcopy(au.pol_x)
    training = [train_y, train_x, pol_y, pol_x]
    original_training_size = len(h.strip(pol_y)) + len(h.strip(train_y))

    print("\nResetting mislabeled...\n")
    # Per the original comment: all false positives and false negatives.
    mislabeled = au.get_mislabeled(update=True)
    au.mislabeled_chosen = []  # reset the clustered mislabeled emails for this au

    print("\n Clustering...\n")
    # Nothing has touched the working copies yet, so the initial count is
    # reused instead of being recomputed.
    training_size = original_training_size

    while training_size > 0:  # loop until every working-set email is assigned
        print("\n-----------------------------------------------------\n")
        print("\n" + str(training_size) + " emails out of " +
              str(original_training_size) + " still unclustered.\n")

        # Keep asking for a seed until one is produced. Per the original
        # comment, the seed is the training email closest to an arbitrarily
        # chosen mislabeled email.
        current_seed = None
        label = None
        while current_seed is None:
            label, init_pos, current_seed = au.select_initial(
                mislabeled, "weighted", training)

        if str(current_seed) == 'NO_CENTROIDS':
            cluster_result = cluster_remaining(au, training)
        else:
            cluster_result = determine_cluster(
                current_seed, au, label, init_pos,
                working_set=training, gold=gold)

        if cluster_result is None:
            print("!!!How did this happen?????")
            # sys.exit(None) would report success; signal the failure instead.
            sys.exit(1)

        net_rate_change, cluster = cluster_result
        cluster_list.append([net_rate_change, cluster])

        print("\nRemoving cluster from shuffled training set...\n")
        h.unlearn(training, cluster.cluster_set)
        training_size = len(h.strip(pol_y)) + len(h.strip(train_y))

    # Order by net_rate_change only; a bare sort() would fall back to comparing
    # the cluster objects themselves whenever two rates tie.
    cluster_list.sort(key=lambda entry: entry[0])
    print("\nClustering process done and sorted.\n")
    return cluster_list