def cluster_more(self, n):
    """Expand the cluster by up to ``n`` emails and return the new ones.

    If ``n`` more emails are not available, the cluster size is truncated
    to include all remaining emails.

    Two strategies are used, selected by ``self.opt``:

    * ``'frequency' in self.opt`` -- emails are taken one at a time from
      the head of ``self.dist_list``.  After each pick the cluster word
      frequencies are updated and ``self.update_dist_list()`` is called
      before the next email is chosen.
    * otherwise -- the cluster is rebuilt from the first ``self.size``
      entries of ``self.dist_list`` (a plain slice when
      ``self.sort_first`` is set, ``quickselect.k_smallest`` when it is
      not); the new elements are those absent from the previous
      ``self.cluster_set``.

    Every returned email whose ``train`` value is 1 or 3 is added to
    ``self.ham``; those with 0 or 2 are added to ``self.spam``.

    Args:
        n: number of additional emails requested.

    Returns:
        A list of the emails that were newly added to the cluster.
    """
    if 'frequency' in self.opt:
        # Cannot add more emails than are left in the distance list.
        n = min(n, len(self.dist_list))
        # Single-argument print: same output as the Python 2 statement
        # form (items separated by one space), but also valid Python 3.
        print("Adding  %s  more emails to cluster of size  %s  via  %s  method"
              % (n, self.size, self.opt))
        self.size += n

        new_elements = []
        for _ in range(n):
            nearest = self.dist_list[0][1]  # get nearest email
            new_elements.append(nearest)
            self.added.append(nearest)
            self.cluster_set.add(nearest)  # add to original cluster set
            self.cluster_word_frequency = helpers.update_word_frequencies(
                self.cluster_word_frequency, nearest)
            del self.dist_list[0]  # so the email is not picked twice
            # Word frequencies changed, so refresh the distance list
            # before choosing the next email.
            self.update_dist_list()

        assert len(new_elements) == n, \
            str(len(new_elements)) + " " + str(n)
        assert len(self.cluster_set) == self.size, \
            str(len(self.cluster_set)) + " " + str(self.size)
    else:
        old_cluster_set = self.cluster_set
        if self.size + n <= len(self.dist_list):
            self.size += n
        else:
            print("\nTruncating cluster size...\n")
            if not self.dist_list:
                # Nothing left to draw from: keep the current cluster and
                # size intact instead of replacing the cluster with an
                # empty set and failing the size assertion below.
                return []
            self.size = len(self.dist_list)

        if self.sort_first:
            nearest_items = self.dist_list[:self.size]
        else:
            nearest_items = quickselect.k_smallest(self.dist_list, self.size)
        new_cluster_set = set(item[1] for item in nearest_items)

        new_elements = [item for item in new_cluster_set
                        if item not in old_cluster_set]
        self.cluster_set = new_cluster_set
        assert len(self.cluster_set) == self.size, len(self.cluster_set)

    # Record the label of each newly added email; any other ``train``
    # value is left unlabelled.
    for msg in new_elements:
        if msg.train in (1, 3):
            self.ham.add(msg)
        elif msg.train in (0, 2):
            self.spam.add(msg)
    return new_elements
def make_cluster(self):
    """Construct the initial cluster of emails and return it as a set.

    ``self.size`` is first truncated to ``len(self.dist_list)`` when the
    requested size exceeds the number of available entries.  The cluster
    is then built in one of three ways:

    * ``self.sort_first`` is false -- the ``self.size`` smallest entries
      of ``self.dist_list`` as chosen by ``quickselect.k_smallest``.
    * ``self.sort_first`` is true and ``'frequency'`` is not in
      ``self.opt`` -- the first ``self.size`` entries of
      ``self.dist_list``.
    * ``self.sort_first`` is true and ``'frequency'`` is in ``self.opt``
      -- start from ``self.clustroid`` and repeatedly take the email at
      the head of ``self.dist_list``, updating the cluster word
      frequencies and calling ``self.update_dist_list()`` after each
      pick.  This path mutates ``self.dist_list``, ``self.added`` and
      ``self.cluster_word_frequency``.

    Returns:
        A set containing the emails of the initial cluster.
    """
    if self.size > len(self.dist_list):
        print("\nTruncating cluster size...\n")
        self.size = len(self.dist_list)

    if not self.sort_first:
        return set(item[1] for item in
                   quickselect.k_smallest(self.dist_list, self.size))

    if 'frequency' not in self.opt:
        return set(item[1] for item in self.dist_list[:self.size])

    emails = [self.clustroid]  # list of added emails, clustroid first

    # Remove the duplicate clustroid from self.dist_list.  Deleting by
    # index avoids a second linear search; breaking immediately keeps the
    # in-loop mutation safe.
    for index, (_, email) in enumerate(self.dist_list):
        if email.tag == self.clustroid.tag:
            del self.dist_list[index]
            # Single-argument print: same output as the Python 2
            # statement form, but also valid Python 3.
            print("-> removed duplicate clustroid  %s" % (email.tag,))
            break

    while len(emails) < self.size:
        nearest = self.dist_list[0][1]  # get nearest email
        assert nearest.tag != self.clustroid.tag, \
            str(nearest.tag) + " " + str(self.clustroid.tag)
        emails.append(nearest)
        self.added.append(nearest)  # track order in which emails are added
        self.cluster_word_frequency = helpers.update_word_frequencies(
            self.cluster_word_frequency, nearest)
        del self.dist_list[0]  # so we don't add the email twice
        # New cluster_word_frequency, so the closest emails need to be
        # refreshed before the next pick.
        self.update_dist_list()

    print("-> cluster initialized with size %s" % len(emails))
    return set(emails)
def make_cluster(self):
    """Construct the initial cluster of emails and return it as a set.

    ``self.size`` is first truncated to ``len(self.dist_list)`` when the
    requested size exceeds the number of available entries.  The cluster
    is then built in one of three ways:

    * ``self.sort_first`` is false -- the ``self.size`` smallest entries
      of ``self.dist_list`` as chosen by ``quickselect.k_smallest``.
    * ``self.sort_first`` is true and ``'frequency'`` is not in
      ``self.opt`` -- the first ``self.size`` entries of
      ``self.dist_list``.
    * ``self.sort_first`` is true and ``'frequency'`` is in ``self.opt``
      -- start from ``self.clustroid`` and repeatedly take the email at
      the head of ``self.dist_list``, updating the cluster word
      frequencies and calling ``self.update_dist_list()`` after each
      pick.  This path mutates ``self.dist_list``, ``self.added`` and
      ``self.cluster_word_frequency``.

    Returns:
        A set containing the emails of the initial cluster.
    """
    if self.size > len(self.dist_list):
        print("\nTruncating cluster size...\n")
        self.size = len(self.dist_list)

    if not self.sort_first:
        return set(item[1] for item in
                   quickselect.k_smallest(self.dist_list, self.size))

    if 'frequency' not in self.opt:
        return set(item[1] for item in self.dist_list[:self.size])

    emails = [self.clustroid]  # list of added emails, clustroid first

    # Remove the duplicate clustroid from self.dist_list.  Deleting by
    # index avoids a second linear search; breaking immediately keeps the
    # in-loop mutation safe.
    for index, (_, email) in enumerate(self.dist_list):
        if email.tag == self.clustroid.tag:
            del self.dist_list[index]
            # Single-argument print: same output as the Python 2
            # statement form, but also valid Python 3.
            print("-> removed duplicate clustroid  %s" % (email.tag,))
            break

    while len(emails) < self.size:
        nearest = self.dist_list[0][1]  # get nearest email
        assert nearest.tag != self.clustroid.tag, \
            str(nearest.tag) + " " + str(self.clustroid.tag)
        emails.append(nearest)
        self.added.append(nearest)  # track order in which emails are added
        self.cluster_word_frequency = helpers.update_word_frequencies(
            self.cluster_word_frequency, nearest)
        del self.dist_list[0]  # so we don't add the email twice
        # New cluster_word_frequency, so the closest emails need to be
        # refreshed before the next pick.
        self.update_dist_list()

    print("-> cluster initialized with size %s" % len(emails))
    return set(emails)
def cluster_more(self, n):
    """Expand the cluster by up to ``n`` emails and return the new ones.

    If ``n`` more emails are not available, the cluster size is truncated
    to include all remaining emails.

    Two strategies are used, selected by ``self.opt``:

    * ``'frequency' in self.opt`` -- emails are taken one at a time from
      the head of ``self.dist_list``.  After each pick the cluster word
      frequencies are updated and ``self.update_dist_list()`` is called
      before the next email is chosen.
    * otherwise -- the cluster is rebuilt from the first ``self.size``
      entries of ``self.dist_list`` (a plain slice when
      ``self.sort_first`` is set, ``quickselect.k_smallest`` when it is
      not); the new elements are those absent from the previous
      ``self.cluster_set``.

    Every returned email whose ``train`` value is 1 or 3 is added to
    ``self.ham``; those with 0 or 2 are added to ``self.spam``.

    Args:
        n: number of additional emails requested.

    Returns:
        A list of the emails that were newly added to the cluster.
    """
    if 'frequency' in self.opt:
        # Cannot add more emails than are left in the distance list.
        n = min(n, len(self.dist_list))
        # Single-argument print: same output as the Python 2 statement
        # form (items separated by one space), but also valid Python 3.
        print("Adding  %s  more emails to cluster of size  %s  via  %s  method"
              % (n, self.size, self.opt))
        self.size += n

        new_elements = []
        for _ in range(n):
            nearest = self.dist_list[0][1]  # get nearest email
            new_elements.append(nearest)
            self.added.append(nearest)
            self.cluster_set.add(nearest)  # add to original cluster set
            self.cluster_word_frequency = helpers.update_word_frequencies(
                self.cluster_word_frequency, nearest)
            del self.dist_list[0]  # so the email is not picked twice
            # Word frequencies changed, so refresh the distance list
            # before choosing the next email.
            self.update_dist_list()

        assert len(new_elements) == n, \
            str(len(new_elements)) + " " + str(n)
        assert len(self.cluster_set) == self.size, \
            str(len(self.cluster_set)) + " " + str(self.size)
    else:
        old_cluster_set = self.cluster_set
        if self.size + n <= len(self.dist_list):
            self.size += n
        else:
            print("\nTruncating cluster size...\n")
            if not self.dist_list:
                # Nothing left to draw from: keep the current cluster and
                # size intact instead of replacing the cluster with an
                # empty set and failing the size assertion below.
                return []
            self.size = len(self.dist_list)

        if self.sort_first:
            nearest_items = self.dist_list[:self.size]
        else:
            nearest_items = quickselect.k_smallest(self.dist_list, self.size)
        new_cluster_set = set(item[1] for item in nearest_items)

        new_elements = [item for item in new_cluster_set
                        if item not in old_cluster_set]
        self.cluster_set = new_cluster_set
        assert len(self.cluster_set) == self.size, len(self.cluster_set)

    # Record the label of each newly added email; any other ``train``
    # value is left unlabelled.
    for msg in new_elements:
        if msg.train in (1, 3):
            self.ham.add(msg)
        elif msg.train in (0, 2):
            self.spam.add(msg)
    return new_elements