def cluster_more(self, n):
    """Grow the cluster by up to ``n`` emails and return the emails added.

    If fewer than ``n`` candidates remain in ``self.dist_list``, the request
    is truncated so that every remaining candidate is added instead.

    For each added email the method pops the head of ``self.dist_list``
    (the nearest remaining email), records its key in ``self.added`` and
    ``self.cluster_set``, folds ``self.data_x[key]`` into
    ``self.cluster_word_frequency`` and calls ``self.update_dist_list()``
    before the next pop. ``self.size`` is increased by the number of emails
    added and ``self.divide`` is called with the list of new keys.

    Args:
        n: Requested number of additional emails. A negative value is not
            guarded against: it shrinks ``self.size`` and then trips the
            first consistency assertion.

    Returns:
        The list of newly added email keys, in the order they were added,
        or ``None`` when ``'frequency'`` is not in ``self.opt``.

    Raises:
        AssertionError: If the number of added emails or the size of
            ``self.cluster_set`` disagrees with the bookkeeping.
    """
    # NOTE(review): the block structure of the original was lost; it is
    # assumed here that the whole body was guarded by the 'frequency'
    # check, so other options fall through and return None -- confirm.
    if 'frequency' not in self.opt:
        return None

    # Never ask for more emails than there are candidates left.
    n = min(n, len(self.dist_list))

    # A single pre-formatted string keeps the output identical to the old
    # multi-argument print statement under Python 2 while also being valid
    # Python 3 syntax (the doubled spaces are intentional).
    print("Adding  %s  more emails to cluster of size  %s  via  %s  method"
          % (n, self.size, self.opt))
    self.size += n

    new_elements = []
    added = 0
    while added < n:
        # Head of dist_list is the nearest email; the distance is unused.
        _, i = self.dist_list.pop(0)
        new_elements.append(i)
        self.added.append(i)
        self.cluster_set.add(i)
        self.cluster_word_frequency = h.update_word_frequencies(
            self.cluster_word_frequency, self.data_x[i])
        # Word frequencies changed, so refresh the candidate list before
        # the next pop.
        self.update_dist_list()
        added += 1
        if added % 10 == 0:
            print("%s / %s" % (added, n))

    assert len(new_elements) == n, str(len(new_elements)) + " " + str(n)
    assert len(self.cluster_set) == self.size, (
        str(len(self.cluster_set)) + " " + str(self.size))

    self.divide(new_elements)
    return new_elements
def make_cluster(self):
    """Construct the initial cluster of emails around ``self.clustroid``.

    ``self.size`` is first truncated to ``len(self.dist_list)`` when it
    exceeds the number of available candidates. The cluster then starts
    with the clustroid and repeatedly pops the head of ``self.dist_list``
    (the nearest remaining email) until it holds ``self.size`` emails. Each
    popped key is recorded in ``self.added``, ``self.data_x[key]`` is
    folded into ``self.cluster_word_frequency`` and
    ``self.update_dist_list()`` is called before the next pop.

    Returns:
        A set containing the clustroid and every email key added, or
        ``None`` when ``'frequency'`` is not in ``self.opt``.
    """
    if self.size > len(self.dist_list):
        print("\nTruncating cluster size...\n")
        self.size = len(self.dist_list)

    # NOTE(review): the block structure of the original was lost; it is
    # assumed here that the remainder was guarded by the 'frequency' check,
    # so other options fall through and return None -- confirm.
    if 'frequency' not in self.opt:
        return None

    emails = [self.clustroid]  # the clustroid counts towards the size
    current_size = 1
    while current_size < self.size:
        # Head of dist_list is the nearest email; the distance is unused.
        _, i = self.dist_list.pop(0)
        emails.append(i)
        self.added.append(i)  # track the order in which emails are added
        self.cluster_word_frequency = h.update_word_frequencies(
            self.cluster_word_frequency, self.data_x[i])
        # New cluster_word_frequency, so the closest emails need to be
        # re-sorted before the next pop.
        self.update_dist_list()
        if current_size % 10 == 0:
            # Single pre-formatted string: same output as the old print
            # statement on Python 2, and valid syntax on Python 3.
            print("%s / %s" % (current_size, self.size))
        current_size += 1

    print("-> cluster initialized with size %s" % len(emails))
    return set(emails)
def make_cluster(self):
    """Build the starting cluster, seeded with ``self.clustroid``.

    When ``self.size`` is larger than ``len(self.dist_list)`` it is cut
    down to that length first. Starting from the clustroid, the nearest
    remaining email (the head of ``self.dist_list``) is popped and added
    until the cluster holds ``self.size`` emails. Every added key is
    appended to ``self.added``, its ``self.data_x`` entry is merged into
    ``self.cluster_word_frequency`` and ``self.update_dist_list()`` is
    invoked so the next pop sees a refreshed candidate list.

    Returns:
        The set of email keys in the cluster (clustroid included), or
        ``None`` when ``'frequency'`` is not in ``self.opt``.
    """
    available = len(self.dist_list)
    if self.size > available:
        print("\nTruncating cluster size...\n")
        self.size = available

    # NOTE(review): the original was collapsed onto one line, so the extent
    # of the 'frequency' branch is assumed to cover the rest of the method;
    # other options therefore return None -- confirm.
    if 'frequency' not in self.opt:
        return None

    emails = [self.clustroid]  # list of added emails, clustroid first
    current_size = 1
    while current_size < self.size:
        # Only the key of the nearest email is needed, not its distance.
        _, i = self.dist_list.pop(0)
        emails.append(i)
        self.added.append(i)  # track the order in which emails are added
        self.cluster_word_frequency = h.update_word_frequencies(
            self.cluster_word_frequency, self.data_x[i])
        # New cluster_word_frequency, so the closest emails need to be
        # re-sorted before the next pop.
        self.update_dist_list()
        if current_size % 10 == 0:
            # One pre-formatted string prints the same text as the old
            # print statement on Python 2 and also parses on Python 3.
            print("%s / %s" % (current_size, self.size))
        current_size += 1

    print("-> cluster initialized with size %s" % len(emails))
    return set(emails)
def cluster_more(self, n):
    """Expand the cluster by up to ``n`` emails and return the new ones.

    When ``self.dist_list`` holds fewer than ``n`` candidates, the request
    is truncated and all remaining candidates are added.

    Each iteration pops the head of ``self.dist_list`` (the nearest
    remaining email), appends its key to ``self.added``, adds it to
    ``self.cluster_set``, merges ``self.data_x[key]`` into
    ``self.cluster_word_frequency`` and calls ``self.update_dist_list()``.
    ``self.size`` grows by the number of emails added, and the new keys are
    finally passed to ``self.divide``.

    Args:
        n: Number of additional emails requested. Negative values are not
            handled specially; they reduce ``self.size`` and then fail the
            first consistency assertion.

    Returns:
        List of the added email keys in insertion order, or ``None`` when
        ``'frequency'`` is not in ``self.opt``.

    Raises:
        AssertionError: If the count of new emails or the size of
            ``self.cluster_set`` does not match the expected totals.
    """
    # NOTE(review): the original was collapsed onto one line, so the extent
    # of the 'frequency' branch is assumed to cover the whole body; other
    # options therefore return None -- confirm.
    if 'frequency' not in self.opt:
        return None

    # Truncate the request to the number of candidates still available.
    n = min(n, len(self.dist_list))

    # Formatting into one string reproduces the spacing of the old
    # multi-argument print statement (including the doubled spaces) and
    # works on both Python 2 and Python 3.
    print("Adding  %s  more emails to cluster of size  %s  via  %s  method"
          % (n, self.size, self.opt))
    self.size += n

    new_elements = []
    added = 0
    while added < n:
        # Only the key of the nearest email is needed, not its distance.
        _, i = self.dist_list.pop(0)
        new_elements.append(i)
        self.added.append(i)
        self.cluster_set.add(i)
        self.cluster_word_frequency = h.update_word_frequencies(
            self.cluster_word_frequency, self.data_x[i])
        # Refresh the candidate list now that word frequencies changed.
        self.update_dist_list()
        added += 1
        if added % 10 == 0:
            print("%s / %s" % (added, n))

    assert len(new_elements) == n, str(len(new_elements)) + " " + str(n)
    assert len(self.cluster_set) == self.size, (
        str(len(self.cluster_set)) + " " + str(self.size))

    self.divide(new_elements)
    return new_elements