def cluster_tuples(self, matched_tuples):
    """Cluster tuples into ``self.patterns`` with a single pass.

    Every tuple is scored against each existing pattern (a cluster of
    tuples) through ``self.similarity_all``, which yields an
    ``(accept, score)`` pair. The tuple joins the accepted pattern with
    the highest score, unless that score is below
    ``self.config.threshold_similarity``, in which case the tuple
    becomes the seed of a new pattern.

    :param matched_tuples: indexable collection of tuples to cluster.
        An empty collection is a no-op.
    :return: None; ``self.patterns`` is updated in place.
    """
    # Initialize: with no patterns yet, the first tuple seeds the first
    # cluster. An empty batch has nothing to seed with, so stop here
    # rather than failing on matched_tuples[0].
    if len(self.patterns) == 0:
        if len(matched_tuples) == 0:
            return
        self.patterns.append(Pattern(matched_tuples[0]))

    # NOTE: the seed tuple is still visited by the loop below, exactly
    # as before, so it is also compared against its own pattern.
    for count, t in enumerate(matched_tuples, 1):
        # Progress indicator: one dot every 1000 tuples.
        if count % 1000 == 0:
            sys.stdout.write(".")
            sys.stdout.flush()

        # Find the accepted pattern with the highest similarity score.
        # The strict '>' keeps the earliest pattern on ties and ignores
        # scores that are not above zero.
        max_similarity = 0
        max_similarity_cluster_index = 0
        for i, extraction_pattern in enumerate(self.patterns):
            accept, score = self.similarity_all(t, extraction_pattern)
            if accept is True and score > max_similarity:
                max_similarity = score
                max_similarity_cluster_index = i

        if max_similarity < self.config.threshold_similarity:
            # Not similar enough to anything: start a new cluster with
            # this tuple as its seed.
            self.patterns.append(Pattern(t))
        else:
            # Similar enough: add to the best-matching cluster.
            self.patterns[max_similarity_cluster_index].add_tuple(t)
def cluster_tuples_parallel(self, patterns, matched_tuples, child_conn):
    """Cluster tuples against a copy of ``patterns`` and send the result.

    Every tuple is scored against each pattern through
    ``self.similarity_all``, which yields an ``(accept, score)`` pair.
    The tuple joins the accepted pattern with the highest score, unless
    that score is below ``self.config.threshold_similarity``, in which
    case the tuple becomes the seed of a new pattern. Afterwards only
    patterns holding more than five tuples are kept.

    :param patterns: existing patterns to start from. The list itself is
        copied, but the pattern objects are shared, so ``add_tuple``
        mutates the objects the caller passed in.
    :param matched_tuples: iterable of tuples to cluster.
    :param child_conn: object with a ``send`` method; it receives one
        ``(pid, new_patterns)`` message, where ``pid`` is the pid of the
        current process.
    :return: None; the result is delivered through ``child_conn``.
    """
    updated_patterns = list(patterns)

    for count, t in enumerate(matched_tuples, 1):
        # Progress report every 500 tuples. A single pre-formatted string
        # prints the same text under Python 2 and Python 3.
        if count % 500 == 0:
            print("%s %s %s" % (multiprocessing.current_process(),
                                count, "tuples processed"))

        # Find the accepted pattern with the highest similarity score.
        # The strict '>' keeps the earliest pattern on ties and ignores
        # scores that are not above zero.
        max_similarity = 0
        max_similarity_cluster_index = 0
        for i, extraction_pattern in enumerate(updated_patterns):
            accept, score = self.similarity_all(t, extraction_pattern)
            if accept is True and score > max_similarity:
                max_similarity = score
                max_similarity_cluster_index = i

        if max_similarity < self.config.threshold_similarity:
            # Not similar enough to anything: start a new cluster.
            updated_patterns.append(Pattern(t))
        else:
            # Similar enough: add to the best-matching cluster.
            updated_patterns[max_similarity_cluster_index].add_tuple(t)

    # Keep only the clusters that hold more than five tuples.
    new_patterns = [p for p in updated_patterns if len(p.tuples) > 5]

    pid = multiprocessing.current_process().pid
    print("%s %s %s" % (multiprocessing.current_process(),
                        "Patterns: ", len(new_patterns)))
    child_conn.send((pid, new_patterns))