def cluster_tuples(self, matched_tuples):
    """Assign every matched tuple to a pattern using single-pass clustering.

    Each tuple is scored against every pattern currently in
    ``self.patterns`` via ``self.similarity_all``. The tuple joins the
    accepting pattern with the highest score when that score reaches
    ``self.config.threshold_similarity``; otherwise it becomes the centroid
    of a new ``Pattern`` appended to ``self.patterns``.

    Args:
        matched_tuples: Indexable sequence of tuples to cluster. An empty
            sequence is a no-op.

    Returns:
        None. ``self.patterns`` is modified in place.
    """
    # Nothing to cluster. Without this guard the seeding step below would
    # index ``matched_tuples[0]`` and raise IndexError on empty input.
    if len(matched_tuples) == 0:
        return

    # Initialize: if no patterns exist, the first tuple seeds the first
    # cluster.
    # NOTE(review): the seed tuple is visited again by the loop below, as in
    # the original implementation - confirm that this is intended.
    if len(self.patterns) == 0:
        self.patterns.append(Pattern(matched_tuples[0]))

    threshold = self.config.threshold_similarity

    for count, t in enumerate(matched_tuples, start=1):
        # Lightweight progress indicator: one dot per 1000 tuples.
        if count % 1000 == 0:
            sys.stdout.write(".")
            sys.stdout.flush()

        # Go through all patterns (clusters of tuples) and find the
        # accepting one with the highest similarity score.
        best_score = 0
        best_index = None
        for i, extraction_pattern in enumerate(self.patterns):
            accept, score = self.similarity_all(t, extraction_pattern)
            if accept is True and score > best_score:
                best_score = score
                best_index = i

        # ``best_index is None`` means no pattern accepted the tuple. It is
        # checked explicitly so that a threshold of zero (or less) cannot
        # push an unaccepted tuple into the first pattern.
        if best_index is None or best_score < threshold:
            # Below the threshold: this tuple starts a new cluster.
            self.patterns.append(Pattern(t))
        else:
            # At or above the threshold: join the most similar cluster.
            self.patterns[best_index].add_tuple(t)
def test_update_selectivity(self):
    """One tuple should record exactly one p-value: the baseline's shortest path.

    Builds a fresh ``Pattern``, feeds it a single ``Tuple`` for
    ``(self.e1, self.e2)`` and checks that ``p_values`` holds one entry equal
    to ``self.baseline.shortest_path(self.e1, self.e2)``.
    """
    # Three independent context lists (before / between / after), each with
    # a single placeholder token.
    before, between, after = ['dummy'], ['dummy'], ['dummy']

    # positive
    candidate = Pattern()
    relation = Tuple(
        self.e1, self.e2, None, before, between, after, self.config
    )
    candidate.update_selectivity(relation, self.config, self.baseline)

    recorded = candidate.p_values
    self.assertEqual(len(recorded), 1)
    self.assertEqual(
        recorded[0],
        self.baseline.shortest_path(self.e1, self.e2),
    )
def cluster_tuples_parallel(self, patterns, matched_tuples, child_conn,
                            min_cluster_size=6):
    """Cluster a batch of tuples and send the surviving patterns to ``child_conn``.

    Runs the same single-pass clustering as a sequential pass would, but on
    a private list so the caller's ``patterns`` list is not resized. The
    result is sent as ``(pid, new_patterns)`` through ``child_conn.send``.

    Args:
        patterns: Existing patterns to start from. The list is copied
            shallowly, so the pattern objects themselves are shared and
            receive ``add_tuple`` calls.
        matched_tuples: Iterable of tuples to cluster.
        child_conn: Object exposing ``send``; presumably the child end of a
            ``multiprocessing.Pipe`` - confirm against the caller.
        min_cluster_size: Minimum number of tuples a pattern must hold to be
            reported. The default of 6 reproduces the previously hard-coded
            ``len(p.tuples) > 5`` filter.

    Returns:
        None. The result is delivered through ``child_conn``.
    """
    updated_patterns = list(patterns)
    threshold = self.config.threshold_similarity

    for count, t in enumerate(matched_tuples, start=1):
        if count % 500 == 0:
            print(multiprocessing.current_process(), count,
                  "tuples processed")

        # Go through all patterns (clusters of tuples) and find the
        # accepting one with the highest similarity score.
        best_score = 0
        best_index = None
        for i, extraction_pattern in enumerate(updated_patterns):
            accept, score = self.similarity_all(t, extraction_pattern)
            if accept is True and score > best_score:
                best_score = score
                best_index = i

        # ``best_index is None`` means no pattern accepted the tuple (or
        # there are no patterns yet). Checking it explicitly avoids indexing
        # an empty list, and avoids attaching an unaccepted tuple to the
        # first pattern when the threshold is zero or less.
        if best_index is None or best_score < threshold:
            # Below the threshold: this tuple starts a new cluster.
            updated_patterns.append(Pattern(t))
        else:
            # At or above the threshold: join the most similar cluster.
            updated_patterns[best_index].add_tuple(t)

    # Eliminate clusters holding fewer than ``min_cluster_size`` tuples.
    new_patterns = [
        p for p in updated_patterns if len(p.tuples) >= min_cluster_size
    ]
    pid = multiprocessing.current_process().pid
    print(multiprocessing.current_process(), "Patterns: ", len(new_patterns))
    child_conn.send((pid, new_patterns))
def test_update_selectivity(self):
    """Check which counter ``update_selectivity`` bumps for each entity pair.

    Every scenario starts from a fresh ``Pattern``, feeds it one ``Tuple``
    and then verifies the ``positive`` / ``negative`` / ``unknown`` counters
    in the listed order.
    """
    # Three independent context lists (before / between / after), shared by
    # all scenarios.
    before, between, after = ['dummy'], ['dummy'], ['dummy']

    # Each entry: (e1, e2, ((counter name, expected value), ...)).
    # NOTE(review): the first pair carries trailing spaces in both entity
    # strings - presumably on purpose; confirm against update_selectivity.
    scenarios = (
        # positive
        ('seed_1 ', 'seed_2 ',
         (('positive', 1), ('negative', 0), ('unknown', 0))),
        # negative
        ('seed_1', 'seed_5',
         (('negative', 1), ('positive', 0), ('unknown', 0))),
        # negative
        ('seed_1', 'seed_3',
         (('unknown', 0), ('positive', 0), ('negative', 1))),
        # unknown
        ('seed_4', 'seed_5',
         (('negative', 0), ('positive', 0), ('unknown', 1))),
    )

    for first, second, expectations in scenarios:
        candidate = Pattern()
        relation = Tuple(
            first, second, None, before, between, after, self.config
        )
        candidate.update_selectivity(relation, self.config)
        for counter, expected in expectations:
            self.assertEqual(getattr(candidate, counter), expected)
def test_update_confidence(self):
    """Confidence should land above 0.5 for (e1, e2) and below it for (e2, e1).

    Each direction gets its own ``Pattern``: selectivity is updated from a
    single ``Tuple`` against ``self.baseline``, then the confidence is
    recomputed and compared with 0.5.
    """
    # Three independent context lists (before / between / after), shared by
    # both directions.
    before, between, after = ['dummy'], ['dummy'], ['dummy']

    def scored_pattern(first, second):
        # Build a pattern from one tuple and refresh its confidence.
        candidate = Pattern()
        relation = Tuple(
            first, second, None, before, between, after, self.config
        )
        candidate.update_selectivity(relation, self.config, self.baseline)
        candidate.update_confidence(self.config)
        return candidate

    # positive
    forward = scored_pattern(self.e1, self.e2)
    print(forward.p_values[0])
    self.assertGreater(forward.confidence, 0.5)

    # negative
    backward = scored_pattern(self.e2, self.e1)
    self.assertLess(backward.confidence, 0.5)