def test_learn_hello_world_tree_larger(self):
    """Learn and prune a tree for the larger hello-world dataset.

    The pruned root must have exactly the two expected child templates, and
    the flat strings of its descendant leaves must equal the dataset.
    """
    lattice_learner = TemplateLatticeLearner(
        minimal_variables=True,
        words_per_leaf_slot=2,
    )
    sentences = list(
        self.hello_world_and_world_adjective.generate_all_string()
    )

    learned = lattice_learner.learn(sentences)
    print(template_tree_visualiser.render_tree_string(learned))

    pruned = learned.prune_redundant_abstractions()
    rendered_pruned = template_tree_visualiser.render_tree_string(pruned)
    print("pruned\n", rendered_pruned)

    # The pruned root should keep exactly these two child templates.
    actual_top = {child.get_template() for child in pruned.get_children()}
    expected_top = {
        Template.from_string("The [SLOT] is [SLOT]"),
        Template.from_string("[SLOT], [SLOT]!"),
    }
    self.assertEqual(expected_top, actual_top)

    # The leaves of the pruned tree must be exactly the input sentences.
    expected_leaves = set(sentences)
    actual_leaves = {
        leaf.get_template().to_flat_string()
        for leaf in pruned.get_descendant_leaves()
    }
    self.assertEqual(expected_leaves, actual_leaves)
def test_3_line_learner(self):
    """Three sentences must yield a "[SLOT]" root with two overlapping subtrees."""

    def leaves(*sentences):
        # One childless TemplateTree per literal sentence.
        return [TemplateTree(Template.from_string(text)) for text in sentences]

    lattice_learner = TemplateLatticeLearner(minimal_variables=True)
    sentences = ["hello world", "hi world", "hello universe"]
    learned = lattice_learner.learn(sentences)

    root_template = Template.from_string("[SLOT]")
    world_subtree = TemplateTree(
        Template.from_string("[SLOT] world"),
        leaves("hello world", "hi world"),
    )
    hello_subtree = TemplateTree(
        Template.from_string("hello [SLOT]"),
        leaves("hello world", "hello universe"),
    )
    expected_tree = TemplateTree(root_template, [world_subtree, hello_subtree])

    print(template_tree_visualiser.render_tree_string(learned))
    self.assertEqual(expected_tree, learned)
def test_disallow_empty_string_simple_2(self):
    """Check the tree learned with ``allow_empty_string=False`` on four lines.

    Three "He likes ... cats" variants plus one unrelated sentence must
    produce the nested tree built below.
    """

    def leaves(*sentences):
        # One childless TemplateTree per literal sentence.
        return [TemplateTree(Template.from_string(text)) for text in sentences]

    sentences = [
        "He likes cute cats",
        "He likes nice cats",
        "He likes cats",
        "This is another sentence",
    ]
    lattice_learner = TemplateLatticeLearner(
        minimal_variables=True,
        allow_empty_string=False,
    )
    learned = lattice_learner.learn(sentences)

    root_template = Template.from_string("[SLOT]")
    likes_template = Template.from_string("He likes [SLOT]")
    adjective_cats = TemplateTree(
        Template.from_string("He likes [SLOT] cats"),
        leaves("He likes cute cats", "He likes nice cats"),
    )
    plain_cats = TemplateTree(Template.from_string("He likes cats"))
    likes_subtree = TemplateTree(likes_template, [adjective_cats, plain_cats])
    other_sentence = TemplateTree(
        Template.from_string("This is another sentence")
    )
    expected_tree = TemplateTree(root_template, [likes_subtree, other_sentence])

    print(template_tree_visualiser.render_tree_string(learned))
    self.assertEqual(expected_tree, learned)
def test_learn_hello_world_tree(self):
    """Learn a tree from the small hello-world strings and print it collapsed.

    This test makes no assertions; it only exercises learning, collapsing
    and rendering.
    """
    lattice_learner = TemplateLatticeLearner(minimal_variables=True)
    sentences = list(self.hello_world_small.generate_all_string())
    learned = lattice_learner.learn(sentences)

    collapsed = learned.collapse()
    rendering = template_tree_visualiser.render_tree_string(collapsed)
    print(rendering)
def induce_grammar_using_template_trees(
    lines: Collection[str],
    relative_similarity_threshold: float = 1,
    minimal_variables: bool = True,
    words_per_slot: int = 1,
    prune_redundant: bool = True,
    max_recalculation: Optional[int] = None,
    use_best_merge_candidate=True,
    max_depth: Optional[int] = None,
):
    """Induce a context-free grammar from text lines via a learned template tree.

    Args:
        lines: Input strings handed to ``TemplateLatticeLearner.learn``.
        relative_similarity_threshold: Forwarded to ``_name_and_simplify_tree``
            on every simplification pass.
        minimal_variables: Forwarded to the learner and to
            ``recalculate_templates``.
        words_per_slot: Passed to the learner as ``words_per_leaf_slot``.
        prune_redundant: When true, ``prune_redundant_abstractions`` is applied
            to the learned tree before simplification.
        max_recalculation: Upper bound on recalculation iterations; ``None``
            means iterate until the tree stops changing.
        use_best_merge_candidate: Forwarded to the learner.
        max_depth: When not ``None``, the final tree is passed through
            ``reduce_depth(max_depth)``.

    Returns:
        The result of ``ContextFreeGrammar.from_slot_values`` for the final
        tree's root template and slot values.
    """
    # Step 1: learn the initial tree from the raw lines.
    lattice_learner = TemplateLatticeLearner(
        minimal_variables=minimal_variables,
        words_per_leaf_slot=words_per_slot,
        use_best_merge_candidate=use_best_merge_candidate,
    )
    learned_tree = lattice_learner.learn(lines)

    # Step 2: optionally drop abstractions that are redundant.
    if prune_redundant:
        learned_tree = learned_tree.prune_redundant_abstractions()

    # Step 3: first naming/simplification pass, then collapse.
    derived_slot_values, simplified_tree = _name_and_simplify_tree(
        learned_tree,
        relative_similarity_threshold,
    )
    simplified_tree = simplified_tree.collapse_using_slot_values(
        derived_slot_values
    )

    # Step 4: recalculate until the tree no longer changes, or until the
    # iteration budget (if any) is used up.
    candidate_tree = None
    rounds_done = 0
    while simplified_tree != candidate_tree and (
        max_recalculation is None or rounds_done < max_recalculation
    ):
        if candidate_tree is not None:
            # Promote the previous round's result before recalculating.
            simplified_tree = candidate_tree
        recalculated = simplified_tree.recalculate_templates(
            minimal_variables=minimal_variables
        )
        derived_slot_values, candidate_tree = _name_and_simplify_tree(
            recalculated,
            relative_similarity_threshold,
        )
        rounds_done += 1

    # Step 5: collapse with the most recently derived slot values.
    final_tree = simplified_tree.collapse_using_slot_values(derived_slot_values)

    # Step 6: optionally cap the depth of the final tree.
    if max_depth is not None:
        final_tree = final_tree.reduce_depth(max_depth)

    # Step 7: build the grammar from the root template and slot values.
    final_slot_values = final_tree.get_slot_values()
    return ContextFreeGrammar.from_slot_values(
        final_tree.get_template(),
        final_slot_values,
    )
def test_min_empty_sequence_disallow_empty_longer_2(self):
    """With empty strings disallowed, the best merge must be "x y z [SLOT]"."""
    lattice_learner = TemplateLatticeLearner(
        minimal_variables=True,
        allow_empty_string=False,
    )
    longer = Template.from_string("x y z a a b c")
    shorter = Template.from_string("x y z b c d")

    candidate = lattice_learner._get_best_merge_candidate(longer, shorter)

    expected_merge = Template.from_string("x y z [SLOT]")
    self.assertEqual(expected_merge, candidate.get_merged_template())
def test_2_line_learner(self):
    """Two sentences sharing "world" must yield a "[SLOT] world" root."""
    lattice_learner = TemplateLatticeLearner(minimal_variables=True)
    sentences = ["hello world", "hi world"]
    learned = lattice_learner.learn(sentences)

    root_template = Template.from_string("[SLOT] world")
    leaf_nodes = [
        TemplateTree(Template.from_string(sentence)) for sentence in sentences
    ]
    expected_tree = TemplateTree(root_template, leaf_nodes)

    print(template_tree_visualiser.render_tree_string(learned))
    # Check the root template first, then the whole tree.
    self.assertEqual(root_template, learned.get_template())
    self.assertEqual(expected_tree, learned)
def test_4_lines_initial_pairs(self):
    """Print the initial merge candidates for a four-sentence dataset.

    This test makes no assertions; it prints the candidate count and each
    candidate alongside its merged template.
    """
    lattice_learner = TemplateLatticeLearner(
        minimal_variables=True,
        words_per_leaf_slot=2,
    )
    sentences = [
        "hello world",
        "hi world",
        "hello solar system",
        "hi solar system",
    ]

    state = LearnerState(_to_templates(sentences))
    active_templates = state.get_active_templates()
    candidates = lattice_learner._create_initial_merge_candidates(
        active_templates
    )

    print(len(candidates))
    for candidate in candidates:
        print(candidate, "-->", candidate.get_merged_template())
def test_min_empty_sequence_longer(self):
    """With empty strings allowed, the best merge must use two slots."""
    lattice_learner = TemplateLatticeLearner(
        minimal_variables=True,
        words_per_leaf_slot=2,
        allow_empty_string=True,
    )
    with_suffix = Template.from_string(
        "who sang i want to be with you everywhere"
    )
    with_infix = Template.from_string("who sang i only want to be with you")

    candidate = lattice_learner._get_best_merge_candidate(
        with_suffix,
        with_infix,
    )

    expected_merge = Template.from_string(
        "who sang i [SLOT] want to be with you [SLOT]"
    )
    self.assertEqual(expected_merge, candidate.get_merged_template())
def test_disallow_empty_string_hard(self):
    """Check the tree learned with ``allow_empty_string=False`` on two families.

    Each family has two sentences differing in one word plus a third
    sentence without that word; the expected nested tree is built below.
    """

    def leaves(*sentences):
        # One childless TemplateTree per literal sentence.
        return [TemplateTree(Template.from_string(text)) for text in sentences]

    sentences = [
        "I saw her on the quiet hill",
        "I saw her on the tall hill",
        "I saw her on the hill",
        "He likes cute cats",
        "He likes nice cats",
        "He likes cats",
    ]
    lattice_learner = TemplateLatticeLearner(
        minimal_variables=True,
        allow_empty_string=False,
    )
    learned = lattice_learner.learn(sentences)

    root_template = Template.from_string("[SLOT]")

    # Expected "He likes ..." family.
    likes_template = Template.from_string("He likes [SLOT]")
    adjective_cats = TemplateTree(
        Template.from_string("He likes [SLOT] cats"),
        leaves("He likes cute cats", "He likes nice cats"),
    )
    plain_cats = TemplateTree(Template.from_string("He likes cats"))
    likes_subtree = TemplateTree(likes_template, [adjective_cats, plain_cats])

    # Expected "I saw her on the ..." family.
    saw_template = Template.from_string("I saw her on the [SLOT]")
    adjective_hill = TemplateTree(
        Template.from_string("I saw her on the [SLOT] hill"),
        leaves("I saw her on the tall hill", "I saw her on the quiet hill"),
    )
    plain_hill = TemplateTree(Template.from_string("I saw her on the hill"))
    saw_subtree = TemplateTree(saw_template, [adjective_hill, plain_hill])

    expected_tree = TemplateTree(root_template, [likes_subtree, saw_subtree])

    print(template_tree_visualiser.render_tree_string(learned))
    self.assertEqual(expected_tree, learned)
def test_get_best_merge_candidate_hello_world(self):
    """Two templates with no shared words must merge to "[SLOT]" at distance 4."""
    lattice_learner = TemplateLatticeLearner(
        minimal_variables=True,
        words_per_leaf_slot=2,
    )
    hello_world = Template.from_string("hello world")
    hi_solar_system = Template.from_string("hi solar system")

    candidate = lattice_learner._get_best_merge_candidate(
        hello_world,
        hi_solar_system,
    )

    expected_merge = Template.from_string("[SLOT]")
    self.assertEqual(
        expected_merge,
        candidate.get_merged_template(minimal_variables=True),
    )
    self.assertEqual(4, candidate.get_distance())
def test_4_line_learner_longer_second(self):
    """Four sentences must yield a "[SLOT]" root with four pairwise subtrees."""

    def leaves(*sentences):
        # One childless TemplateTree per literal sentence.
        return [TemplateTree(Template.from_string(text)) for text in sentences]

    lattice_learner = TemplateLatticeLearner(
        minimal_variables=True,
        words_per_leaf_slot=2,
    )
    sentences = [
        "hello world",
        "hi world",
        "hello solar system",
        "hi solar system",
    ]
    learned = lattice_learner.learn(sentences)

    root_template = Template.from_string("[SLOT]")
    ends_with_world = TemplateTree(
        Template.from_string("[SLOT] world"),
        leaves("hello world", "hi world"),
    )
    ends_with_solar_system = TemplateTree(
        Template.from_string("[SLOT] solar system"),
        leaves("hello solar system", "hi solar system"),
    )
    starts_with_hello = TemplateTree(
        Template.from_string("hello [SLOT]"),
        leaves("hello world", "hello solar system"),
    )
    starts_with_hi = TemplateTree(
        Template.from_string("hi [SLOT]"),
        leaves("hi world", "hi solar system"),
    )
    expected_tree = TemplateTree(
        root_template,
        [
            ends_with_world,
            ends_with_solar_system,
            starts_with_hello,
            starts_with_hi,
        ],
    )

    print(template_tree_visualiser.render_tree_string(learned))
    self.assertEqual(expected_tree, learned)
def test_disallow_empty_string_simple(self):
    """Check that learning with ``allow_empty_string=False`` works.

    The sentence lacking an adjective must sit next to the
    "I am a [SLOT] human" subtree, under an "I am a [SLOT]" root.
    """
    lattice_learner = TemplateLatticeLearner(
        minimal_variables=True,
        allow_empty_string=False,
    )
    sentences = ["I am a human", "I am a nice human", "I am a bad human"]
    learned = lattice_learner.learn(sentences)

    root_template = Template.from_string("I am a [SLOT]")
    adjective_leaves = [
        TemplateTree(Template.from_string(text))
        for text in ["I am a nice human", "I am a bad human"]
    ]
    adjective_subtree = TemplateTree(
        Template.from_string("I am a [SLOT] human"),
        adjective_leaves,
    )
    plain_leaf = TemplateTree(Template.from_string("I am a human"))
    expected_tree = TemplateTree(root_template, [adjective_subtree, plain_leaf])

    print(template_tree_visualiser.render_tree_string(learned))
    self.assertEqual(expected_tree, learned)
def test_get_best_merge_candidate(self):
    """Check merged template and distance for four template pairs.

    Two pairs use templates without a trailing period and two use the
    punctuated variants.
    """
    lattice_learner = TemplateLatticeLearner(
        minimal_variables=True,
        words_per_leaf_slot=2,
    )

    solar = Template.from_string("The solar system is [SLOT]")
    solar_period = Template.from_string("The solar system is [SLOT].")
    exclamation = Template.from_string("[SLOT], solar system!")
    earth = Template.from_string("The earth is [SLOT]")
    earth_period = Template.from_string("The earth is [SLOT].")

    def assert_best_merge(left, right, merged_text, distance):
        # Merge the pair, then verify the merged template and its distance.
        candidate = lattice_learner._get_best_merge_candidate(left, right)
        self.assertEqual(
            Template.from_string(merged_text),
            candidate.get_merged_template(minimal_variables=True),
        )
        self.assertEqual(distance, candidate.get_distance())

    assert_best_merge(solar, exclamation, "[SLOT] solar system [SLOT]", 3)
    assert_best_merge(solar, earth, "The [SLOT] is [SLOT]", 3)

    # Same comparisons using the punctuated templates.
    assert_best_merge(
        solar_period, exclamation, "[SLOT] solar system [SLOT]", 4
    )
    assert_best_merge(solar_period, earth_period, "The [SLOT] is [SLOT].", 3)
def test_4_line_learner_longer_second_initial_pairs_always_same(self):
    """Check the number of close initial pairs is stable under shuffling.

    The dataset is shuffled in place before every computation. Only the
    count of candidates with distance <= 2 is compared, not their contents.
    """
    lattice_learner = TemplateLatticeLearner(
        minimal_variables=True,
        words_per_leaf_slot=2,
    )
    sentences = [
        "hello world",
        "hi world",
        "hello solar system",
        "hi solar system",
    ]

    def close_initial_pairs():
        # Shuffle in place so each call sees a different input order.
        random.shuffle(sentences)
        state = LearnerState(_to_templates(sentences))
        candidates = lattice_learner._create_initial_merge_candidates(
            state.get_active_templates()
        )
        return {pair for pair in candidates if pair.get_distance() <= 2}

    reference_count = len(close_initial_pairs())
    for _ in range(100):
        reshuffled_pairs = close_initial_pairs()
        self.assertEqual(reference_count, len(reshuffled_pairs))
def test_same_tree_induction_larger(self):
    """Run ``check_same_tree_learned`` on the larger hello-world dataset."""
    lattice_learner = TemplateLatticeLearner(
        minimal_variables=True,
        words_per_leaf_slot=2,
    )
    generator = self.hello_world_and_world_adjective
    sentences = list(generator.generate_all_string())
    self.check_same_tree_learned(lattice_learner, sentences)