Example #1
0
    def test_learn_hello_world_tree_larger(self):
        """Learn the larger hello-world dataset, prune it, and check the result.

        After pruning redundant abstractions, the children of the root must
        carry exactly the two expected templates, and the flattened leaves
        must equal the dataset (compared as sets).
        """
        lattice_learner = TemplateLatticeLearner(
            minimal_variables=True,
            words_per_leaf_slot=2,
        )
        generated = self.hello_world_and_world_adjective.generate_all_string()
        dataset = list(generated)
        learned = lattice_learner.learn(dataset)
        print(template_tree_visualiser.render_tree_string(learned))

        pruned = learned.prune_redundant_abstractions()
        rendered_pruned = template_tree_visualiser.render_tree_string(pruned)
        print("pruned\n", rendered_pruned)

        # The pruned root should expose exactly these two templates.
        actual_top = set()
        for child in pruned.get_children():
            actual_top.add(child.get_template())
        expected_top = {
            Template.from_string("The [SLOT] is [SLOT]"),
            Template.from_string("[SLOT], [SLOT]!"),
        }
        self.assertEqual(expected_top, actual_top)

        # Flattening every leaf must give back the original lines.
        expected_lines = set(dataset)
        leaf_lines = {
            leaf.get_template().to_flat_string()
            for leaf in pruned.get_descendant_leaves()
        }
        self.assertEqual(expected_lines, leaf_lines)
Example #2
0
    def test_3_line_learner(self):
        """Learn three two-word lines and compare with a hand-built tree."""
        lattice_learner = TemplateLatticeLearner(minimal_variables=True)
        dataset = ["hello world", "hi world", "hello universe"]
        learned = lattice_learner.learn(dataset)

        def leaves(texts):
            # One childless TemplateTree per literal line.
            return [TemplateTree(Template.from_string(text)) for text in texts]

        root_template = Template.from_string("[SLOT]")
        world_branch = TemplateTree(
            Template.from_string("[SLOT] world"),
            leaves(["hello world", "hi world"]),
        )
        hello_branch = TemplateTree(
            Template.from_string("hello [SLOT]"),
            leaves(["hello world", "hello universe"]),
        )
        expected = TemplateTree(root_template, [world_branch, hello_branch])

        print(template_tree_visualiser.render_tree_string(learned))
        self.assertEqual(expected, learned)
Example #3
0
 def test_disallow_empty_string_simple_2(self):
     """Learn four sentences with ``allow_empty_string=False``.

     Three "He likes ... cats" variants and one unrelated sentence are
     learned, and the result is compared against a hand-built expected tree.
     """
     dataset = [
         "He likes cute cats",
         "He likes nice cats",
         "He likes cats",
         "This is another sentence",
     ]
     lattice_learner = TemplateLatticeLearner(
         minimal_variables=True,
         allow_empty_string=False,
     )
     learned = lattice_learner.learn(dataset)

     def leaf(text):
         # A childless tree holding one literal sentence.
         return TemplateTree(Template.from_string(text))

     root_template = Template.from_string("[SLOT]")
     likes_template = Template.from_string("He likes [SLOT]")
     adjective_cats = TemplateTree(
         Template.from_string("He likes [SLOT] cats"),
         [leaf("He likes cute cats"), leaf("He likes nice cats")],
     )
     likes_branch = TemplateTree(
         likes_template,
         [adjective_cats, leaf("He likes cats")],
     )
     expected = TemplateTree(
         root_template,
         [likes_branch, leaf("This is another sentence")],
     )

     print(template_tree_visualiser.render_tree_string(learned))
     self.assertEqual(expected, learned)
Example #4
0
 def test_learn_hello_world_tree(self):
     """Learn the small hello-world dataset and print its collapsed tree.

     This test makes no assertions; it only renders the result.
     """
     lattice_learner = TemplateLatticeLearner(minimal_variables=True)
     generated = self.hello_world_small.generate_all_string()
     dataset = list(generated)
     learned = lattice_learner.learn(dataset)
     collapsed = learned.collapse()
     print(template_tree_visualiser.render_tree_string(collapsed))
Example #5
0
def induce_grammar_using_template_trees(
    lines: Collection[str],
    relative_similarity_threshold: float = 1,
    minimal_variables: bool = True,
    words_per_slot: int = 1,
    prune_redundant: bool = True,
    max_recalculation: Optional[int] = None,
    use_best_merge_candidate: bool = True,
    max_depth: Optional[int] = None,
):
    """Induce a grammar from text lines by way of a learned template tree.

    The pipeline is: learn a tree with ``TemplateLatticeLearner``, optionally
    prune redundant abstractions, name and simplify the tree, collapse it
    using the derived slot values, repeatedly recalculate templates until the
    tree no longer changes (or the iteration cap is hit), optionally reduce
    the depth, and finally build a ``ContextFreeGrammar`` from the root
    template and the slot values of the resulting tree.

    Args:
        lines: The dataset handed to ``TemplateLatticeLearner.learn``.
        relative_similarity_threshold: Forwarded to
            ``_name_and_simplify_tree`` on every call.
        minimal_variables: Forwarded to the learner and to
            ``recalculate_templates``.
        words_per_slot: Forwarded to the learner as ``words_per_leaf_slot``.
        prune_redundant: When true, ``prune_redundant_abstractions`` is
            applied to the learned tree before naming.
        max_recalculation: Upper bound on the number of recalculation
            iterations; ``None`` means no bound.
        use_best_merge_candidate: Forwarded to the learner.
        max_depth: When not ``None``, the collapsed tree is passed through
            ``reduce_depth(max_depth)``.

    Returns:
        The value produced by ``ContextFreeGrammar.from_slot_values``.
    """
    # Learn a tree from the given dataset
    learned_tree = TemplateLatticeLearner(
        minimal_variables=minimal_variables,
        words_per_leaf_slot=words_per_slot,
        use_best_merge_candidate=use_best_merge_candidate,
    ).learn(lines)

    # Prune all redundant children: if all other children of parent cover it, the child is not necessary.
    if prune_redundant:
        learned_tree = learned_tree.prune_redundant_abstractions()

    # First naming/simplification pass; yields the slot values and a new tree.
    derived_slot_values, simplified_tree = _name_and_simplify_tree(
        learned_tree, relative_similarity_threshold)

    simplified_tree = simplified_tree.collapse_using_slot_values(
        derived_slot_values)

    # Keep recalculating the tree until convergence
    # new_tt starts as None so the first comparison is against None and the
    # body runs at least once (unless max_recalculation is 0).
    new_tt = None
    iteration = 0
    while simplified_tree != new_tt and (max_recalculation is None
                                         or iteration < max_recalculation):
        # From the second pass on, continue from the previous pass's result.
        if new_tt is not None:
            simplified_tree = new_tt
        new_tt = simplified_tree.recalculate_templates(
            minimal_variables=minimal_variables)
        derived_slot_values, new_tt = _name_and_simplify_tree(
            new_tt, relative_similarity_threshold)
        iteration += 1

    # NOTE(review): if the loop stops because of max_recalculation rather than
    # convergence, simplified_tree is the tree from before the last
    # recalculation while derived_slot_values come from the last new_tt --
    # confirm this pairing is intended.
    # Collapse final tree using the last slot values
    collapsed_tt = simplified_tree.collapse_using_slot_values(
        derived_slot_values)

    # Limit max depth
    if max_depth is not None:
        collapsed_tt = collapsed_tt.reduce_depth(max_depth)

    # Derive final slot values
    final_slot_values = collapsed_tt.get_slot_values()

    # Create grammar
    grammar = ContextFreeGrammar.from_slot_values(
        collapsed_tt.get_template(),
        final_slot_values,
    )

    return grammar
Example #6
0
    def test_min_empty_sequence_disallow_empty_longer_2(self):
        """Merge two templates sharing the prefix "x y z", empty strings off.

        With ``allow_empty_string=False`` the merged template is expected to
        be "x y z [SLOT]".
        """
        lattice_learner = TemplateLatticeLearner(
            minimal_variables=True,
            allow_empty_string=False,
        )
        first = Template.from_string("x y z a a b c")
        second = Template.from_string("x y z b c d")
        candidate = lattice_learner._get_best_merge_candidate(first, second)

        expected_merge = Template.from_string("x y z [SLOT]")
        self.assertEqual(expected_merge, candidate.get_merged_template())
Example #7
0
    def test_2_line_learner(self):
        """Learn two lines that differ in the first word only.

        The learned tree is expected to have "[SLOT] world" at the top with
        one leaf per dataset line.
        """
        lattice_learner = TemplateLatticeLearner(minimal_variables=True)
        dataset = ["hello world", "hi world"]
        learned = lattice_learner.learn(dataset)

        top_template = Template.from_string("[SLOT] world")
        leaf_trees = []
        for line in dataset:
            leaf_trees.append(TemplateTree(Template.from_string(line)))
        expected = TemplateTree(top_template, leaf_trees)

        print(template_tree_visualiser.render_tree_string(learned))
        # Check the top template on its own before comparing whole trees.
        self.assertEqual(top_template, learned.get_template())
        self.assertEqual(expected, learned)
Example #8
0
    def test_4_lines_initial_pairs(self):
        """Print the initial merge candidates for a four-line dataset.

        This test makes no assertions; it prints the number of candidates
        and, for each one, the candidate together with its merged template.
        """
        lattice_learner = TemplateLatticeLearner(
            minimal_variables=True,
            words_per_leaf_slot=2,
        )
        dataset = [
            "hello world",
            "hi world",
            "hello solar system",
            "hi solar system",
        ]
        state = LearnerState(_to_templates(dataset))
        active_templates = state.get_active_templates()
        candidates = lattice_learner._create_initial_merge_candidates(
            active_templates)

        print(len(candidates))
        for candidate in candidates:
            merged = candidate.get_merged_template()
            print(candidate, "-->", merged)
Example #9
0
    def test_min_empty_sequence_longer(self):
        """Merge two longer questions with empty slot values allowed.

        The expected merge has one slot after "i" and one trailing slot.
        """
        lattice_learner = TemplateLatticeLearner(
            minimal_variables=True,
            words_per_leaf_slot=2,
            allow_empty_string=True,
        )
        first = Template.from_string(
            "who sang i want to be with you everywhere")
        second = Template.from_string("who sang i only want to be with you")

        candidate = lattice_learner._get_best_merge_candidate(first, second)
        expected_merge = Template.from_string(
            "who sang i [SLOT] want to be with you [SLOT]")
        self.assertEqual(expected_merge, candidate.get_merged_template())
Example #10
0
 def test_disallow_empty_string_hard(self):
     """Learn two sentence families with ``allow_empty_string=False``.

     Each family has two adjective variants and one bare variant; the
     learned tree is compared against a hand-built expected tree.
     """
     dataset = [
         "I saw her on the quiet hill",
         "I saw her on the tall hill",
         "I saw her on the hill",
         "He likes cute cats",
         "He likes nice cats",
         "He likes cats",
     ]
     lattice_learner = TemplateLatticeLearner(
         minimal_variables=True,
         allow_empty_string=False,
     )
     learned = lattice_learner.learn(dataset)

     def leaf(text):
         # A childless tree holding one literal sentence.
         return TemplateTree(Template.from_string(text))

     root_template = Template.from_string("[SLOT]")

     # "He likes ..." family.
     likes_template = Template.from_string("He likes [SLOT]")
     likes_adjective = TemplateTree(
         Template.from_string("He likes [SLOT] cats"),
         [leaf("He likes cute cats"), leaf("He likes nice cats")],
     )
     likes_branch = TemplateTree(
         likes_template,
         [likes_adjective, leaf("He likes cats")],
     )

     # "I saw her on the ..." family.
     saw_template = Template.from_string("I saw her on the [SLOT]")
     saw_adjective = TemplateTree(
         Template.from_string("I saw her on the [SLOT] hill"),
         [
             leaf("I saw her on the tall hill"),
             leaf("I saw her on the quiet hill"),
         ],
     )
     saw_branch = TemplateTree(
         saw_template,
         [saw_adjective, leaf("I saw her on the hill")],
     )

     expected = TemplateTree(root_template, [likes_branch, saw_branch])

     print(template_tree_visualiser.render_tree_string(learned))
     self.assertEqual(expected, learned)
Example #11
0
    def test_get_best_merge_candidate_hello_world(self):
        """Merge two lines with no words in common.

        The merged template is expected to be a single slot and the
        reported distance 4.
        """
        lattice_learner = TemplateLatticeLearner(
            minimal_variables=True,
            words_per_leaf_slot=2,
        )
        hello_world = Template.from_string("hello world")
        hi_solar_system = Template.from_string("hi solar system")

        candidate = lattice_learner._get_best_merge_candidate(
            hello_world, hi_solar_system)

        expected_merge = Template.from_string("[SLOT]")
        actual_merge = candidate.get_merged_template(minimal_variables=True)
        self.assertEqual(expected_merge, actual_merge)
        self.assertEqual(4, candidate.get_distance())
Example #12
0
    def test_4_line_learner_longer_second(self):
        """Learn a 2x2 dataset (greeting x subject) and compare trees.

        The expected tree has four branches under a single-slot root: one
        per shared subject and one per shared greeting.
        """
        lattice_learner = TemplateLatticeLearner(
            minimal_variables=True,
            words_per_leaf_slot=2,
        )
        dataset = [
            "hello world",
            "hi world",
            "hello solar system",
            "hi solar system",
        ]
        learned = lattice_learner.learn(dataset)

        def branch(pattern, texts):
            # An abstraction node with one literal leaf per covered line.
            return TemplateTree(
                Template.from_string(pattern),
                [TemplateTree(Template.from_string(text)) for text in texts],
            )

        branch_specs = [
            ("[SLOT] world", ["hello world", "hi world"]),
            ("[SLOT] solar system", ["hello solar system", "hi solar system"]),
            ("hello [SLOT]", ["hello world", "hello solar system"]),
            ("hi [SLOT]", ["hi world", "hi solar system"]),
        ]
        expected = TemplateTree(
            Template.from_string("[SLOT]"),
            [branch(pattern, texts) for pattern, texts in branch_specs],
        )

        print(template_tree_visualiser.render_tree_string(learned))
        self.assertEqual(expected, learned)
Example #13
0
    def test_disallow_empty_string_simple(self):
        """Check learning with ``allow_empty_string=False`` on three lines.

        The bare sentence "I am a human" is expected to sit next to the
        "I am a [SLOT] human" abstraction rather than under it.
        """
        lattice_learner = TemplateLatticeLearner(
            minimal_variables=True,
            allow_empty_string=False,
        )
        dataset = ["I am a human", "I am a nice human", "I am a bad human"]
        learned = lattice_learner.learn(dataset)

        def leaf(text):
            # A childless tree holding one literal sentence.
            return TemplateTree(Template.from_string(text))

        top_template = Template.from_string("I am a [SLOT]")
        adjective_branch = TemplateTree(
            Template.from_string("I am a [SLOT] human"),
            [leaf("I am a nice human"), leaf("I am a bad human")],
        )
        expected = TemplateTree(
            top_template,
            [adjective_branch, leaf("I am a human")],
        )

        print(template_tree_visualiser.render_tree_string(learned))
        self.assertEqual(expected, learned)
Example #14
0
    def test_get_best_merge_candidate(self):
        """Check merged template and distance for four template pairs.

        Two pairs are checked without trailing punctuation and two with a
        trailing period on at least one side.
        """
        lattice_learner = TemplateLatticeLearner(
            minimal_variables=True,
            words_per_leaf_slot=2,
        )
        solar_is = Template.from_string("The solar system is [SLOT]")
        solar_is_point = Template.from_string("The solar system is [SLOT].")
        solar_exclaim = Template.from_string("[SLOT], solar system!")

        earth_is = Template.from_string("The earth is [SLOT]")
        earth_is_point = Template.from_string("The earth is [SLOT].")

        def check_merge(first, second, merged_string, distance):
            # Merge the pair, then verify the merged template and distance.
            candidate = lattice_learner._get_best_merge_candidate(
                first, second)
            self.assertEqual(
                Template.from_string(merged_string),
                candidate.get_merged_template(minimal_variables=True),
            )
            self.assertEqual(distance, candidate.get_distance())

        check_merge(solar_is, solar_exclaim, "[SLOT] solar system [SLOT]", 3)
        check_merge(solar_is, earth_is, "The [SLOT] is [SLOT]", 3)

        # With punctuation version
        check_merge(solar_is_point, solar_exclaim,
                    "[SLOT] solar system [SLOT]", 4)
        check_merge(solar_is_point, earth_is_point,
                    "The [SLOT] is [SLOT].", 3)
Example #15
0
    def test_4_line_learner_longer_second_initial_pairs_always_same(self):
        """Check the count of close initial pairs is stable under shuffling.

        The dataset is shuffled before every computation; the number of
        initial merge candidates with distance <= 2 must match the first
        run in each of 100 further runs. Only the counts are compared.
        """
        lattice_learner = TemplateLatticeLearner(
            minimal_variables=True,
            words_per_leaf_slot=2,
        )
        dataset = [
            "hello world",
            "hi world",
            "hello solar system",
            "hi solar system",
        ]

        def close_initial_pairs():
            # Shuffle in place so each call sees the lines in a new order.
            random.shuffle(dataset)
            state = LearnerState(_to_templates(dataset))
            candidates = lattice_learner._create_initial_merge_candidates(
                state.get_active_templates())
            close = set()
            for candidate in candidates:
                if candidate.get_distance() <= 2:
                    close.add(candidate)
            return close

        reference_pairs = close_initial_pairs()
        for _ in range(100):
            shuffled_pairs = close_initial_pairs()
            self.assertEqual(len(reference_pairs), len(shuffled_pairs))
Example #16
0
 def test_same_tree_induction_larger(self):
     """Run ``check_same_tree_learned`` on the larger hello-world dataset."""
     lattice_learner = TemplateLatticeLearner(
         minimal_variables=True,
         words_per_leaf_slot=2,
     )
     generated = self.hello_world_and_world_adjective.generate_all_string()
     dataset = list(generated)
     self.check_same_tree_learned(lattice_learner, dataset)