Exemple #1
0
 def read_stage_file(self):
     """Read the stage file and restore the terminal labeling it references.

     First delegates to ``ScoringExperiment.read_stage_file``. If the stage
     dictionary then has a ``"terminal_labeling"`` entry, that entry is used
     as the path of a JSON file; its content is deserialized into a labeling
     that is stored on this object and on ``self.induction_settings``.
     """
     ScoringExperiment.read_stage_file(self)

     # Nothing to restore when the stage file does not mention a labeling.
     if "terminal_labeling" not in self.stage_dict:
         return

     labeling_file = self.stage_dict["terminal_labeling"]
     with open(labeling_file, "r") as handle:
         serialized = json.load(handle)

     self.terminal_labeling = deserialize_labeling(serialized)
     self.induction_settings.terminal_labeling = self.terminal_labeling
    def test_suffix_labeling(self):
        """Check that a ``tl.Suffix`` labeling survives a serialization round trip.

        A ``Suffix`` labeling is built from a sample of the TIGER corpus, used
        to label a fixed set of tokens, serialized, and deserialized again.
        The deserialized instance must have the same class and must assign
        the same labels to the same tokens.
        """
        corpus_path = "res/TIGER/tiger21/tigertraindev_root_attach.export"
        sentence_names = [str(x) for x in range(50) if x % 10 > 1]
        corpus = np.sentence_names_to_hybridtrees(sentence_names, corpus_path,
                                                  disconnect_punctuation=False)

        labeling = tl.Suffix(trees=corpus, threshold=2)

        # (form, pos) pairs that are labeled before and after the round trip.
        samples = [('Tisch', 'NN'),
                   ('TISCH', 'NN'),
                   ('§"$&(-.,', 'XY'),
                   ('1975', 'CARD'),
                   ('stronghold', 'FM'),
                   ('den', 'ART')]

        def labels_from(labeler):
            # Fresh token objects are created for every call.
            return [labeler.token_label(mt.ConstituentTerminal(form, pos))
                    for form, pos in samples]

        label = labels_from(labeling)

        serialization = labeling.serialize()
        print(serialization)

        instance2 = tl.deserialize_labeling(serialization)
        # The labels must come from the deserialized instance; querying the
        # original labeling again would make the comparison below vacuous.
        label2 = labels_from(instance2)
        print(label)
        self.assertEqual(label, label2)
        self.assertIsInstance(instance2, labeling.__class__)
    def test_fallback_labeling(self):
        """Round-trip a ``FrequencyBiasedTerminalLabeling`` through JSON text.

        The labeling (built from ``FormTerminals`` and ``PosTerminals`` with
        threshold 2) is serialized, dumped as JSON into an in-memory buffer,
        loaded back and deserialized. The restored instance must have the
        same class and label both sample tokens as the original does.
        """
        corpus_path = "res/TIGER/tiger21/tigertraindev_root_attach.export"
        sentence_names = [str(x) for x in range(50) if x % 10 > 1]
        corpus = np.sentence_names_to_hybridtrees(
            sentence_names, corpus_path, disconnect_punctuation=False)

        labeling = tl.FrequencyBiasedTerminalLabeling(
            tl.FormTerminals(), tl.PosTerminals(), corpus=corpus, threshold=2)
        print(labeling.fine_label_count)

        tokens = [mt.ConstituentTerminal('Milliardär', 'NN'),
                  mt.ConstituentTerminal('Tisch', 'NN')]
        expected_labels = [labeling.token_label(token) for token in tokens]

        # Write the serialization as JSON and rewind so it can be read back.
        buffer = io.StringIO()
        json.dump(labeling.serialize(), buffer)
        buffer.seek(0)
        print(buffer.getvalue())

        restored = tl.deserialize_labeling(json.load(buffer))

        self.assertTrue(isinstance(restored, labeling.__class__))
        for expected, token in zip(expected_labels, tokens):
            self.assertEqual(expected, restored.token_label(token))
    def test_unk4_labeling(self):
        """Round-trip ``tl.UNK4`` labelings, first without and then with POS.

        For ``use_pos=False`` and then ``use_pos=True``, a labeling is built
        from the corpus sample, serialized and deserialized. The restored
        instance must produce the same labels for the sample word forms and
        be of the same class as the original.
        """
        corpus_path = "res/TIGER/tiger21/tigertraindev_root_attach.export"
        sentence_names = [str(x) for x in range(50) if x % 10 > 1]
        corpus = np.sentence_names_to_hybridtrees(sentence_names, corpus_path,
                                                  disconnect_punctuation=False)

        # Word forms to label; every one is paired with the POS tag 'NN'.
        forms = ('Tisch', 'TISCH', '§"$&(-.,', 'Ätsch', 'Milliardär')

        def labels_from(labeler):
            # Fresh token objects are created for every call.
            return [labeler.token_label(mt.ConstituentTerminal(form, 'NN'))
                    for form in forms]

        for use_pos in (False, True):
            labeling = tl.UNK4(trees=corpus, threshold=2, use_pos=use_pos)
            expected = labels_from(labeling)

            serialization = labeling.serialize()
            print(serialization)

            restored = tl.deserialize_labeling(serialization)
            actual = labels_from(restored)
            print(expected)
            self.assertEqual(expected, actual)
            self.assertTrue(isinstance(restored, labeling.__class__))
    def test_simple_labelings(self):
        """Check that argument-free labelings deserialize to their own class.

        ``PosTerminals``, ``FormTerminals`` and ``CPosTerminals`` are each
        instantiated without arguments, serialized, and deserialized; the
        result must be an instance of the class that was serialized.
        """
        candidates = (tl.PosTerminals, tl.FormTerminals, tl.CPosTerminals)
        for cls in candidates:
            serialization = cls().serialize()
            print(serialization)

            restored = tl.deserialize_labeling(serialization)

            self.assertTrue(isinstance(restored, cls))
    def test_composition_labeling(self):
        """Round-trip a ``CompositionalTerminalLabeling`` and compare labels.

        The labeling combines ``FormTerminals`` and ``PosTerminals`` with the
        binding string ``'/'``. After serialization and deserialization the
        restored instance must have the same class and assign the same label
        to the sample token.
        """
        composed = tl.CompositionalTerminalLabeling(
            tl.FormTerminals(), tl.PosTerminals(), binding_string='/')
        sample = mt.ConstituentTerminal('Tisch', 'NN')
        expected = composed.token_label(sample)
        print(sample, expected)

        serialization = composed.serialize()
        print(serialization)

        restored = tl.deserialize_labeling(serialization)
        self.assertTrue(isinstance(restored, composed.__class__))
        self.assertEqual(expected, restored.token_label(sample))