def test_fallback_labeling(self):
        file = "res/TIGER/tiger21/tigertraindev_root_attach.export"
        corpus = np.sentence_names_to_hybridtrees(
            [str(x) for x in range(50) if x % 10 > 1],
            file,
            disconnect_punctuation=False)
        labeling = tl.FrequencyBiasedTerminalLabeling(tl.FormTerminals(),
                                                      tl.PosTerminals(),
                                                      corpus=corpus,
                                                      threshold=2)
        print(labeling.fine_label_count)

        token1 = mt.ConstituentTerminal('Milliardär', 'NN')
        token2 = mt.ConstituentTerminal('Tisch', 'NN')
        label1 = labeling.token_label(token1)
        label2 = labeling.token_label(token2)

        f = io.StringIO()
        json.dump(labeling.serialize(), f)
        f.seek(0)
        print(f.getvalue())

        instance2 = tl.deserialize_labeling(json.load(f))

        self.assertTrue(isinstance(instance2, labeling.__class__))
        self.assertEqual(label1, instance2.token_label(token1))
        self.assertEqual(label2, instance2.token_label(token2))
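The fallback scheme this test exercises can be summarized in a few lines. The sketch below illustrates the presumed behavior of FrequencyBiasedTerminalLabeling; the threshold semantics and helper name are assumptions for illustration, not the library's API:

from collections import Counter

def frequency_biased_label(form, pos, form_counts, threshold):
    # Presumed rule: keep the concrete form if it occurred often enough
    # in the training corpus, otherwise fall back to the coarser POS tag.
    return form if form_counts[form] > threshold else pos

form_counts = Counter({'Tisch': 5, 'Milliardär': 1})  # toy counts
assert frequency_biased_label('Tisch', 'NN', form_counts, 2) == 'Tisch'
assert frequency_biased_label('Milliardär', 'NN', form_counts, 2) == 'NN'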
Example #2
 def read_corpus_export(self, resource, mode="STANDARD", skip_normalization=False):
     """
     :type resource: CorpusFile
     :param mode: either STANDARD or DISCO-DOP (handles variation in NEGRA format)
     :type mode: str
     :param skip_normalization: if True, normalization is skipped even if it is enabled in the induction settings
     :type skip_normalization: bool
     :return: corpus of constituent trees
     """
     if resource.filter is None:
         def sentence_filter(_):
             return True
     else:
         sentence_filter = resource.filter
     path = resource.path
     if not skip_normalization and self.induction_settings.normalize:
         path = self.normalize_corpus(path, src='export', dest='export', renumber=False)
     # encoding = "iso-8859-1"
     encoding = "utf-8"
     return np.sentence_names_to_hybridtrees(
         {str(i) for i in range(resource.start, resource.end + 1)
          if i not in resource.exclude and sentence_filter(i)},
         path,
         enc=encoding, disconnect_punctuation=self.induction_settings.disconnect_punctuation, add_vroot=True,
         mode=mode)
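A minimal usage sketch follows; the CorpusFile fields are inferred from the attribute accesses above, and the concrete values, as well as the 'experiment' instance owning this method, are made up:

# Hypothetical call site: CorpusFile is assumed to expose path, start,
# end, exclude and filter, exactly as read_corpus_export reads them.
resource = CorpusFile()
resource.path = 'res/TIGER/tiger21/tigertraindev_root_attach.export'
resource.start, resource.end = 1, 50
resource.exclude = set()
resource.filter = None          # keep every sentence in [start, end]
corpus = experiment.read_corpus_export(resource, mode="STANDARD")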
    def test_suffix_labeling(self):
        file = "res/TIGER/tiger21/tigertraindev_root_attach.export"
        corpus = np.sentence_names_to_hybridtrees([str(x) for x in range(50) if x % 10 > 1], file,
                                                  disconnect_punctuation=False)

        labeling = tl.Suffix(trees=corpus, threshold=2)

        label = [labeling.token_label(mt.ConstituentTerminal('Tisch', 'NN')),
                 labeling.token_label(mt.ConstituentTerminal('TISCH', 'NN')),
                 labeling.token_label(mt.ConstituentTerminal('§"$&(-.,', 'XY')),
                 labeling.token_label(mt.ConstituentTerminal('1975', 'CARD')),
                 labeling.token_label(mt.ConstituentTerminal('stronghold', 'FM')),
                 labeling.token_label(mt.ConstituentTerminal('den', 'ART'))
                 ]

        serialization = labeling.serialize()
        print(serialization)

        instance2 = tl.deserialize_labeling(serialization)
        label2 = [instance2.token_label(mt.ConstituentTerminal('Tisch', 'NN')),
                  instance2.token_label(mt.ConstituentTerminal('TISCH', 'NN')),
                  instance2.token_label(mt.ConstituentTerminal('§"$&(-.,', 'XY')),
                  instance2.token_label(mt.ConstituentTerminal('1975', 'CARD')),
                  instance2.token_label(mt.ConstituentTerminal('stronghold', 'FM')),
                  instance2.token_label(mt.ConstituentTerminal('den', 'ART'))]
        print(label)
        self.assertEqual(label, label2)
        self.assertTrue(isinstance(instance2, labeling.__class__))
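tl.Suffix presumably maps rare forms to a short suffix class while frequent forms keep their identity. A self-contained sketch of that idea; suffix length and threshold semantics are guesses for illustration, not the library's exact rule:

from collections import Counter

def suffix_label(form, form_counts, threshold=2, suffix_len=2):
    # Frequent forms survive unchanged; rare forms collapse to a
    # suffix-based class (assumed scheme).
    if form_counts[form] >= threshold:
        return form
    return '_UNK-' + form[-suffix_len:].lower()

counts = Counter({'den': 7})
print(suffix_label('den', counts))          # 'den'
print(suffix_label('stronghold', counts))   # '_UNK-ld'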
 def test_brown_cluster_labeling(self):
     clustering = "clustering/tiger_final.clustering"
     file = "res/TIGER/tiger21/tigertraindev_root_attach.export"
     corpus = np.sentence_names_to_hybridtrees([str(x) for x in range(50) if x % 10 > 1], file,
                                               disconnect_punctuation=False)
     unk_strat = tl.UNKStrategySuffix(2)
     labeling = tl.BrownCluster(clustering=clustering, trees=corpus,
                                unk_strategy=unk_strat,
                                cluster_occurence_threshold=100)
     label = [labeling.token_label(mt.ConstituentTerminal('Auskunft', 'NN')),
              labeling.token_label(mt.ConstituentTerminal('1962', 'NN')),
              labeling.token_label(mt.ConstituentTerminal('§"$&(-.,', 'XY')),
              labeling.token_label(mt.ConstituentTerminal('1975', 'CARD')),
              labeling.token_label(mt.ConstituentTerminal('um', 'FM')),
              labeling.token_label(mt.ConstituentTerminal('den', 'ART'))
              ]
     print(label)
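A Brown clustering file (e.g., as produced by Liang's wcluster tool) is conventionally one tab-separated triple per line: cluster bit-string, word, count. A sketch of loading such a file into a word-to-cluster map, assuming that format for tiger_final.clustering and guessing at the occurrence-threshold semantics:

def load_clustering(path, occurrence_threshold=100):
    # Map each word to its cluster bit-string, then drop clusters whose
    # total count falls below the threshold (assumed interpretation).
    word2cluster, cluster_counts = {}, {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            bits, word, count = line.rstrip('\n').split('\t')
            word2cluster[word] = bits
            cluster_counts[bits] = cluster_counts.get(bits, 0) + int(count)
    return {w: c for w, c in word2cluster.items()
            if cluster_counts[c] >= occurrence_threshold}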
    def test_unk4_labeling(self):
        file = "res/TIGER/tiger21/tigertraindev_root_attach.export"
        corpus = np.sentence_names_to_hybridtrees([str(x) for x in range(50) if x % 10 > 1], file,
                                                  disconnect_punctuation=False)

        labeling = tl.UNK4(trees=corpus, threshold=2, use_pos=False)

        label = [labeling.token_label(mt.ConstituentTerminal('Tisch', 'NN')),
                 labeling.token_label(mt.ConstituentTerminal('TISCH', 'NN')),
                 labeling.token_label(mt.ConstituentTerminal('§"$&(-.,', 'NN')),
                 labeling.token_label(mt.ConstituentTerminal('Ätsch', 'NN')),
                 labeling.token_label(mt.ConstituentTerminal('Milliardär', 'NN'))]

        serialization = labeling.serialize()
        print(serialization)

        instance2 = tl.deserialize_labeling(serialization)
        label2 = [instance2.token_label(mt.ConstituentTerminal('Tisch', 'NN')),
                  instance2.token_label(mt.ConstituentTerminal('TISCH', 'NN')),
                  instance2.token_label(mt.ConstituentTerminal('§"$&(-.,', 'NN')),
                  instance2.token_label(mt.ConstituentTerminal('Ätsch', 'NN')),
                  instance2.token_label(mt.ConstituentTerminal('Milliardär', 'NN'))]
        print(label)
        self.assertEqual(label, label2)
        self.assertTrue(isinstance(instance2, labeling.__class__))

        labeling = tl.UNK4(trees=corpus, threshold=2, use_pos=True)

        label = [labeling.token_label(mt.ConstituentTerminal('Tisch', 'NN')),
                 labeling.token_label(mt.ConstituentTerminal('TISCH', 'NN')),
                 labeling.token_label(mt.ConstituentTerminal('§"$&(-.,', 'NN')),
                 labeling.token_label(mt.ConstituentTerminal('Ätsch', 'NN')),
                 labeling.token_label(mt.ConstituentTerminal('Milliardär', 'NN'))]

        serialization = labeling.serialize()
        print(serialization)

        instance2 = tl.deserialize_labeling(serialization)
        label2 = [instance2.token_label(mt.ConstituentTerminal('Tisch', 'NN')),
                  instance2.token_label(mt.ConstituentTerminal('TISCH', 'NN')),
                  instance2.token_label(mt.ConstituentTerminal('§"$&(-.,', 'NN')),
                  instance2.token_label(mt.ConstituentTerminal('Ätsch', 'NN')),
                  instance2.token_label(mt.ConstituentTerminal('Milliardär', 'NN'))]
        print(label)
        self.assertEqual(label, label2)
        self.assertTrue(isinstance(instance2, labeling.__class__))
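UNK4 looks like a Berkeley-parser-style unknown-word signature: rare words are replaced by a string built from surface features such as capitalization, digits, hyphens, and a suffix. A simplified sketch of that general scheme, not the library's exact rules:

def unk_signature(form):
    # Build a signature from surface features of an unknown word.
    sig = '_UNK'
    if form[0].isupper():
        sig += '-C'                    # capitalized
    if any(c.isdigit() for c in form):
        sig += '-D'                    # contains a digit
    if '-' in form:
        sig += '-H'                    # contains a hyphen
    if form[-1].isalpha():
        sig += '-' + form[-1].lower()  # final-letter cue
    return sig

print(unk_signature('Milliardär'))     # '_UNK-C-r'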
def main():
    inpath = sys.argv[1]
    outpath = sys.argv[2]
    begin = int(sys.argv[3])
    end = int(sys.argv[4])
    print(inpath)
    sent_ids = [str(i) for i in range(begin, end + 1)]
    corpus = sentence_names_to_hybridtrees(sent_ids, inpath)
    # map() is lazy in Python 3; iterate explicitly so strip_vroot runs.
    for tree in corpus:
        tree.strip_vroot()
    with codecs.open(outpath, mode='w', encoding="utf-8") as file:
        lines = serialize_hybridtrees_to_negra(corpus, begin, 2000)
        for line in lines:
            if not isinstance(line, str):
                print(line)
            try:
                file.write(line)
            except UnicodeEncodeError:
                print(line, type(line))
                raise
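The script lacks the usual entry-point guard; with it, invocation is, e.g., python script.py input.export output.export 1 2000 (script name assumed):

if __name__ == '__main__':
    main()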
Example #7
    def test_negra_to_dag_parsing(self):
        names = list(map(str, [26954]))

        fd_, primary_file = tempfile.mkstemp(suffix='.export')
        with open(primary_file, mode='w') as pf:

            for s in names:
                dsg = tp.sentence_names_to_deep_syntax_graphs(
                    ["s" + s],
                    "res/tiger/tiger_s%s.xml" % s,
                    hold=False,
                    ignore_puntcuation=False)[0]  # (sic) keyword spelled this way in the source
                dsg.set_label(dsg.label[1:])
                lines = np.serialize_hybrid_dag_to_negra(
                    [dsg], 0, 500, use_sentence_names=True)
                print(''.join(lines), file=pf)

        _, binarized_file = tempfile.mkstemp(suffix='.export')
        subprocess.call([
            "discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1",
            primary_file, binarized_file
        ])

        print(primary_file)
        print(binarized_file)

        corpus = np.sentence_names_to_hybridtrees(names,
                                                  primary_file,
                                                  secedge=True)
        corpus2 = np.sentence_names_to_hybridtrees(names,
                                                   binarized_file,
                                                   secedge=True)
        dag = corpus[0]
        print(dag)
        assert isinstance(dag, HybridDag)
        self.assertEqual(8, len(dag.token_yield()))
        for token in dag.token_yield():
            print(token.form() + '/' + token.pos(), end=' ')
        print()

        dag_bin = corpus2[0]
        print(dag_bin)

        for token in dag_bin.token_yield():
            print(token.form() + '/' + token.pos(), end=' ')
        print()
        self.assertEqual(8, len(dag_bin.token_yield()))

        for node, token in zip(
                dag_bin.nodes(),
                list(map(str, map(dag_bin.node_token, dag_bin.nodes())))):
            print(node, token)

        print()
        print(top(dag_bin, {'500', '101', '102'}))
        self.assertSetEqual({'101', '500'}, top(dag_bin,
                                                {'500', '101', '102'}))
        print(bottom(dag_bin, {'500', '101', '102'}))
        self.assertSetEqual({'502'}, bottom(dag_bin, {'500', '101', '102'}))
        grammar = direct_extract_lcfrs_from_prebinarized_corpus(dag_bin)
        print(grammar)

        parser = LCFRS_parser(grammar)

        poss = list(map(lambda x: x.pos(), dag_bin.token_yield()))
        print(poss)
        parser.set_input(poss)

        parser.parse()

        self.assertTrue(parser.recognized())

        der = parser.best_derivation_tree()
        print(der)

        dcp_term = DCP_evaluator(der).getEvaluation()

        print(dcp_term[0])

        dag_eval = HybridDag(dag_bin.sent_label())
        dcp_to_hybriddag(dag_eval,
                         dcp_term,
                         copy.deepcopy(dag_bin.token_yield()),
                         False,
                         construct_token=construct_constituent_token)

        print(dag_eval)
        for node in dag_eval.nodes():
            token = dag_eval.node_token(node)
            if token.type() == "CONSTITUENT-CATEGORY":
                label = token.category()
            elif token.type() == "CONSTITUENT-TERMINAL":
                label = token.form(), token.pos()

            print(node, label, dag_eval.children(node),
                  dag_eval.sec_children(node), dag_eval.sec_parents(node))

        lines = np.serialize_hybridtrees_to_negra([dag_eval],
                                                  1,
                                                  500,
                                                  use_sentence_names=True)
        for line in lines:
            print(line, end='')

        print()

        with open(primary_file) as pcf:
            for line in pcf:
                print(line, end='')
Example #8
    def test_negra_dag_small_grammar(self):
        DAG_CORPUS = 'res/tiger/tiger_full_with_sec_edges.export'
        DAG_CORPUS_BIN = 'res/tiger/tiger_full_with_sec_edges_bin_h1_v1.export'
        names = [str(i) for i in range(1, 101)]
        if not os.path.exists(DAG_CORPUS):
            print(
                'run the following command to create an export corpus with dags:'
            )
            print('\tPYTHONPATH=. util/tiger_dags_to_negra.py ' +
                  'res/tiger/tiger_release_aug07.corrected.16012013.xml ' +
                  DAG_CORPUS + ' 1 50474')
        self.assertTrue(os.path.exists(DAG_CORPUS))

        if not os.path.exists(DAG_CORPUS_BIN):
            print(
                'run the following command to binarize the export corpus with dags:'
            )
            print("discodop treetransforms --binarize -v 1 -h 1 " +
                  DAG_CORPUS + " " + DAG_CORPUS_BIN)
            # _, DAG_CORPUS_BIN = tempfile.mkstemp(prefix='corpus_bin_', suffix='.export')
            # subprocess.call(["discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1", DAG_CORPUS, DAG_CORPUS_BIN])
        self.assertTrue(os.path.exists(DAG_CORPUS_BIN))
        corpus = np.sentence_names_to_hybridtrees(names,
                                                  DAG_CORPUS,
                                                  secedge=True)
        corpus_bin = np.sentence_names_to_hybridtrees(names,
                                                      DAG_CORPUS_BIN,
                                                      secedge=True)

        grammar = LCFRS(start="START")

        for hybrid_dag, hybrid_dag_bin in zip(corpus, corpus_bin):
            self.assertEqual(len(hybrid_dag.token_yield()),
                             len(hybrid_dag_bin.token_yield()))

            dag_grammar = direct_extract_lcfrs_from_prebinarized_corpus(
                hybrid_dag_bin)
            grammar.add_gram(dag_grammar)

        grammar.make_proper()
        print(
            "Extracted LCFRS/DCP-hybrid grammar with %i nonterminals and %i rules"
            % (len(grammar.nonts()), len(grammar.rules())))

        parser = DiscodopKbestParser(grammar, k=1)

        _, RESULT_FILE = tempfile.mkstemp(prefix='parser_results_',
                                          suffix='.export')

        with open(RESULT_FILE, 'w') as results:
            for hybrid_dag in corpus:

                poss = list(map(lambda x: x.pos(), hybrid_dag.token_yield()))
                parser.set_input(poss)
                parser.parse()
                self.assertTrue(parser.recognized())
                der = parser.best_derivation_tree()

                dcp_term = DCP_evaluator(der).getEvaluation()
                dag_eval = HybridDag(hybrid_dag.sent_label())
                dcp_to_hybriddag(dag_eval,
                                 dcp_term,
                                 copy.deepcopy(hybrid_dag.token_yield()),
                                 False,
                                 construct_token=construct_constituent_token)
                lines = np.serialize_hybridtrees_to_negra(
                    [dag_eval], 1, 500, use_sentence_names=True)
                for line in lines:
                    print(line, end='', file=results)
                parser.clear()

        print("Wrote results to %s" % RESULT_FILE)
    def test_something(self):
        normal_corpus = 'res/tiger/tiger_8000.export'
        binarized_corpus = 'res/tiger/tiger_8000_bin.export'
        limit = 55000
        # limit = 30
        corpus_bin = sentence_names_to_hybridtrees(
            {str(x) for x in range(limit)},
            binarized_corpus,
            disconnect_punctuation=False,
            add_vroot=True,
            mode="DISCO-DOP")

        corpus = sentence_names_to_hybridtrees(
            {str(x) for x in range(limit)},
            normal_corpus,
            disconnect_punctuation=False,
            add_vroot=True,
            mode="DISCO-DOP")
        term_labeling = terminal_labeling(corpus, threshold=4)

        grammar = None

        for htree, htree_bin in zip(corpus, corpus_bin):
            # print(htree_bin)

            try:
                htree_grammar = direct_extract_lcfrs_from_prebinarized_corpus(
                    htree_bin, term_labeling=term_labeling)
            except Exception as e:
                print(e)
                print(htree_bin)
                print(htree_bin.nodes())
                print(htree_bin.word_yield())
                raise e
            # print(htree_grammar)

            parser_input = term_labeling.prepare_parser_input(
                htree.token_yield())

            p = LCFRS_sDCP_Parser(htree_grammar,
                                  terminal_labelling=term_labeling)
            p.set_input(htree)
            p.parse()
            # p = LCFRS_parser(htree_grammar, parser_input)
            self.assertTrue(p.recognized())

            derivs = list(p.all_derivation_trees())
            # print("derivations:", len(derivs))

            for der in derivs:
                dcp = DCP_evaluator(der).getEvaluation()
                sys_tree = HybridTree(htree.sent_label())

                sys_tree = dcp_to_hybridtree(
                    sys_tree,
                    dcp,
                    deepcopy(htree.token_yield()),
                    ignore_punctuation=False,
                    construct_token=construct_constituent_token)
                # print(sys_tree)
                # print(htree == sys_tree)
                # print(der)
                if htree != sys_tree:
                    print(htree.sent_label())
                    print(htree)
                    print(sys_tree)

                self.assertEqual(htree, sys_tree)

            if grammar is None:
                grammar = htree_grammar
            else:
                grammar.add_gram(htree_grammar)

            htree_grammar.make_proper()
            try:
                disco_parser = DiscodopKbestParser(htree_grammar)
            except ValueError as ve:
                print(ve)
                print(htree.sent_label())
                print(htree)
                print(htree_bin)
                print(htree_grammar)
                raise ve

        grammar.make_proper()
        disco_parser = DiscodopKbestParser(grammar)
Example #10
    def test_negra_to_dag_parsing(self):
        names = list(map(str, [26954]))

        fd_, primary_file = tempfile.mkstemp(suffix='.export')
        with open(primary_file, mode='w') as pf:

            for s in names:
                dsg = tp.sentence_names_to_deep_syntax_graphs(
                    ["s" + s],
                    "res/tiger/tiger_s%s.xml" % s,
                    hold=False,
                    ignore_puntcuation=False)[0]
                dsg.set_label(dsg.label[1:])
                lines = np.serialize_hybrid_dag_to_negra(
                    [dsg], 0, 500, use_sentence_names=True)
                print(''.join(lines), file=pf)

        _, binarized_file = tempfile.mkstemp(suffix='.export')
        subprocess.call([
            "discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1",
            primary_file, binarized_file
        ])

        print(primary_file)
        print(binarized_file)

        corpus = np.sentence_names_to_hybridtrees(names,
                                                  primary_file,
                                                  secedge=True)
        corpus2 = np.sentence_names_to_hybridtrees(names,
                                                   binarized_file,
                                                   secedge=True)
        dag = corpus[0]
        print(dag)

        assert isinstance(dag, HybridDag)
        self.assertEqual(8, len(dag.token_yield()))
        for token in dag.token_yield():
            print(token.form() + '/' + token.pos(), end=' ')
        print()

        dag_bin = corpus2[0]
        print(dag_bin)

        for token in dag_bin.token_yield():
            print(token.form() + '/' + token.pos(), end=' ')
        print()
        self.assertEqual(8, len(dag_bin.token_yield()))

        for node, token in zip(
                dag_bin.nodes(),
                list(map(str, map(dag_bin.node_token, dag_bin.nodes())))):
            print(node, token)

        print()
        print(top(dag_bin, {'500', '101', '102'}))
        self.assertSetEqual({'101', '500'}, top(dag_bin,
                                                {'500', '101', '102'}))
        print(bottom(dag_bin, {'500', '101', '102'}))
        self.assertSetEqual({'502'}, bottom(dag_bin, {'500', '101', '102'}))

        nont_labeling = BasicNonterminalLabeling()
        term_labeling = FormTerminals()  # PosTerminals()

        grammar = direct_extract_lcfrs_from_prebinarized_corpus(
            dag_bin, term_labeling, nont_labeling)
        # print(grammar)

        for rule in grammar.rules():
            print(rule.get_idx(), rule)

        print("Testing LCFRS parsing and DCP evaluation".center(80, '='))

        parser = LCFRS_parser(grammar)

        parser_input = term_labeling.prepare_parser_input(
            dag_bin.token_yield())
        print(parser_input)
        parser.set_input(parser_input)

        parser.parse()

        self.assertTrue(parser.recognized())

        der = parser.best_derivation_tree()
        print(der)

        dcp_term = DCP_evaluator(der).getEvaluation()

        print(dcp_term[0])

        dag_eval = HybridDag(dag_bin.sent_label())
        dcp_to_hybriddag(dag_eval,
                         dcp_term,
                         copy.deepcopy(dag_bin.token_yield()),
                         False,
                         construct_token=construct_constituent_token)

        print(dag_eval)
        for node in dag_eval.nodes():
            token = dag_eval.node_token(node)
            if token.type() == "CONSTITUENT-CATEGORY":
                label = token.category()
            elif token.type() == "CONSTITUENT-TERMINAL":
                label = token.form(), token.pos()

            print(node, label, dag_eval.children(node),
                  dag_eval.sec_children(node), dag_eval.sec_parents(node))

        lines = np.serialize_hybridtrees_to_negra([dag_eval],
                                                  1,
                                                  500,
                                                  use_sentence_names=True)
        for line in lines:
            print(line, end='')

        print()

        with open(primary_file) as pcf:
            for line in pcf:
                print(line, end='')

        print('Testing reduct computation with Schick parser'.center(80, '='))

        grammar_path = '/tmp/lcfrs_dcp_grammar.gr'
        derivation_manager = PyDerivationManager(grammar)

        with open(grammar_path, 'w') as grammar_file:
            nonterminal_enc, terminal_enc = linearize(
                grammar,
                nont_labeling,
                term_labeling,
                grammar_file,
                delimiter=' : ',
                nonterminal_encoder=derivation_manager.get_nonterminal_map())

        print(np.negra_to_json(dag, terminal_enc, term_labeling))
        json_data = np.export_corpus_to_json([dag], terminal_enc,
                                             term_labeling)

        corpus_path = '/tmp/json_dags.json'
        with open(corpus_path, 'w') as data_file:
            json.dump(json_data, data_file)

        reduct_dir = '/tmp/schick_parser_reducts'
        if os.path.isdir(reduct_dir):
            shutil.rmtree(reduct_dir)
        os.makedirs(reduct_dir)

        # Pass the argv list directly instead of joining it for a shell.
        p = subprocess.Popen(
            ["java", "-jar",
             os.path.join("util", SCHICK_PARSER_JAR),
             "reduct", "-g", grammar_path,
             "-t", corpus_path, "--input-format", "json",
             "-o", reduct_dir],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
        print("stdout", p.stdout.name)

        while True:
            nextline = p.stdout.readline()
            if nextline == b'' and p.poll() is not None:
                break
            print(nextline.decode('unicode_escape'), end='')
            # sys.stdout.write(nextline)
            # sys.stdout.flush()

        p.wait()
        p.stdout.close()
        self.assertEqual(0, p.returncode)
        rtgs = []

        def decode_nonterminals(s):
            return derivation_manager.get_nonterminal_map().index_object(
                int(s))

        for i in range(1, len(corpus) + 1):
            rtgs.append(
                read_rtg(os.path.join(reduct_dir,
                                      str(i) + '.gra'),
                         symbol_offset=-1,
                         rule_prefix='r',
                         process_nonterminal=decode_nonterminals))

        print("Reduct RTG")
        for rule in rtgs[0].rules:
            print(rule.lhs, "->", rule.symbol, rule.rhs)

        derivation_manager.get_nonterminal_map().print_index()
        derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
        derivation_manager.serialize(
            bytes('/tmp/reduct_manager.trace', encoding='utf8'))
        derivations = [
            LCFRSDerivationWrapper(der)
            for der in derivation_manager.enumerate_derivations(0, grammar)
        ]
        self.assertGreaterEqual(len(derivations), 1)

        if len(derivations) >= 1:
            print("Sentence", dag.sent_label())
            for der in derivations:
                print(der)
                self.assertTrue(
                    der.check_integrity_recursive(der.root_id(),
                                                  grammar.start()))
Example #11
 def get_whole_corpus(self, n=N_NEGRA_SENTENCES):
     return sentence_names_to_hybridtrees(
         names=[num_to_name(num) for num in range(n + 1)], path=NEGRA_PATH)
Example #12
 def get_shortest_tree(self):
     trees = sentence_names_to_hybridtrees(
         names=[num_to_name(num) for num in range(N_NEGRA_SENTENCES)],
         path=NEGRA_PATH)
     return min(trees, key=lambda tree: tree.n_yield_nodes())
Example #13
 def get_trees_for_single_sentence(sentence_id=37):
     # Renamed parameter: 'id' shadows the built-in of the same name.
     return sentence_names_to_hybridtrees(names=[num_to_name(sentence_id)],
                                          path=NEGRA_PATH)