Esempio n. 1
0
    def test_ds_project(self):
        """
        Project dependency structures for the first two instances of
        ``xigt/index_error.xml`` and compare each against a hand-written tree.

        Both instances go through the same pipeline: heuristic alignment,
        parsing of the translation line, and projection of the dependency
        tier onto the language line.
        """
        xc = xc_load(os.path.join(testfile_dir, 'xigt/index_error.xml'), do_basic_processing=True)

        # --- First instance ---
        inst = xc[0]
        heur_align_inst(inst)
        parse_translation_line(inst, dt=True)
        project_ds_tier(inst)
        proj_t = get_lang_ds(inst)

        tgt_t = DepTree.fromstring("""(ROOT[0] (salli-i[2] (Jumala[1]) (sata-a[4] ([[3])) (rake-i-ta[5]) (ja[6]) (tuhka-a[7] (].[8]))))""", stype=DEPSTR_PTB)

        self.assertTrue(tgt_t.similar(proj_t))

        # --- Second instance ---
        inst2 = xc[1]
        heur_align_inst(inst2)
        parse_translation_line(inst2, dt=True)
        project_ds_tier(inst2)

        # Dump the instance so a failing comparison below can be diagnosed.
        print(inst2)

        tgt2_t = DepTree.fromstring("""(ROOT[0]
                                            (unohta-a[2] (*Minua[1])
                                                (unohda-n[4]
                                                    (/Minä[3])
                                                    (/laula-tta-a[6] (pelo-tta-a[5]))
                                                )
                                            ))
                                        """, stype=DEPSTR_PTB)

        # Compare the trees explicitly. ``assertTrue(a, b)`` would treat ``b``
        # as the failure message and only check that ``a`` is truthy.
        self.assertTrue(tgt2_t.similar(get_lang_ds(inst2)))
Esempio n. 2
0
    def test_conll(self):
        ds = get_lang_ds(self.inst2)
        conll_str = ds.to_conll(lowercase=False)

        s = """
1	Was	_	PRON	PRON	_	2	_	_	_
2	glaubst	_	VERB	VERB	_	0	root	_	_
3	Du	_	PRON	PRON	_	2	nsubj	_	_
4	wer	_	PRON	PRON	_	2	dobj	_	_
5	angerufen	_	VERB	VERB	_	2	dep	_	_
6	hat	_	VERB	VERB	_	5	_	_	_"""

        self.assertEqual(conll_str.strip(), s.strip())
Esempio n. 3
0
        def eval_method(aln_method):
            """
            Record, in ``plma``, how many gold dependency edges are recovered
            when projecting with the given alignment method.

            The gold edge count is always registered as "compares". The
            matched edge count is registered only if projection succeeds;
            a ``TreeProjectionError`` leaves the matches at zero.
            """
            reference_edges = set(gold_ds.to_indices())

            # Register the denominator first (0 matches, N compares) so that
            # a failed projection still counts against this method.
            plma.add(lang, aln_method, 0, len(reference_edges))

            try:
                project_ds_tier(inst, proj_aln_method=aln_method, ds_source=INTENT_DS_PARSER)
                projected = get_lang_ds(inst, parse_method=INTENT_DS_PROJ)
                projected_edges = set(projected.to_indices())
                overlap = reference_edges.intersection(projected_edges)
                # Now the numerator: N matches, 0 additional compares.
                plma.add(lang, aln_method, len(overlap), 0)
            except TreeProjectionError:
                # Projection failed: nothing further to record for this method.
                return
Esempio n. 4
0
def evaluate_ds_projections_on_file(lang, xc, plma, outstream=sys.stdout):
    """
    Score projected dependency structures against the manual (gold) ones for
    every instance in ``xc``, once per alignment method, accumulating the
    results in ``plma``.

    Instances without a gold dependency structure are skipped. For each
    remaining instance and each alignment method, the number of gold edges is
    added as "compares"; if projection succeeds, the number of edges shared
    between gold and projected structures is added as "matches".

    :param lang: Language key under which results are recorded in ``plma``.
    :param xc: Iterable of instances to evaluate.
    :param plma: Accumulator updated via ``plma.add(lang, method, matches, compares)``.
    :type plma: PerLangMethodAccuracies
    :param outstream: Currently unused; retained so existing callers keep working.
    """
    aln_methods = (
        INTENT_ALN_GIZA,
        INTENT_ALN_GIZAHEUR,
        INTENT_ALN_HEUR,
        INTENT_ALN_HEURPOS,
        INTENT_ALN_MANUAL,
    )

    for inst in xc:
        gold_ds = get_lang_ds(inst, parse_method=INTENT_DS_MANUAL)
        if not gold_ds:
            continue

        # The gold edges do not depend on the alignment method, so build the
        # set once per instance instead of once per method.
        gold_edges = set(gold_ds.to_indices())

        for aln_method in aln_methods:
            # Register the compares up front (with 0 matches) so a failed
            # projection still counts against this method.
            plma.add(lang, aln_method, 0, len(gold_edges))

            try:
                project_ds_tier(inst, proj_aln_method=aln_method, ds_source=INTENT_DS_PARSER)
                ds = get_lang_ds(inst, parse_method=INTENT_DS_PROJ)
                tgt_edges = set(ds.to_indices())
                # Add the matches with 0 compares, since the compares were
                # already recorded above.
                plma.add(lang, aln_method, len(gold_edges & tgt_edges), 0)
            except TreeProjectionError:
                # Deliberate best effort: a projection failure simply yields
                # zero matches for this instance/method pair.
                continue
Esempio n. 5
0
def extract_parser_from_instance(inst: Igt, output_stream, pos_source, tm):
    """
    Given an IGT instance, extract the projected dependency structure from
    it (along with the POS tags from the given pos_source) and write it to
    ``output_stream`` in CoNLL format, followed by a blank line.

    :param inst: Input instance
    :param output_stream: The output stream to write the training data to.
    :param pos_source: POS tag source forwarded to ``get_lang_ds``.
    :param tm: Tag map forwarded to ``to_conll`` as ``tagmap``.
    :return: 1 if a dependency structure was found and written, otherwise 0
        (including when a ``RuntimeError`` or ``RGXigtException`` was caught).
    """
    extracted = 0
    try:
        ds = get_lang_ds(inst, pos_source=pos_source, unk_pos_handling=None)
        if ds is not None:
            conll_string = to_conll(ds, lang(inst), lowercase=True, match_punc=True, clean_token=True, unk_pos='UNK', tagmap=tm)
            output_stream.write(conll_string+'\n\n')
            output_stream.flush()
            extracted += 1

    except RuntimeError as err:
        # The log message does not include the exception text, so it is
        # still echoed to stdout as before.
        print(err)
        EXTRACT_LOG.error("Runtime error in instance {}".format(inst.id))
    except RGXigtException as rgxe:
        # ``Logger.warn`` is a deprecated alias; ``warning`` is the supported name.
        EXTRACT_LOG.warning('Instance "{}" failed with "{}"'.format(inst.id, rgxe))

    return extracted
Esempio n. 6
0
 def all_tokens_present_test(self):
     """
     Project a dependency structure onto ``self.inst`` and print the
     language-line result in CoNLL format.

     NOTE(review): this makes no assertions; it only prints the output.
     """
     project_ds_tier(self.inst)
     projected_ds = get_lang_ds(self.inst)
     conll_text = projected_ds.to_conll()
     print(conll_text)