Beispiel #1
0
    def test_align_extract(self):
        """
        Compare the trans/gloss alignment returned for each method with a fixed expectation.

        Works on a copy of the first instance of ``self.xc`` and asks
        ``get_trans_gloss_alignment`` for the heuristic, heuristic+POS, giza
        and giza+heuristic alignments, in that order.
        """
        instance = copy_xigt(self.xc[0])

        # Fetch every alignment up front, before the first assertion runs.
        methods = (INTENT_ALN_HEUR, INTENT_ALN_HEURPOS, INTENT_ALN_GIZA, INTENT_ALN_GIZAHEUR)
        found = [get_trans_gloss_alignment(instance, aln_method=method) for method in methods]

        # Expected alignments, listed in the same order as `methods`.
        expected = [
            Alignment([(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (8, 7), (11, 8)]),
            Alignment([(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 12), (8, 7), (9, 10), (11, 8), (12, 10), (13, 11)]),
            Alignment([(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 5), (7, 5), (9, 7), (10, 12), (11, 8), (12, 12), (13, 11), (14, 12)]),
            Alignment([(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 5), (7, 5), (9, 7), (10, 12), (11, 8), (12, 12), (13, 11), (14, 11)]),
        ]

        for actual, wanted in zip(found, expected):
            self.assertEqual(actual, wanted)
Beispiel #2
0
    def add_corpus(self, name, method, lang, xc, lang_trans=False):
        """
        Register the alignment of every gold-aligned instance of a corpus.

        Instances for which ``get_trans_gloss_alignment`` returns ``None`` for
        the manual alignment method are skipped. For every other instance the
        alignment is fetched with ``aln_method=method`` (trans/lang when
        ``lang_trans`` is true, trans/gloss otherwise) and handed to
        ``self.add_alignment`` together with ``name``, ``lang`` and the
        instance id.

        :type method: str
        :type xc: RGCorpus
        """
        for instance in xc:
            # Without a gold standard alignment there is nothing to compare
            # against, so the instance is not recorded.
            if get_trans_gloss_alignment(instance, aln_method=ARG_ALN_MANUAL) is None:
                continue

            if not lang_trans:
                alignment = get_trans_gloss_alignment(instance, aln_method=method)
            else:
                alignment = get_trans_lang_alignment(instance, aln_method=method)

            self.add_alignment(name, lang, instance.id, alignment)
Beispiel #3
0
    def test_read_proj_ds_tree(self):
        """
        Project the dependency structure of ``self.inst2`` and check its shape.

        The tree obtained from ``get_ds`` for the translation is projected
        with ``project_ds`` using the result of ``lang(self.inst2)`` and the
        trans/gloss alignment; the result must be structurally equal to the
        tree spelled out below.
        """
        source_tree = get_ds(self.inst2, trans(self.inst2))
        target_words = lang(self.inst2)
        alignment = get_trans_gloss_alignment(self.inst2)

        expected_text = """
        (ROOT[0]
            (glaubst[2]
                (Was[1])
                (Du[3])
                (wer[4])
                (angerufen[5] (hat[6]))
            ))
        """
        expected_tree = DepTree.fromstring(expected_text, stype=DEPSTR_PTB)

        projected_tree = project_ds(source_tree, target_words, alignment)

        self.assertTrue(projected_tree.structurally_eq(expected_tree))
Beispiel #4
0
def evaluate_heuristic_methods_on_file(f, xc, mas, classifier_obj, tagger_obj, lang, pool=None, lock=None):
    """
    Run a series of heuristic alignment configurations over a corpus.

    Every instance of ``xc`` for which a manual trans/gloss alignment can be
    retrieved is aligned once per configuration, each time on a fresh copy of
    the instance, and the result is recorded through ``mas.add_alignment``
    under the configuration's label.

    :param f: path of the file being evaluated; only its basename is logged.
    :param xc: iterable of instances to evaluate.
    :param mas: collector receiving ``add_alignment(label, lang, inst_id, aln)``.
    :param classifier_obj: passed to ``classify_gloss_pos`` for the POS run.
    :param tagger_obj: passed to ``tag_trans_pos`` for the POS run.
    :param lang: language key forwarded to ``mas.add_alignment``.
    :param pool: not used by this function.
    :param lock: not used by this function.
    """
    EVAL_LOG.info('Evaluating heuristic methods on file "{}"'.format(os.path.basename(f)))

    # (label, keyword arguments for heur_align_inst), in reporting order.
    # `grams` is only passed where a configuration sets it explicitly.
    plain_runs = (
        ('baseline',
         dict(lowercase=False, stem=False, tokenize=False, no_multiples=True, use_pos=False)),
        ('lowercasing',
         dict(lowercase=True, stem=False, tokenize=False, no_multiples=True, use_pos=False)),
        ('Tokenization',
         dict(lowercase=True, stem=False, tokenize=True, no_multiples=True, use_pos=False)),
        ('Multiple Matches',
         dict(lowercase=True, stem=False, tokenize=True, no_multiples=False, use_pos=False)),
        ('Morphing',
         dict(lowercase=True, stem=True, tokenize=True, no_multiples=False, use_pos=False)),
        ('Grams',
         dict(lowercase=True, stem=True, tokenize=True, no_multiples=False, grams=True, use_pos=False)),
    )
    pos_settings = dict(lowercase=True, stem=True, tokenize=True, no_multiples=False, grams=True, use_pos=True)

    for inst in xc:
        # Only evaluate against instances that have a gold alignment.
        if get_trans_gloss_alignment(inst, aln_method=INTENT_ALN_MANUAL) is None:
            continue

        EVAL_LOG.debug('Running heuristic alignments on instance "{}"'.format(inst.id))

        for label, settings in plain_runs:
            aligned = heur_align_inst(copy_xigt(inst), **settings)
            mas.add_alignment(label, lang, inst.id, aligned)

        # The POS-aware run needs gloss and translation tags on its copy first.
        tagged_copy = copy_xigt(inst)
        classify_gloss_pos(tagged_copy, classifier_obj)
        tag_trans_pos(tagged_copy, tagger_obj)
        aligned = heur_align_inst(tagged_copy, **pos_settings)
        mas.add_alignment('POS', lang, inst.id, aligned)
Beispiel #5
0
def enrich(**kwargs):
    """
    Enrich the instances of a Xigt corpus file and write the result out.

    The corpus is loaded incrementally from ``kwargs[ARG_INFILE]``. Depending
    on the requested options, the translation line is POS tagged and parsed,
    the gloss line is classified, gloss/lang and trans/gloss alignments are
    added, and POS tags and parse structures are projected. A per-instance
    status line ("OK" or "WARN" plus the failure reasons) is logged, and the
    corpus is finally dumped to ``kwargs[ARG_OUTFILE]``.

    Keyword arguments read here:

    * ``ARG_OUTFILE`` -- required; either an object with a ``write``
      attribute or a value that is passed to ``writefile``.
    * ``ARG_INFILE`` -- path of the input file, opened as UTF-8.
    * ``PARSE_VAR``, ``POS_VAR``, ``ALN_VAR`` -- requested parse, POS and
      alignment methods (each defaults to an empty list).
    * ``ALN_SYM_VAR`` -- symmetrization passed to the giza aligner
      (defaults to ``SYMMETRIC_INTERSECT``).
    * ``'max_parse_length'`` -- translations longer than this are not parsed
      (defaults to 10).
    * ``'proj_aln'`` -- key into ``ALN_ARG_MAP`` selecting the alignment used
      for projection (defaults to ``ARG_ALN_ANY``).
    * ``'class_path'`` -- read, but not used by this function.

    :raises SystemExit: with status 2 when no output file is given, when the
        tagger cannot be initialized, or on a ``CriticalTaggerError``.
    :raises GizaAlignmentException: re-raised after being logged.
    """

    global classifier

    if ARG_OUTFILE not in kwargs:
        ENRICH_LOG.critical("No output file specified.")
        # This is a fatal error: exit non-zero, like the other fatal paths
        # below, so that calling scripts do not see a successful run.
        sys.exit(2)

    # =============================================================================
    # Set up the alternate classifier path...
    # =============================================================================

    # NOTE(review): class_path is read here but never used; the classifier
    # below is built from the module-level `classifier`. Confirm whether
    # class_path was meant to override it.
    class_path = kwargs.get('class_path')

    #===========================================================================
    # Set up the different arguments...
    #===========================================================================
    inpath = kwargs.get(ARG_INFILE)

    parse_args = kwargs.get(PARSE_VAR, [])
    pos_args = kwargs.get(POS_VAR, [])
    aln_args = kwargs.get(ALN_VAR, [])

    max_parse_length = kwargs.get('max_parse_length', 10)

    if not (parse_args or pos_args or aln_args):
        ENRICH_LOG.warning("No enrichment specified. Basic processing only will be performed.")

    #===========================================================================
    # Sanity check the arguments.
    #===========================================================================

    # Check that alignment is asked for if projection is asked for.
    if (ARG_POS_PROJ in pos_args or ARG_PARSE_PROJ in parse_args) and (not aln_args):
        # Logger.warn is a deprecated alias that no longer exists in
        # Python 3.13; Logger.warning is the supported spelling.
        ENRICH_LOG.warning("You have asked for projection methods but have not requested "
                           "alignments to be generated. Projection may fail if alignment not already present in file.")

    ENRICH_LOG.log(1000, 'Loading input file...')
    with open(inpath, 'r', encoding='utf-8') as in_f:
        corp = xigtxml.load(in_f, mode=INCREMENTAL)

        # -------------------------------------------
        # Initialize the English tagger if:
        #   A) "proj" option is selected for pos.
        #   B) "trans" option is given for pos.
        #   C) "heurpos" option is given for alignment.
        # -------------------------------------------
        s = None
        if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, 'Initializing tagger...')
            tagger = c.getpath('stanford_tagger_trans')

            try:
                s = StanfordPOSTagger(tagger)
            except TaggerError as te:
                ENRICH_LOG.critical(te)
                sys.exit(2)

        # -------------------------------------------
        # Initialize the parser if:
        #    A) "trans" option is given for parse
        #    B) "proj" option is given for parse.
        # -------------------------------------------
        sp = None
        if ARG_PARSE_TRANS in parse_args or ARG_PARSE_PROJ in parse_args:
            ENRICH_LOG.log(1000, "Intializing English parser...")
            sp = stanford_parser.StanfordParser()

        # -------------------------------------------
        # Initialize the classifier if:
        #    A) "class" option is given for pos
        #    B) "heurpos" option is given for alignment.
        # -------------------------------------------
        m = None
        p = None
        if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, "Initializing gloss-line classifier...")
            p = load_posdict()
            m = mallet_maxent.MalletMaxent(classifier)


        # -- 1b) Giza Gloss to Translation alignment --------------------------------------
        if ARG_ALN_GIZA in aln_args or ARG_ALN_GIZAHEUR in aln_args:
            ENRICH_LOG.log(1000, 'Aligning gloss and translation lines using mgiza++...')

            try:
                if ARG_ALN_GIZAHEUR in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=True, symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
                if ARG_ALN_GIZA in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=False, symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
            except GizaAlignmentException as gae:
                logging.getLogger('giza').critical(str(gae))
                # Bare raise re-raises the active exception unchanged.
                raise

        # -------------------------------------------
        # Define the reasons for failure (once, rather than per instance).
        # -------------------------------------------
        F_GLOSS_LINE = "NOGLOSS"
        F_LANG_LINE  = "NOLANG"
        F_TRANS_LINE = "NOTRANS"
        F_BAD_LINES  = "BADLINES"
        F_L_G_ALN    = "L_G_ALIGN"
        F_T_G_ALN    = "G_T_ALIGN"
        F_NO_TRANS_POS="NO_POS_TRANS"
        F_PROJECTION = "PROJECTION"
        F_UNKNOWN    = "UNKNOWN"
        F_PARSELEN   = "OVER_MAX_LENGTH"

        # -------------------------------------------
        # Begin iterating through the corpus
        # -------------------------------------------

        for inst in corp:

            # The instance id is filled in now; the doubled braces leave two
            # placeholders for the status and the reasons, filled in below.
            feedback_string = 'Instance {:15s}: {{:20s}}{{}}'.format(inst.id)

            reasons = []
            inst_status = None

            def fail(reason):
                # Record a failure reason once and mark the instance as WARN.
                nonlocal inst_status, reasons
                if reason not in reasons:
                    reasons.append(reason)
                inst_status = 'WARN'

            def success():
                nonlocal inst_status
                inst_status = 'OK'

            try:

                # -------------------------------------------
                # Get the different lines
                # -------------------------------------------
                def tryline(func):
                    # Return func(inst), or None when the instance has no
                    # normalized line of the requested kind.
                    nonlocal inst
                    try:
                        return func(inst)
                    except NoNormLineException:
                        return None

                gl = tryline(gloss_line)
                tls = tryline(trans_lines)
                lls  = tryline(lang_lines)

                has_gl = gl is not None
                has_tl = tls is not None
                has_ll = lls is not None

                has_all = has_gl and has_tl and has_ll


                # -------------------------------------------
                # Translation Line
                # -------------------------------------------
                if has_tl:

                    if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:

                        try:
                            tag_trans_pos(inst, s)
                        except CriticalTaggerError as cte:
                            ENRICH_LOG.critical(str(cte))
                            sys.exit(2)

                    if ARG_PARSE_PROJ in parse_args or ARG_PARSE_TRANS in parse_args:
                        if len(trans(inst)) <= max_parse_length:
                            parse_translation_line(inst, sp, pt=True, dt=True)
                        else:
                            fail(F_PARSELEN)

                # 4) POS tag the gloss line --------------------------------------------
                if has_gl:
                    if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
                        classify_gloss_pos(inst, m, posdict=p)

                # -------------------------------------------
                # Try getting alignments.
                # -------------------------------------------
                if has_gl and has_ll:
                    try:
                        add_gloss_lang_alignments(inst)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                if has_gl and has_tl:
                    if ARG_ALN_HEURPOS in aln_args:
                        heur_align_inst(inst, use_pos=True)
                    if ARG_ALN_HEUR in aln_args:
                        heur_align_inst(inst, use_pos=False)

                # -------------------------------------------
                # Now, do the necessary projection tasks.
                # -------------------------------------------

                # Project the classifier tags...
                if has_ll and has_gl and ARG_POS_CLASS in pos_args:
                    try:
                        project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_CLASS)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                # -------------------------------------------
                # Do the trans-to-lang projection...
                # -------------------------------------------

                if has_all:
                    proj_aln_method = ALN_ARG_MAP[kwargs.get('proj_aln', ARG_ALN_ANY)]
                    aln = get_trans_gloss_alignment(inst, aln_method=proj_aln_method)
                    if not aln or len(aln) == 0:
                        fail(F_T_G_ALN)
                    else:
                        # -------------------------------------------
                        # POS Projection
                        # -------------------------------------------
                        if ARG_POS_PROJ in pos_args:
                            trans_tags = trans_tag_tier(inst)

                            if not trans_tags:
                                fail(F_NO_TRANS_POS)
                            else:
                                project_trans_pos_to_gloss(inst)
                                try:
                                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                                except GlossLangAlignException:
                                    fail(F_L_G_ALN)

                        # -------------------------------------------
                        # Parse projection
                        # -------------------------------------------
                        if ARG_PARSE_PROJ in parse_args:
                            try:
                                project_pt_tier(inst, proj_aln_method=proj_aln_method)
                            except PhraseStructureProjectionException:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError:
                                fail(F_T_G_ALN)

                            try:
                                project_ds_tier(inst, proj_aln_method=proj_aln_method)
                            except ProjectionException:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError:
                                fail(F_T_G_ALN)



                # Sort the tiers... ----------------------------------------------------
                inst.sort_tiers()

            except Exception as e:
                # Deliberate best-effort: an unexpected error in one instance
                # is logged and reported as UNKNOWN, and processing continues
                # with the next instance.
                ENRICH_LOG.debug(e)
                fail(F_UNKNOWN)

            if not reasons:
                success()


            ENRICH_LOG.info(feedback_string.format(inst_status, ','.join(reasons)))

        ENRICH_LOG.log(1000, 'Writing output file...')

        # The output is written while the input file is still open.
        out_target = kwargs.get(ARG_OUTFILE)
        if hasattr(out_target, 'write'):
            xigtxml.dump(out_target, corp)
        else:
            xigtxml.dump(writefile(out_target), corp)

        ENRICH_LOG.log(1000, 'Done.')
        ENRICH_LOG.log(1000, "{} instances written.".format(len(corp)))
Beispiel #6
0
def evaluate_pos_projections_on_file(lang, xc, plma, pos_proj_matrix, tagger, gold_tagmap=None, trans_tagmap=None, outstream=sys.stdout):
    """
    Score projected gloss POS tags against the manual gloss tags of a corpus.

    Only instances with manual gloss tags, manual translation tags and a
    manual trans/gloss alignment are used. Each of them is heuristically
    aligned and its translation line tagged with ``tagger``; then, for every
    combination of alignment method (manual, heuristic) and translation tag
    source (manual, tagger), the translation tags are projected onto the
    gloss line and compared word by word with the manual gloss tags.

    Every comparison is added to ``pos_proj_matrix`` and the per-combination
    match/comparison counts to ``plma``, which is written to ``outstream``
    at the end.

    :type plma: PerLangMethodAccuracies
    :type pos_proj_matrix: POSMatrix
    :param gold_tagmap: optional map applied to the manual tags.
    :param trans_tagmap: optional map applied to the projected tags.
    :return: a ``XigtCorpus`` created with the id of ``xc``; nothing in this
        function adds instances to it.
    """
    def remap(tagmap, tag):
        # Translate the tag through the map when one is given, but keep the
        # tag as it is if it can't be remapped.
        if tagmap is None:
            return tag
        try:
            return tagmap.get(tag)
        except TagMapException:
            return tag

    result_corpus = XigtCorpus(xc.id)

    for inst in xc:

        gold_tier = gloss_tag_tier(inst, INTENT_POS_MANUAL)
        manual_trans_tier = trans_tag_tier(inst, INTENT_POS_MANUAL)
        manual_aln = get_trans_gloss_alignment(inst, INTENT_ALN_MANUAL)

        # Manual gloss tags, manual trans tags and a manual alignment are all
        # required; skip the instance if any of them is missing.
        if any(item is None for item in (gold_tier, manual_aln, manual_trans_tier)):
            continue

        # Run the heuristic aligner on the instance (its return value is not
        # needed here), then tag the translation line.
        heur_align_inst(inst)
        tag_trans_pos(inst, tagger=tagger)

        for aln_method in (INTENT_ALN_MANUAL, INTENT_ALN_HEUR):
            for trans_tag_method in (INTENT_POS_MANUAL, INTENT_POS_TAGGER):
                project_trans_pos_to_gloss(inst, aln_method=aln_method, trans_tag_method=trans_tag_method)
                projected_tier = gloss_tag_tier(inst, tag_method=INTENT_POS_PROJ)

                n_correct = 0
                n_scored = 0
                for gloss_word in gloss(inst):
                    gold_item = xigt_find(gold_tier, alignment=gloss_word.id)
                    proj_item = xigt_find(projected_tier, alignment=gloss_word.id)

                    # Words without a gold tag are not scored.
                    if gold_item is None:
                        continue

                    gold_label = remap(gold_tagmap, gold_item.value())

                    if proj_item is not None:
                        proj_label = remap(trans_tagmap, proj_item.value())
                    else:
                        proj_label = '**UNK'

                    pos_proj_matrix.add(gold_label, proj_label)

                    if proj_item is not None and proj_label == gold_label:
                        n_correct += 1
                    n_scored += 1

                method_key = '{}:{}'.format(aln_method, trans_tag_method)
                plma.add(lang, method_key, n_correct, n_scored)

    outstream.write('{}\n'.format(plma))

    return result_corpus