Example #1
0
def evaluate_classifier_on_instances(inst_list, classifier, feat_list, pos_class_matrix, gold_tagmap=None):
    """
    Classify the gloss words of each instance and score the classifier's
    tags against the manually-assigned gloss POS tags.

    Instances with no manual gloss POS tier are skipped, as are classifier
    tags for which no manual tag shares the same alignment target.

    :param inst_list: Iterable of instances to evaluate.
    :param classifier: Classifier handed through to ``classify_gloss_pos``.
    :param feat_list: Collection of ``CLASS_FEATS_*`` flags; each flag that is
                      present switches on the corresponding classifier feature.
    :param pos_class_matrix: Object whose ``add(gold, predicted)`` method is
                             called once for every compared pair of tags.
    :param gold_tagmap: Optional tag map. When given, every manual tag is
                        replaced by ``gold_tagmap.get(tag)`` before comparing.
    :return: ``(matches, compares, accuracy)``, where ``accuracy`` is a
             percentage and is ``0.0`` when no tags could be compared.
    """

    # The POS dictionary is only loaded when a dictionary-based feature is on.
    needs_posdict = any(flag in feat_list
                        for flag in (CLASS_FEATS_DICT, CLASS_FEATS_PDICT, CLASS_FEATS_NDICT))
    pd = load_posdict() if needs_posdict else False

    matches = 0
    compares = 0

    for inst in inst_list:
        sup_postier = gloss_tag_tier(inst, tag_method=INTENT_POS_MANUAL)
        if sup_postier is None:
            continue

        # NOTE(review): the result was only used for an unused lookup; the call
        # is kept in case gloss() has side effects on the instance -- confirm.
        gloss(inst)

        classify_gloss_pos(inst, classifier,
                           posdict=pd,
                           feat_prev_gram=CLASS_FEATS_PRESW in feat_list,
                           feat_next_gram=CLASS_FEATS_NEXSW in feat_list,
                           feat_dict=CLASS_FEATS_DICT in feat_list,
                           feat_prev_gram_dict=CLASS_FEATS_PDICT in feat_list,
                           feat_next_gram_dict=CLASS_FEATS_NDICT in feat_list,
                           feat_suffix=CLASS_FEATS_SUF in feat_list,
                           feat_prefix=CLASS_FEATS_PRE in feat_list,
                           feat_morph_num=CLASS_FEATS_NUMSW in feat_list,
                           feat_has_number=CLASS_FEATS_NUM in feat_list,
                           feat_basic=CLASS_FEATS_SW in feat_list)

        cls_postier = gloss_tag_tier(inst, tag_method=INTENT_POS_CLASS)
        if cls_postier is None:
            # Nothing was produced by the classifier for this instance.
            continue

        for cls_tag in cls_postier:
            sup_tag = xigt_find(sup_postier, alignment=cls_tag.alignment)
            if sup_tag is None:
                continue

            sup_tag_v = sup_tag.value()
            if gold_tagmap is not None:
                sup_tag_v = gold_tagmap.get(sup_tag_v)

            cls_tag_v = cls_tag.value()
            pos_class_matrix.add(sup_tag_v, cls_tag_v)
            if cls_tag_v == sup_tag_v:
                matches += 1
            compares += 1

    # Guard against ZeroDivisionError when no tag pair was ever compared.
    accuracy = matches / compares * 100 if compares else 0.0
    return matches, compares, accuracy
Example #2
0
    def word_align_test(self):
        """
        Check the word-level alignment recorded on the gloss items.

        Every gloss item whose ``alignment`` resolves to an item in the
        instance contributes an index pair; the collected pairs must equal
        the expected one-to-one alignment.
        """
        observed = Alignment()
        for gloss_word in gloss(self.igt):
            gloss_pos = item_index(gloss_word)
            target = xigt_find(self.igt, id=gloss_word.alignment)
            if target is None:
                continue
            observed.add((gloss_pos, item_index(target)))

        expected = Alignment([(1,1),(2,2),(3,3),(4,4)])
        self.assertEqual(observed, expected)
Example #3
0
def gather_gloss_pos_stats(inst, subword_dict, feat_list):
    """
    Record (gloss word, tag) statistics for one instance.

    If the instance has no gloss POS tier but does have a language POS tier,
    the language tags are first projected onto the gloss line. Each gloss
    word with an aligned tag is then reported to ``subword_dict`` together
    with its lower-cased neighbouring words.

    :param inst: Instance to process.
    :type inst: RGIgt
    :param subword_dict: Receives one ``add_word_tag`` call per tagged gloss
                         word (only when ``CLASS_FEATS_ALN`` is requested).
    :type subword_dict: SubwordDict
    :param feat_list: Collection of ``CLASS_FEATS_*`` flags.
    """
    gloss_pos = gloss_tag_tier(inst)
    lang_pos = lang_tag_tier(inst)
    gloss_words = gloss(inst)

    # Heuristic alignment is only produced when the alignment feature is on.
    if CLASS_FEATS_ALN in feat_list:
        heur_align_inst(inst)
        get_trans_glosses_alignment(inst, aln_method=INTENT_ALN_HEUR)

    # Fall back to projecting language-line tags onto the gloss line.
    if gloss_pos is None and lang_pos is not None:
        add_gloss_lang_alignments(inst)
        project_lang_to_gloss(inst)
        gloss_pos = gloss_tag_tier(inst)

    # Without a gloss POS tier there is nothing to count.
    if gloss_pos is None:
        return

    for idx, gloss_word in enumerate(gloss_words):
        tag = xigt_find(inst, alignment=gloss_word.id)
        if tag is None:
            continue

        if idx > 0:
            prev_word = gloss_words[idx-1].value().lower()
        else:
            prev_word = None

        if idx < len(gloss_words)-1:
            next_word = gloss_words[idx+1].value().lower()
        else:
            next_word = None

        if CLASS_FEATS_ALN in feat_list:
            subword_dict.add_word_tag(gloss_word.value().lower(), tag.value(), prev_word, next_word)
Example #4
0
def extract_sents_from_inst(inst: Igt, out_src, out_tgt, aln_method=None, no_alignment_heur = True, sent_type=SENT_TYPE_T_G):
    """
    Write one parallel sentence pair for an instance to two output streams.

    The source side is always the translation line. The target side is the
    language line (``SENT_TYPE_T_L``) or the gloss line (``SENT_TYPE_T_G``).
    Both sides are lower-cased and written one sentence per line.

    When ``no_alignment_heur`` is false, every aligned word pair returned by
    ``get_trans_aligned_wordpairs`` is also written, one pair per line.

    Raises ``Exception`` if ``sent_type`` is neither of the two known values.
    """
    # Source side: the translation line.
    src_str = tier_text(trans(inst), remove_whitespace_inside_tokens=True).lower()

    # Target side: pick the tier first, then render it.
    if sent_type == SENT_TYPE_T_L:
        tgt_tier = lang(inst)
    elif sent_type == SENT_TYPE_T_G:
        tgt_tier = gloss(inst)
    else:
        raise Exception("Invalid sent type")
    tgt_str = tier_text(tgt_tier, remove_whitespace_inside_tokens=True).lower()

    # Emit the sentence pair, then flush both streams.
    sentence_outputs = ((out_src, src_str), (out_tgt, tgt_str))
    for stream, sentence in sentence_outputs:
        stream.write(sentence + '\n')
    for stream, _ in sentence_outputs:
        stream.flush()

    # Optionally append the heuristically aligned word pairs.
    if no_alignment_heur:
        return

    word_pairs = get_trans_aligned_wordpairs(inst, aln_method=aln_method, add_align=True, sent_type=sent_type)
    for src_word, tgt_word in word_pairs:
        out_src.write(src_word.lower() + '\n')
        out_tgt.write(tgt_word.lower() + '\n')
Example #5
0
def naacl_to_xigt(naacl_path):
    """
    Convert a file in the NAACL format to a XIGT corpus.

    Each instance is the text from an ``Igt_id`` marker up to the
    ``Q6 ... Answer`` line. Lines 2-4 of that text are taken as the raw
    language, gloss and translation lines. Dependency structures are read
    from the ``Q6:`` (language) and ``Q3:`` (translation) sections, and the
    bilingual word alignment from the ``Q5:`` section.

    Dependency trees and alignment lines that cannot be parsed are skipped on
    a best-effort basis; the instance is still added to the corpus.

    :param naacl_path: Path of the NAACL-format file to read.
    :return: The populated ``XigtCorpus``.
    :raises AttributeError: if an instance lacks a ``Q6:``, ``Q3:`` or ``Q5:``
                            section pair (the section regex does not match).
    :raises ValueError: if an instance has fewer than three lines after its
                        first one.
    """
    # Read through a context manager so the handle is always closed.
    with open(naacl_path, 'r') as naacl_file:
        content = naacl_file.read()

    # First, collect all the instances.
    instances = re.findall(r'Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        inst = Igt(id='i{}'.format(len(xc)))

        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Now, create the raw tier...
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin', attributes={STATE_ATTRIBUTE:RAW_STATE})
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=lang_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_LANG_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=gloss_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_GLOSS_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=trans_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_TRANS_TAG}))

        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Lang Dependency representation handling...
        # (the slice drops the section's leading and trailing non-data lines)
        lang_ds_str = re.search(r'Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]

        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError):
            # Best effort: an unparseable tree just means no DS tier.
            pass

        # Eng DS handling...
        eng_ds_str = re.search(r'Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]

        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError, ValueError):
            pass

        # Add Alignment...
        biling_aln_str = re.search(r'Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        # A raw line that starts with a space shifts its indices by one.
        # NOTE(review): presumably the leading space yields an empty first
        # token in the annotation -- confirm against the data.
        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        # Bound outside the try so it always exists for the call below.
        a = Alignment()
        try:
            for line in biling_aln_lines:
                gloss_s, trans_s = line.split()[0:2]

                # Gloss indices containing a '.' are skipped.
                if '.' in gloss_s:
                    continue

                gloss_i = int(gloss_s)
                # Apply the offset once per line. Adjusting it inside the
                # token loop shifted the gloss index again for every
                # additional translation token.
                if gloss_offset:
                    gloss_i -= 1

                for trans_token in trans_s.split(','):
                    trans_i = int(trans_token)
                    # A translation index of 0 is skipped (no pair is added).
                    if trans_i == 0:
                        continue
                    if trans_offset:
                        trans_i -= 1
                    a.add((trans_i, gloss_i))
        except ValueError:
            # A malformed line (too few columns or a non-integer index) stops
            # alignment parsing; pairs collected so far are kept.
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, aln_method=INTENT_ALN_MANUAL)

    return xc
Example #6
0
 def line_test(self):
     """
     Check the rendered text of the gloss and translation tiers.
     """
     gloss_text = tier_text(gloss(self.igt))
     self.assertEqual(gloss_text, 'I-Nom child-Dat rice-Acc eat-Caus-Pst-Dec')

     trans_text = tier_text(trans(self.igt))
     self.assertEqual(trans_text, 'I made the child eat rice')
Example #7
0
def convert_pml(aln_path, out_path, hindi=True):
    """
    Build a XIGT corpus from an alignment XML file and the two sentence
    documents it references, then write the corpus to ``out_path``.

    For every sentence alignment (``.//body/LM``) whose id is present in the
    retrieved IGT data, an ``Igt`` instance is assembled with a normalized
    tier, phrase tiers, word tiers, manual POS tags, dependency tiers and
    manual bilingual word alignments.

    :param aln_path: Path of the alignment XML file. The two referenced
                     documents are resolved relative to its directory.
    :param out_path: Path the resulting corpus is written to (UTF-8).
    :param hindi: When true, IGT data comes from ``retrieve_hindi()`` and the
                  tokens, POS tags and glosses are taken from the loaded
                  sentences. Otherwise ``retrieve_naacl()`` is used and the
                  text comes from the retrieved IGT data.
    :raises AssertionError: unless exactly one of the two documents is glossed.
    """

    if hindi:
        igt_data = retrieve_hindi()
    else:
        igt_data = retrieve_naacl()

    # The alignment file names the two documents it aligns.
    a_root = load_xml(aln_path)
    doc_a  = a_root.find(".//reffile[@name='document_a']").get('href')
    doc_b  = a_root.find(".//reffile[@name='document_b']").get('href')



    # Resolve the hrefs relative to the alignment file's directory.
    doc_a = os.path.join(os.path.join(os.path.dirname(aln_path), doc_a))
    doc_b  = os.path.join(os.path.join(os.path.dirname(aln_path), doc_b))

    # Load the sentences for each document.
    a_sents, a_glossed = load_sents(doc_a)
    b_sents, b_glossed = load_sents(doc_b)



    sent_alignments = a_root.findall(".//body/LM")

    assert (a_glossed and not b_glossed) or (b_glossed and not a_glossed), "Only one file should have glosses"

    xc = XigtCorpus()

    for sent_alignment in sent_alignments:

        # Get the sentence id...
        # (everything after the first hyphen that has text before it)
        aln_id = sent_alignment.attrib.get('id')
        a_snt_id = re.search('^.+?-(.*)$', aln_id).group(1)
        if a_snt_id not in igt_data:
            continue

        # Get the text and tokens from the naacl data.
        pre_txt, lang_txt, gloss_txt, trans_txt = igt_data[a_snt_id]
        lang_tokens = lang_txt.split()
        gloss_tokens = gloss_txt.split()
        trans_tokens = trans_txt.split()

        # The sentence references are the part after the '#'.
        a_snt_ref = sent_alignment.find('./tree_a.rf').text.split('#')[1]
        b_snt_ref = sent_alignment.find('./tree_b.rf').text.split('#')[1]

        word_alignments = sent_alignment.findall('./node_alignments/LM')

        a_snt, a_edges = a_sents[a_snt_ref]
        b_snt, b_edges = b_sents[b_snt_ref]

        assert isinstance(a_snt, Sentence)
        assert isinstance(b_snt, Sentence)
        # -------------------------------------------
        # Skip sentences if they are not found for whatever reason
        # -------------------------------------------
        # NOTE(review): a missing reference raises in the lookups above before
        # this check is reached, so this only skips falsy Sentence objects.
        if not a_snt or not b_snt:
            continue

        # -------------------------------------------
        # Start constructing the IGT Instance.
        # -------------------------------------------

        # By default document A is the translation and B the glossed side;
        # swap them when A is the glossed document.
        trans_snt, trans_indices = a_snt, a_edges
        gloss_snt, gloss_indices = b_snt, b_edges
        if a_glossed:
            trans_snt, trans_indices = b_snt, b_edges
            gloss_snt, gloss_indices = a_snt, a_edges

        # Hindi stuff...
        # The text and tokens read from igt_data above are replaced by the
        # words, POS tags and glosses of the loaded sentences.
        if hindi:
            lang_tokens = [w.text for w in gloss_snt]
            lang_postags   = [w.pos  for w in gloss_snt]
            lang_txt    = ' '.join(lang_tokens)

            trans_tokens = [w.text for w in trans_snt]
            trans_postags   = [w.pos  for w in trans_snt]
            trans_txt    = ' '.join(trans_tokens)

            # Words without a gloss get the placeholder 'NULL'.
            gloss_tokens  = [w.gloss if w.gloss else 'NULL' for w in gloss_snt]
            gloss_postags = lang_postags
            gloss_txt     = ' '.join(gloss_tokens)



        # The instance id is the A-side sentence reference with 's-' replaced
        # by 'igt'. The normalized tier holds the three lines.
        inst = Igt(id=re.sub('s-', 'igt', a_snt_ref))
        nt   = Tier(type=ODIN_TIER_TYPE, id=NORM_ID, attributes={STATE_ATTRIBUTE:NORM_STATE})
        ll   = Item(id='n1', attributes={ODIN_TAG_ATTRIBUTE:ODIN_LANG_TAG}, text=lang_txt)
        gl   = Item(id='n2', attributes={ODIN_TAG_ATTRIBUTE:ODIN_GLOSS_TAG}, text=gloss_txt)
        tl   = Item(id='n3', attributes={ODIN_TAG_ATTRIBUTE:ODIN_TRANS_TAG}, text=trans_txt)
        nt.extend([ll,gl,tl])
        inst.append(nt)


        # -------------------------------------------
        # Handle the phrase tiers
        # -------------------------------------------
        generate_lang_phrase_tier(inst)
        generate_trans_phrase_tier(inst)

        def process_postags(sent, tokens):
            """
            Return one POS tag per token, looked up via ``sent.getorder``
            with 1-based positions; ``None`` where no word is found.
            """
            postags = []
            for i, token in enumerate(tokens):
                word = sent.getorder(i+1)
                if word is None:
                    postags.append(None)
                else:
                    postags.append(word.pos)
            return postags

        # -------------------------------------------
        # Now, handle the translation words.
        # -------------------------------------------
        tt = create_word_tier(ODIN_TRANS_TAG, trans_tokens, trans_phrase(inst)[0])
        inst.append(tt)

        # In the Hindi case the tags were already collected above.
        if not hindi:
            trans_postags = process_postags(trans_snt, trans_tokens)

        add_pos_tags(inst, tt.id, trans_postags, tag_method=INTENT_POS_MANUAL)


        # -------------------------------------------
        # Handle the words tiers...
        # -------------------------------------------
        wt = create_word_tier(ODIN_LANG_TAG, lang_tokens, lang_phrase(inst)[0])
        gwt= create_word_tier(ODIN_GLOSS_TAG, gloss_tokens, gl)
        inst.extend([wt, gwt])
        # Quickly set the alignment for the gloss words.
        # (pairs items positionally; zip stops at the shorter tier)
        for w, gw in zip(wt, gwt):
            gw.alignment = w.id


        # The gloss line reuses the language line's POS tags.
        if not hindi:
            lang_postags = process_postags(gloss_snt, gloss_tokens)
            gloss_postags = lang_postags

        add_pos_tags(inst, wt.id, lang_postags, tag_method=INTENT_POS_MANUAL)
        add_pos_tags(inst, gwt.id, gloss_postags, tag_method=INTENT_POS_MANUAL)

        # Dependency tiers for the language words and the translation words.
        create_dt_tier(inst, assemble_ds(gloss_snt, gloss_indices), wt, INTENT_DS_MANUAL)
        create_dt_tier(inst, assemble_ds(trans_snt, trans_indices), tt, INTENT_DS_MANUAL)



        # -------------------------------------------
        # Now, the word alignments.
        # -------------------------------------------
        a = Alignment()
        for word_alignment in word_alignments:
            a_ref = word_alignment.find('./a.rf').text.split('#')[1]
            b_ref = word_alignment.find('./b.rf').text.split('#')[1]

            a_word = a_snt.getid(a_ref)
            b_word = b_snt.getid(b_ref)

            # Ignore alignments that reference a word missing from either side.
            if a_word is None or b_word is None:
                continue

            # Indices come from the word's own order attribute, or in the
            # Hindi case from its 1-based position in the sentence.
            if not hindi:
                a_idx  = a_word.order
                b_idx  = b_word.order
            else:
                a_idx  = a_snt.index(a_word)+1
                b_idx  = b_snt.index(b_word)+1

            # Orient the pair as (translation index, language index),
            # whichever document is the glossed one.
            if a_glossed:
                trans_idx = b_idx
                lang_idx  = a_idx
            else:
                trans_idx = a_idx
                lang_idx  = b_idx

            a.add((trans_idx, lang_idx))


        # The same alignment is attached for both the translation-language
        # and the translation-gloss tier pairs.
        set_bilingual_alignment(inst, trans(inst), lang(inst), a, INTENT_ALN_MANUAL)
        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, INTENT_ALN_MANUAL)

        xc.append(inst)

    with open(out_path, 'w', encoding='utf-8') as f:
        xigtxml.dump(f, xc)
Example #8
0
 def test_line_lengths(self):
     """
     Check the gloss and language tier lengths of the corpus item at index 1.
     """
     target = self.xc[1]

     gloss_len = len(gloss(target))
     self.assertEqual(5, gloss_len)

     lang_len = len(lang(target))
     self.assertEqual(6, lang_len)
Example #9
0
def evaluate_pos_projections_on_file(lang, xc, plma, pos_proj_matrix, tagger, gold_tagmap=None, trans_tagmap=None, outstream=sys.stdout):
    """
    Score projected gloss POS tags against the manual gloss tags of a corpus.

    Only instances with manual gloss tags, manual translation tags and a
    manual translation-gloss alignment are used. For each of them, tags are
    projected once per combination of alignment method (manual, heuristic)
    and translation tag source (manual, tagger). The match and comparison
    counts are reported to ``plma`` under the key ``"<aln>:<tag>"``, and every
    (gold, projected) pair is added to ``pos_proj_matrix``.

    After all instances are processed, ``plma`` is written to ``outstream``.

    :type plma: PerLangMethodAccuracies
    :type pos_proj_matrix: POSMatrix
    :return: A new ``XigtCorpus`` created with ``xc.id``; no instance is
             appended to it in this function.
    """

    def _remap(tagmap, tag):
        """Return the mapped tag, or *tag* unchanged if unmapped or no map."""
        if tagmap is None:
            return tag
        try:
            return tagmap.get(tag)
        except TagMapException:
            return tag

    result_xc = XigtCorpus(xc.id)

    for inst in xc:
        gold_tier = gloss_tag_tier(inst, INTENT_POS_MANUAL)
        trans_tier = trans_tag_tier(inst, INTENT_POS_MANUAL)
        manual_aln = get_trans_gloss_alignment(inst, INTENT_ALN_MANUAL)

        # All three manual annotations are required for the evaluation.
        if any(part is None for part in (gold_tier, manual_aln, trans_tier)):
            continue

        # Add the heuristic alignment and the tagger's translation tags.
        heur_align_inst(inst)
        tag_trans_pos(inst, tagger=tagger)

        for aln_method in (INTENT_ALN_MANUAL, INTENT_ALN_HEUR):
            for trans_tag_method in (INTENT_POS_MANUAL, INTENT_POS_TAGGER):
                project_trans_pos_to_gloss(inst, aln_method=aln_method, trans_tag_method=trans_tag_method)
                projected_tier = gloss_tag_tier(inst, tag_method=INTENT_POS_PROJ)

                n_correct = 0
                n_scored = 0
                for gloss_word in gloss(inst):
                    gold = xigt_find(gold_tier, alignment=gloss_word.id)
                    projected = xigt_find(projected_tier, alignment=gloss_word.id)

                    # Only gloss words that carry a gold tag are scored.
                    if gold is None:
                        continue

                    gold_label = _remap(gold_tagmap, gold.value())

                    # A word with no projected tag is recorded as unknown.
                    if projected is None:
                        proj_label = '**UNK'
                    else:
                        proj_label = _remap(trans_tagmap, projected.value())

                    pos_proj_matrix.add(gold_label, proj_label)

                    n_scored += 1
                    if projected is not None and proj_label == gold_label:
                        n_correct += 1

                plma.add(lang, '{}:{}'.format(aln_method, trans_tag_method), n_correct, n_scored)

    outstream.write('{}\n'.format(plma))

    return result_xc