Example #1
0
def evaluate_lang_pos(lang, xc, e_tagger, pos_pla, gold_tagmap=None, outstream=sys.stdout):
    """
    Score tagger-produced language-line POS tags against the manual (gold) tags.

    Every instance in ``xc`` that has a gold tag tier (``INTENT_POS_MANUAL``) is
    tagged with ``e_tagger``; each gold tag is then compared with the tagger tag
    (``INTENT_POS_TAGGER``) that shares its alignment.

    :param lang: Not used by this function; kept so existing callers keep working.
    :param xc: Iterable of instances to evaluate.
    :param e_tagger: Tagger handed to ``tag_lang_pos`` for each instance.
    :param pos_pla: Receives one ``add(gold, eval)`` call per gold tag that has a
                    tagger counterpart. The *unmapped* gold value is recorded.
    :type pos_pla: POSEvalDict
    :param gold_tagmap: Optional mapping applied to gold tags (via ``.get``) before
                        comparison. If the lookup raises ``TagMapException`` the
                        unmapped gold tag is compared instead.
    :param outstream: Not used by this function; kept so existing callers keep working.
    :return: ``(matches, compares)`` -- the number of gold tags the tagger got
             right and the number of gold tags compared. Gold tags that are
             ``None`` or (after mapping) ``'JUNK'`` are excluded from both counts.
    """
    matches = 0
    compares = 0

    # Iterate through each instance in the corpus.
    for inst in xc:
        gold_tag_tier = lang_tag_tier(inst, INTENT_POS_MANUAL)

        # If there are no gold tags for this instance, skip it.
        if gold_tag_tier is None:
            continue

        # Create the eval tag tier and retrieve it.
        tag_lang_pos(inst, e_tagger)
        eval_tag_tier = lang_tag_tier(inst, INTENT_POS_TAGGER)

        # For each gold tag...
        for gold_tag in gold_tag_tier:
            raw_gold_v = gold_tag.value()

            # Untagged gold items take no part in the evaluation.
            if raw_gold_v is None:
                continue

            # Find its matching tag on the eval side. Either the whole eval tier
            # or the individual tag may be missing; treat both as "no prediction"
            # instead of dereferencing None.
            eval_tag = None
            if eval_tag_tier is not None:
                eval_tag = xigt_find(eval_tag_tier, alignment=gold_tag.alignment)
            eval_tag_v = eval_tag.value() if eval_tag is not None else None

            gold_tag_v = raw_gold_v
            if gold_tagmap:
                try:
                    gold_tag_v = gold_tagmap.get(raw_gold_v)
                except TagMapException:
                    # Deliberate fallback: compare using the unmapped gold tag.
                    pass

            if gold_tag_v != 'JUNK':
                # A missing prediction is always a miss, even if the mapped gold
                # value happens to be None as well.
                if eval_tag is not None and gold_tag_v == eval_tag_v:
                    matches += 1
                compares += 1

            # Only tabulate gold/eval pairs when there is an eval tag to report.
            if eval_tag is not None:
                pos_pla.add(raw_gold_v, eval_tag_v)

    return matches, compares
Example #2
0
def extract_tagger_from_instance(inst: Igt, output_stream, pos_source, tm):
    """
    Write one line of ``word/TAG`` training tokens for the instance's language line.

    :param inst: Instance whose language-line words and POS tags are read.
    :param output_stream: Writable stream; receives space-separated ``word/TAG``
                          tokens followed by a newline, and is flushed afterwards.
    :param pos_source: Tag method used to select the language-line POS tier.
    :param tm: Optional tag map; when truthy, each non-empty tag is replaced
               by ``tm[tag]``.
    :return: 1 if a line was written for this instance, otherwise 0.
    """
    pos_tier = lang_tag_tier(inst, tag_method=pos_source)
    words = lang(inst)

    # -------------------------------------------
    # Nothing to extract unless there are valid POS tags.
    # -------------------------------------------
    if not pos_tier:
        return 0

    token_template = ' {}/{}'

    for position, word in enumerate(words):
        # Look up the tag aligned with this word; fall back to the
        # unknown-POS handler when there is none.
        aligned_tag = xigt_find(pos_tier, alignment=word.id)
        if aligned_tag is not None:
            tag = aligned_tag.value()
        else:
            tag = handle_unknown_pos(inst, word)

        if tag and tm:
            tag = tm[tag]

        # -------------------------------------------
        # Clean up the word before it is written out.
        # -------------------------------------------
        token = clean_lang_token(word.value(), lowercase=True)

        # Every token except the first is preceded by a space.
        if position == 0:
            template = token_template.strip()
        else:
            template = token_template

        output_stream.write(template.format(token, tag))

    output_stream.write('\n')
    output_stream.flush()

    return 1
Example #3
0
def gather_gloss_pos_stats(inst, subword_dict, feat_list):
    """
    Given an instance, look for the gloss pos tags, and save the statistics
    about them, so that we can filter by the number of times each kind was
    seen later.

    If the instance has language-line POS tags but no gloss-line POS tags, the
    language tags are projected onto the gloss line first.

    :param inst: Instance to process. Note that it may be modified: alignment
                 and projection helpers are called on it.
    :type inst: RGIgt
    :param subword_dict: Receives one ``add_word_tag(word, tag, prev_word, next_word)``
                         call per tagged gloss word (words are lowercased).
                         Presumably this counts (word, TAG) pairs -- confirm
                         against SubwordDict.
    :type subword_dict: SubwordDict
    :param feat_list: Collection of enabled feature flags; only membership of
                      ``CLASS_FEATS_ALN`` is checked here.
    """

    # Grab the gloss POS tier, the language POS tier, and the gloss words.
    gpos_tier = gloss_tag_tier(inst)
    lpos_tier = lang_tag_tier(inst)
    gw_tier = gloss(inst)

    # When alignment features are requested, run heuristic alignment on the
    # instance. The return value of get_trans_glosses_alignment is discarded,
    # so it is presumably called for its effect on inst -- TODO confirm.
    if CLASS_FEATS_ALN in feat_list:
        heur_align_inst(inst)
        get_trans_glosses_alignment(inst, aln_method=INTENT_ALN_HEUR)

    # If there are POS tags on the language line but not the gloss line,
    # project them onto the gloss line and fetch the gloss POS tier again.
    if gpos_tier is None and lpos_tier is not None:
        add_gloss_lang_alignments(inst)
        project_lang_to_gloss(inst)
        gpos_tier = gloss_tag_tier(inst)


    # If this tier exists, then let's process it.
    if gpos_tier is not None:

        # Iterate over each gloss word and look up the item aligned with it.
        for i, gw in enumerate(gw_tier):
            # NOTE(review): the search runs over the whole instance rather than
            # over gpos_tier -- confirm this cannot pick up an item from a
            # different tier that is aligned with the same gloss word.
            tag = xigt_find(inst, alignment=gw.id)

            # Gloss words without an aligned item are skipped.
            if tag is None:
                continue

            # Lowercased neighbouring gloss words, or None at either edge.
            prev_word = gw_tier[i-1].value().lower() if i > 0 else None
            next_word = gw_tier[i+1].value().lower() if i < len(gw_tier)-1 else None

            # NOTE(review): statistics are only recorded when CLASS_FEATS_ALN
            # is enabled; with any other feature list this loop records
            # nothing -- confirm that this is intended.
            if CLASS_FEATS_ALN in feat_list:
                subword_dict.add_word_tag(gw.value().lower(), tag.value(), prev_word, next_word)