Example #1
0
def evaluate_instance(inst, classifier, tagger):
    """
    Collect the supervised and the classifier-assigned POS tags for the
    gloss words of a single instance, so the two sequences can be compared.

    If the instance has no supervised POS tags on the gloss line but does
    have them on the language line, the language-line tags are first
    projected onto the gloss line. The gloss line is then classified and,
    for every supervised gloss tag whose aligned word can be found, one
    token is appended to each of the two returned lists.

    :param inst: Instance to evaluate.
    :type inst: RGIgt
    :param classifier: Classifier passed on to ``classify_gloss_pos``.
    :type classifier: MalletMaxent
    :param tagger: Currently unused in this function.
    :type tagger: StanfordPOSTagger
    :return: ``(sup_tags, cls_tags)``; two lists of ``POSToken`` of equal
             length. ``sup_tags`` carries the supervised labels and
             ``cls_tags`` the classifier labels (``'UNK'`` where no
             classifier tag is aligned with the word). Both lists are
             empty when no supervised gloss tags are available.
    """
    # Get the supervised POS tags...
    sup_gloss_tier = pos_tag_tier(inst, GLOSS_WORD_ID)
    sup_lang_tier = pos_tag_tier(inst, LANG_WORD_ID)

    # We will incrementally build up the tag sequences...
    sup_tags = []
    cls_tags = []

    # If there are no supervised tags on the gloss line, but there are on the language line...
    if sup_gloss_tier is None and sup_lang_tier is not None:
        try:
            add_gloss_lang_alignments(inst)
            project_lang_to_gloss(inst)
            sup_gloss_tier = pos_tag_tier(inst, GLOSS_WORD_ID)
        except RGXigtException:
            # Best effort: if the projection fails, sup_gloss_tier stays None
            # and the instance simply contributes no tokens.
            pass

    # Nothing to compare against without supervised gloss tags.
    if not sup_gloss_tier:
        return sup_tags, cls_tags

    # Do the classification
    classify_gloss_pos(inst, classifier)
    cls_tier = pos_tag_tier(inst, GLOSS_WORD_ID, tag_method=INTENT_POS_CLASS)

    for sup_item in sup_gloss_tier:
        # Skip supervised tags whose aligned word cannot be located.
        word_item = xigt_find(inst, id=sup_item.alignment)
        if not word_item:
            continue
        word = word_item.value()

        # Look up the classifier's tag for the same word; fall back to 'UNK'.
        cls_item = xigt_find(cls_tier, alignment=sup_item.alignment)
        cls_tag = 'UNK' if cls_item is None else cls_item.value()

        sup_tags.append(POSToken(word, label=sup_item.value()))
        cls_tags.append(POSToken(word, label=cls_tag))

    return sup_tags, cls_tags
Example #2
0
def gather_gloss_pos_stats(inst, subword_dict, feat_list):
    """
    Given an instance, look for the gloss pos tags, and save the statistics
    about them, so that we can filter by the number of times each kind was
    seen later.

    If the instance has POS tags on the language line but none on the gloss
    line, the language-line tags are projected onto the gloss line first.

    :param inst: Instance to process.
    :type inst: RGIgt
    :param subword_dict: This dictionary will record the number of times each (word, TAG)
                          pair has been seen. Each entry is added together with the
                          lowercased previous and next gloss words (None at the edges).
    :type subword_dict: SubwordDict
    :param feat_list: Collection of enabled feature identifiers. When it contains
                      CLASS_FEATS_ALN, heuristic alignment is run on the instance
                      before the tags are gathered.
    """

    # Grab the gloss POS tier...
    gpos_tier = gloss_tag_tier(inst)
    lpos_tier = lang_tag_tier(inst)
    gw_tier = gloss(inst)

    # Alignment features were requested, so run heuristic alignment on the instance.
    if CLASS_FEATS_ALN in feat_list:
        heur_align_inst(inst)
        get_trans_glosses_alignment(inst, aln_method=INTENT_ALN_HEUR)

    # If there are POS tags on the language line but not the gloss line...
    if gpos_tier is None and lpos_tier is not None:
        add_gloss_lang_alignments(inst)
        project_lang_to_gloss(inst)
        gpos_tier = gloss_tag_tier(inst)


    # If this tier exists, then let's process it.
    if gpos_tier is not None:

        # Iterate over each gloss word and look up the item aligned to it...
        for i, gw in enumerate(gw_tier):
            # NOTE(review): this searches the whole instance for anything aligned
            # to the gloss word, not just gpos_tier -- presumably that is the POS
            # tag; confirm.
            tag = xigt_find(inst, alignment=gw.id)

            # Gloss words with nothing aligned to them are skipped.
            if tag is None:
                continue

            # Lowercased neighbouring gloss words; None at either end of the tier.
            prev_word = gw_tier[i-1].value().lower() if i > 0 else None
            next_word = gw_tier[i+1].value().lower() if i < len(gw_tier)-1 else None

            # NOTE(review): as written here, the counts are only recorded when
            # CLASS_FEATS_ALN is enabled -- confirm this is intended.
            if CLASS_FEATS_ALN in feat_list:
                subword_dict.add_word_tag(gw.value().lower(), tag.value(), prev_word, next_word)