Example #1
0
def extract_tagger_from_instance(inst: Igt, output_stream, pos_source, tm):
    """
    Write one line of ``word/TAG`` tagger training data for an instance.

    The language-line words are paired with the POS tags aligned to them and
    written to ``output_stream`` as space-separated ``word/tag`` tokens
    followed by a newline. Nothing is written if the instance has no
    (truthy) POS tag tier for the requested source.

    :param inst: The instance to read language-line words and POS tags from.
    :param output_stream: Writable stream that receives the training line.
    :param pos_source: Passed to ``lang_tag_tier`` as ``tag_method`` to
                       select which POS tags to use.
    :param tm: Optional tag map; when truthy, every non-empty tag is
               replaced by ``tm[tag]``.
    :return: 1 if a sentence was written, 0 otherwise.
    """
    pos_tier = lang_tag_tier(inst, tag_method=pos_source)
    words = lang(inst)

    # Without a usable POS tier there is nothing to extract.
    if not pos_tier:
        return 0

    for index, word in enumerate(words):
        # Look up the tag aligned with this word; fall back to the
        # unknown-POS handler when no tag is aligned.
        aligned_tag = xigt_find(pos_tier, alignment=word.id)
        if aligned_tag is None:
            tag = handle_unknown_pos(inst, word)
        else:
            tag = aligned_tag.value()

        # Remap the tag through the tag map when both are present.
        if tag and tm:
            tag = tm[tag]

        # Clean the surface form before writing it out.
        token = clean_lang_token(word.value(), lowercase=True)

        # Every token after the first is preceded by a single space.
        template = '{}/{}' if index == 0 else ' {}/{}'
        output_stream.write(template.format(token, tag))

    output_stream.write('\n')
    output_stream.flush()
    return 1
Example #2
0
def extract_sents_from_inst(inst: Igt, out_src, out_tgt, aln_method=None, no_alignment_heur = True, sent_type=SENT_TYPE_T_G):
    """
    Extract parallel sentences from an instance. Either:

    1) Translation--Gloss
    2) Translation--Language

    The lowercased translation line is written to ``out_src`` and the
    lowercased gloss or language line to ``out_tgt``, one sentence per line.
    When heuristic alignment is enabled, each aligned word pair is appended
    as an additional one-word "sentence" pair.

    :param inst: The instance to extract the sentences from.
    :param out_src: Writable stream for the source (translation) side.
    :param out_tgt: Writable stream for the target (gloss/language) side.
    :param aln_method: Forwarded to ``get_trans_aligned_wordpairs`` when
                       heuristic alignments are requested.
    :param no_alignment_heur: If True (the default), do NOT add the
                              heuristically aligned word pairs.
    :param sent_type: ``SENT_TYPE_T_G`` for translation--gloss or
                      ``SENT_TYPE_T_L`` for translation--language.
    :raises ValueError: If ``sent_type`` is neither of the supported types.
    """

    # -------------------------------------------
    # 1) Get the source string (translation)
    # -------------------------------------------
    src_str = tier_text(trans(inst), remove_whitespace_inside_tokens=True).lower()

    # -------------------------------------------
    # 2) Decide whether the target string is gloss or language.
    # -------------------------------------------
    if sent_type == SENT_TYPE_T_L:
        tgt_tier = lang(inst)
    elif sent_type == SENT_TYPE_T_G:
        tgt_tier = gloss(inst)
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("Invalid sent type")
    tgt_str = tier_text(tgt_tier, remove_whitespace_inside_tokens=True).lower()

    # -------------------------------------------
    # 3) Write the choice out to disk.
    # -------------------------------------------
    out_src.write(src_str + '\n')
    out_tgt.write(tgt_str + '\n')
    out_src.flush()
    out_tgt.flush()

    # -------------------------------------------
    # 4) Add heuristic alignments, if asked for.
    # -------------------------------------------
    if not no_alignment_heur:

        pairs = get_trans_aligned_wordpairs(inst, aln_method=aln_method, add_align=True, sent_type=sent_type)
        for src_word, tgt_word in pairs:
            out_src.write(src_word.lower() + '\n')
            out_tgt.write(tgt_word.lower() + '\n')

        # Flush again so the word pairs are not left buffered behind the
        # already-flushed sentence lines.
        out_src.flush()
        out_tgt.flush()
Example #3
0
def extract_parser_from_instance(inst: Igt, output_stream, pos_source, tm):
    """
    Given an IGT instance, extract the projected dependency structure from
    it (along with the POS tags from the given pos_source) and write it to
    the output stream in the format produced by ``to_conll``, followed by a
    blank line.

    Failures raised as ``RuntimeError`` or ``RGXigtException`` are logged and
    swallowed so that one bad instance does not abort the whole extraction.

    :param inst: Input instance
    :param output_stream: The output stream to write the training data to.
    :param pos_source: Forwarded to ``get_lang_ds`` to select the POS tags.
    :param tm: Tag map forwarded to ``to_conll`` as ``tagmap``.
    :return: 1 if a dependency structure was written, 0 otherwise.
    """
    extracted = 0
    try:
        ds = get_lang_ds(inst, pos_source=pos_source, unk_pos_handling=None)
        # No dependency structure available for this instance: write nothing.
        if ds is not None:
            conll_string = to_conll(ds, lang(inst), lowercase=True, match_punc=True, clean_token=True, unk_pos='UNK', tagmap=tm)
            output_stream.write(conll_string+'\n\n')
            output_stream.flush()
            extracted += 1

    except RuntimeError as runtime_err:
        # Named ``runtime_err`` rather than ``re`` to avoid shadowing the
        # standard-library regex module name inside this function.
        print(runtime_err)
        EXTRACT_LOG.error("Runtime error in instance {}".format(inst.id))
    except RGXigtException as rgxe:
        # ``warning`` is the supported spelling; ``warn`` is a deprecated alias.
        EXTRACT_LOG.warning('Instance "{}" failed with "{}"'.format(inst.id, rgxe))

    return extracted