Example #1
def raw_txt_to_xc(txt):
    """

    :rtype: XigtCorpus
    """
    print("Creating XIGT corpus from raw text...")
    xc = XigtCorpus()

    PARSELOG.debug("Replacing invalid XML...")
    data = replace_invalid_xml(txt)

    instances = []
    cur_lines = []

    for line in data.split('\n'):
        # A blank line ends the current instance; the guard prevents
        # consecutive blank lines from producing empty instances.
        if not line.strip():
            if cur_lines:
                instances.append('\n'.join(cur_lines))
                cur_lines = []
        else:
            cur_lines.append(line)

    if cur_lines:
        instances.append('\n'.join(cur_lines))

    for instance in instances:
        i = raw_txt_to_inst(instance, corpus=xc)
        xc.append(i)
    print("{} instances parsed.".format(len(xc)))
    return xc
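
A minimal usage sketch (the input path and the xigt codec import are assumptions, not part of the example above):

from xigt.codecs import xigtxml  # serializer from the xigt package

# Hypothetical input file: one instance per blank-line-separated block.
with open('raw_igts.txt', encoding='utf-8') as f:
    xc = raw_txt_to_xc(f.read())

# Write the corpus back out as xigtxml.
with open('raw_igts.xml', 'w', encoding='utf-8') as f:
    xigtxml.dump(f, xc)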
Example #2
def corpus_to_xigt(corp: Corpus):
    """
    Given an INTENT2 Corpus object,
    return its representation in xigtxml format.
    """
    xc = XigtCorpus()
    EXPORT_LOG.info('Preparing to export INTENT2 Corpus to Xigt')
    for inst in corp:
        xigt_inst = instance_to_xigt(inst)
        # Test-serialize the instance in isolation so that a single bad
        # instance does not invalidate the whole corpus.
        try:
            dumps(XigtCorpus(igts=[xigt_inst]))
            xc.append(xigt_inst)
        except (TypeError, XigtError) as te:
            EXPORT_LOG.error('Error in serializing instance "{}": {}'.format(
                inst.id, te))
    EXPORT_LOG.info(
        'Corpus successfully converted. Returning string for writing.')
    return dumps(xc)
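
Because corpus_to_xigt already returns the serialized string (via dumps), writing it out is a plain file write. A sketch, assuming corp is an INTENT2 Corpus built elsewhere and the output path is a placeholder:

xigt_str = corpus_to_xigt(corp)

with open('corpus.xml', 'w', encoding='utf-8') as f:
    f.write(xigt_str)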
Example #3
def parse_odin_xc(text, require_trans=True, require_gloss=True, require_lang=True, limit=None):
    """
    Read in an ODIN-style text file to create a Xigt corpus.
    """
    # Initialize the corpus
    xc = XigtCorpus()

    # Replace invalid characters...
    data = replace_invalid_xml(text)

    # Collect the instance blocks, each beginning with a doc_id header.
    inst_txts = re.findall(r'doc_id=[\s\S]+?\n\n', data)

    #=======================================================================
    # Begin parsing...
    #=======================================================================

    parsed = 0
    PARSELOG.info('Beginning parse')
    for inst_num, inst_txt in enumerate(inst_txts):

        # Log progress every 250 instances.
        if parsed % 250 == 0:
            PARSELOG.info('Parsing instance %d...' % parsed)

        # Handle the requirement for 1_to_1 alignment.
        try:
            i = parse_odin_inst(inst_txt, corpus=xc, idnum=inst_num)
        except GlossLangAlignException:
            PARSELOG.warning('Gloss and language could not be automatically aligned for instance "%s". Skipping.' % gen_item_id('i', inst_num))
            continue

        # Try to get the translation line. ---------------------------------
        try:
            hastrans = trans_lines(i)
        except NoTransLineException as ntle:
            PARSELOG.info(ntle)
            hastrans = False

        # Try to get the gloss line. --------------------------------------
        try:
            hasgloss = i.gloss
        except NoGlossLineException as ngle:
            PARSELOG.info(ngle)
            hasgloss = False

        # Try to get the language line. ------------------------------------
        try:
            haslang = i.lang
        except NoLangLineException as nlle:
            PARSELOG.info(nlle)
            haslang = False


        parsed += 1

        # A constraint is satisfied when the line is present or not required.
        trans_constraint = hastrans or not require_trans
        gloss_constraint = hasgloss or not require_gloss
        lang_constraint = haslang or not require_lang

        if trans_constraint and gloss_constraint and lang_constraint:
            xc.append(i)
        else:
            PARSELOG.info('Requirements for instance "%s" were not satisfied. Skipping.' % i.id)

        # If we have reached the limit of instances that have been requested,
        # stop processing.
        if limit is not None and parsed == limit:
            break

    # Return the corpus
    return xc
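
A usage sketch showing the filtering knobs; the input path and limit value are placeholders:

with open('odin_data.txt', encoding='utf-8') as f:
    text = f.read()

# Require gloss and language lines, accept missing translations,
# and stop after 500 successfully parsed instances.
xc = parse_odin_xc(text, require_trans=False, require_gloss=True,
                   require_lang=True, limit=500)
print('{} instances retained.'.format(len(xc)))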
Example #4
def convert_pml(aln_path, out_path, hindi=True):
    """
    Convert a PML alignment file and the two documents it references
    into a Xigt corpus, writing the result to out_path as xigtxml.
    """
    if hindi:
        igt_data = retrieve_hindi()
    else:
        igt_data = retrieve_naacl()

    a_root = load_xml(aln_path)
    doc_a = a_root.find(".//reffile[@name='document_a']").get('href')
    doc_b = a_root.find(".//reffile[@name='document_b']").get('href')

    # The hrefs are relative to the alignment file's directory.
    doc_a = os.path.join(os.path.dirname(aln_path), doc_a)
    doc_b = os.path.join(os.path.dirname(aln_path), doc_b)

    # Load the sentences for each document.
    a_sents, a_glossed = load_sents(doc_a)
    b_sents, b_glossed = load_sents(doc_b)
    sent_alignments = a_root.findall(".//body/LM")

    assert (a_glossed and not b_glossed) or (b_glossed and not a_glossed), "Only one file should have glosses"

    xc = XigtCorpus()

    for sent_alignment in sent_alignments:

        # Get the sentence id...
        aln_id = sent_alignment.attrib.get('id')
        a_snt_id = re.search('^.+?-(.*)$', aln_id).group(1)
        if a_snt_id not in igt_data:
            continue

        # Get the text and tokens from the naacl data.
        pre_txt, lang_txt, gloss_txt, trans_txt = igt_data[a_snt_id]
        lang_tokens = lang_txt.split()
        gloss_tokens = gloss_txt.split()
        trans_tokens = trans_txt.split()

        a_snt_ref = sent_alignment.find('./tree_a.rf').text.split('#')[1]
        b_snt_ref = sent_alignment.find('./tree_b.rf').text.split('#')[1]

        word_alignments = sent_alignment.findall('./node_alignments/LM')

        a_snt, a_edges = a_sents[a_snt_ref]
        b_snt, b_edges = b_sents[b_snt_ref]

        # -------------------------------------------
        # Skip sentences if they are not found for whatever reason
        # (check before the type asserts, which would fail on None).
        # -------------------------------------------
        if not a_snt or not b_snt:
            continue

        assert isinstance(a_snt, Sentence)
        assert isinstance(b_snt, Sentence)

        # -------------------------------------------
        # Start constructing the IGT Instance.
        # -------------------------------------------

        trans_snt, trans_indices = a_snt, a_edges
        gloss_snt, gloss_indices = b_snt, b_edges
        if a_glossed:
            trans_snt, trans_indices = b_snt, b_edges
            gloss_snt, gloss_indices = a_snt, a_edges

        # For Hindi, rebuild the tokens, POS tags, and text from the loaded
        # sentences rather than from the retrieved igt_data.
        if hindi:
            lang_tokens = [w.text for w in gloss_snt]
            lang_postags   = [w.pos  for w in gloss_snt]
            lang_txt    = ' '.join(lang_tokens)

            trans_tokens = [w.text for w in trans_snt]
            trans_postags   = [w.pos  for w in trans_snt]
            trans_txt    = ' '.join(trans_tokens)

            gloss_tokens  = [w.gloss if w.gloss else 'NULL' for w in gloss_snt]
            gloss_postags = lang_postags
            gloss_txt     = ' '.join(gloss_tokens)
        inst = Igt(id=re.sub('s-', 'igt', a_snt_ref))
        nt   = Tier(type=ODIN_TIER_TYPE, id=NORM_ID, attributes={STATE_ATTRIBUTE:NORM_STATE})
        ll   = Item(id='n1', attributes={ODIN_TAG_ATTRIBUTE:ODIN_LANG_TAG}, text=lang_txt)
        gl   = Item(id='n2', attributes={ODIN_TAG_ATTRIBUTE:ODIN_GLOSS_TAG}, text=gloss_txt)
        tl   = Item(id='n3', attributes={ODIN_TAG_ATTRIBUTE:ODIN_TRANS_TAG}, text=trans_txt)
        nt.extend([ll,gl,tl])
        inst.append(nt)


        # -------------------------------------------
        # Handle the phrase tiers
        # -------------------------------------------
        generate_lang_phrase_tier(inst)
        generate_trans_phrase_tier(inst)

        def process_postags(sent, tokens):
            # Look up each token's POS tag by its 1-based position in the
            # sentence; record None when the word cannot be found.
            postags = []
            for i, token in enumerate(tokens):
                word = sent.getorder(i+1)
                if word is None:
                    postags.append(None)
                else:
                    postags.append(word.pos)
            return postags

        # -------------------------------------------
        # Now, handle the translation words.
        # -------------------------------------------
        tt = create_word_tier(ODIN_TRANS_TAG, trans_tokens, trans_phrase(inst)[0])
        inst.append(tt)

        if not hindi:
            trans_postags = process_postags(trans_snt, trans_tokens)

        add_pos_tags(inst, tt.id, trans_postags, tag_method=INTENT_POS_MANUAL)


        # -------------------------------------------
        # Handle the words tiers...
        # -------------------------------------------
        wt = create_word_tier(ODIN_LANG_TAG, lang_tokens, lang_phrase(inst)[0])
        gwt= create_word_tier(ODIN_GLOSS_TAG, gloss_tokens, gl)
        inst.extend([wt, gwt])
        # Quickly set the alignment for the gloss words.
        for w, gw in zip(wt, gwt):
            gw.alignment = w.id


        if not hindi:
            lang_postags = process_postags(gloss_snt, gloss_tokens)
            gloss_postags = lang_postags

        add_pos_tags(inst, wt.id, lang_postags, tag_method=INTENT_POS_MANUAL)
        add_pos_tags(inst, gwt.id, gloss_postags, tag_method=INTENT_POS_MANUAL)

        create_dt_tier(inst, assemble_ds(gloss_snt, gloss_indices), wt, INTENT_DS_MANUAL)
        create_dt_tier(inst, assemble_ds(trans_snt, trans_indices), tt, INTENT_DS_MANUAL)
        # -------------------------------------------
        # Now, the word alignments.
        # -------------------------------------------
        a = Alignment()
        for word_alignment in word_alignments:
            a_ref = word_alignment.find('./a.rf').text.split('#')[1]
            b_ref = word_alignment.find('./b.rf').text.split('#')[1]

            a_word = a_snt.getid(a_ref)
            b_word = b_snt.getid(b_ref)

            if a_word is None or b_word is None:
                continue

            if not hindi:
                a_idx  = a_word.order
                b_idx  = b_word.order
            else:
                a_idx  = a_snt.index(a_word)+1
                b_idx  = b_snt.index(b_word)+1

            # Orient the pair so the glossed document supplies the language
            # index and the other document supplies the translation index.
            if a_glossed:
                trans_idx = b_idx
                lang_idx  = a_idx
            else:
                trans_idx = a_idx
                lang_idx  = b_idx

            a.add((trans_idx, lang_idx))


        set_bilingual_alignment(inst, trans(inst), lang(inst), a, INTENT_ALN_MANUAL)
        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, INTENT_ALN_MANUAL)

        xc.append(inst)

    with open(out_path, 'w', encoding='utf-8') as f:
        xigtxml.dump(f, xc)
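
A usage sketch; both paths are placeholders, and the hindi flag selects between retrieve_hindi() and retrieve_naacl() as the IGT source:

# Convert the Hindi data (the default) ...
convert_pml('data/alignments.pml', 'output/hindi_corpus.xml', hindi=True)

# ... or the NAACL data.
convert_pml('data/naacl_alignments.pml', 'output/naacl_corpus.xml', hindi=False)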