Example #1
0
def raw_txt_to_xc(txt):
    """
    Build a XigtCorpus from raw text whose instances are separated by blank lines.

    The text is first passed through ``replace_invalid_xml``. It is then split
    into groups of consecutive non-blank lines; each group is handed to
    ``raw_txt_to_inst`` and the resulting instance is appended to the corpus.

    Lines containing only whitespace count as separators. Leading blank lines
    and runs of several blank lines do not produce empty instances.

    :param txt: the raw text to convert
    :rtype: XigtCorpus
    """
    print("Creating XIGT corpus from raw text...")
    xc = XigtCorpus()

    PARSELOG.debug("Replacing invalid XML...")
    data = replace_invalid_xml(txt)

    instances = []
    cur_lines = []

    for line in data.split('\n'):
        if line.strip():
            cur_lines.append(line)
        elif cur_lines:
            # A blank line closes the current instance. Only flush when
            # something has accumulated, so that consecutive (or leading)
            # blank lines do not yield empty instances.
            instances.append('\n'.join(cur_lines))
            cur_lines = []

    # The final instance is not necessarily followed by a blank line.
    if cur_lines:
        instances.append('\n'.join(cur_lines))

    for instance in instances:
        xc.append(raw_txt_to_inst(instance, corpus=xc))

    print("{} instances parsed.".format(len(xc)))
    return xc
Example #2
0
def parse_odin_xc(text, require_trans=True, require_gloss=True, require_lang=True, limit=None):
    """
    Read ODIN-style text and build a XigtCorpus from the instances it contains.

    An instance is any span of the text that starts with ``doc_id=`` and runs
    up to the next blank line (or the end of the text). Each span is parsed
    with ``parse_odin_inst``; instances that raise ``GlossLangAlignException``
    are skipped with a warning.

    :param text: the ODIN-style text to parse
    :param require_trans: if true, only keep instances that have translation lines
    :param require_gloss: if true, only keep instances that have a gloss line
    :param require_lang: if true, only keep instances that have a language line
    :param limit: if not None, stop once this many instances have been parsed.
        Instances skipped for alignment failure are not counted; instances
        rejected by the ``require_*`` filters are.
    :return: the corpus holding every instance that satisfied the requirements
    :rtype: XigtCorpus
    """
    # Initialize the corpus
    xc = XigtCorpus()

    # Replace invalid characters...
    data = replace_invalid_xml(text)

    # Split the text into per-instance chunks. The data is padded with a blank
    # line so that a final instance which is not followed by one is still
    # matched instead of being silently dropped.
    inst_txts = re.findall(r'doc_id=[\s\S]+?\n\n', data + '\n\n')

    #=======================================================================
    # Begin parsing...
    #=======================================================================

    parsed = 0
    PARSELOG.info('Beginning parse')
    for inst_num, inst_txt in enumerate(inst_txts):

        if parsed % 250 == 0:
            PARSELOG.info('Parsing instance %d...', parsed)

        # Handle the requirement for 1_to_1 alignment.
        try:
            i = parse_odin_inst(inst_txt, corpus=xc, idnum=inst_num)
        except GlossLangAlignException:
            PARSELOG.warning('Gloss and language could not be automatically aligned for instance "%s". Skipping',
                             gen_item_id('i', inst_num))
            continue

        # Try to get the translation line. ---------------------------------
        try:
            hastrans = trans_lines(i)
        except NoTransLineException as ntle:
            PARSELOG.info(ntle)
            hastrans = False

        # Try to get the gloss line. --------------------------------------
        try:
            hasgloss = i.gloss
        except NoGlossLineException as ngle:
            PARSELOG.info(ngle)
            hasgloss = False

        # Try to get the language line. ------------------------------------
        try:
            haslang = i.lang
        except NoLangLineException as nlle:
            PARSELOG.info(nlle)
            haslang = False

        parsed += 1

        # A constraint holds when the line is either not required or present.
        trans_constraint = (not require_trans) or hastrans
        gloss_constraint = (not require_gloss) or hasgloss
        lang_constraint = (not require_lang) or haslang

        if trans_constraint and gloss_constraint and lang_constraint:
            xc.append(i)
        else:
            PARSELOG.info('Requirements for instance "%s" were not satisfied. Skipping', i.id)

        # If we have reached the limit of instances that have been requested,
        # stop processing.
        if limit is not None and limit == parsed:
            break

    # Return the corpus
    return xc