コード例 #1
0
ファイル: filter.py プロジェクト: rgeorgi/intent
def filter_xc(xc, require_lang=False, require_gloss=False, require_trans=False,
              require_aln=False, require_gloss_pos=False, require_grammatical=False,
              max_instances=0, prev_good_instances=0):
    """
    Copy the instances of *xc* that satisfy the requested requirements into a
    new XigtCorpus.

    Every instance is checked, in order, against each enabled requirement; the
    first unmet requirement records a failure (logged to FILTER_LOG) and the
    instance is skipped.

    :param xc: Iterable of Igt instances to filter.
    :param require_lang: Reject instances for which ``lang(inst)`` raises NoNormLineException.
    :param require_gloss: Reject instances for which ``gloss(inst)`` raises NoNormLineException.
    :param require_trans: Reject instances for which ``trans(inst)`` raises NoNormLineException.
    :param require_aln: Reject instances lacking a gloss or lang tier, or for
                        which ``word_align`` raises GlossLangAlignException.
    :param require_gloss_pos: Reject instances for which ``pos_tag_tier`` on the
                              gloss tier returns None (or that have no gloss tier).
    :param require_grammatical: Reject instances where any lang, gloss, or trans
                                line carries the ODIN judgment attribute.
    :param max_instances: If nonzero, stop once ``prev_good_instances`` plus the
                          successes in this call reach this number.
    :param prev_good_instances: Count of good instances already collected
                                before this call; used with *max_instances*.
    :return: Tuple ``(new_corpus, examined, successes, failures)``.
    """
    new_corp = XigtCorpus()

    examined = 0
    failures = 0
    successes = 0

    def fail(inst, reason):
        """Log *inst* as a failure for *reason* and bump the failure count."""
        nonlocal failures
        message = filter_string(inst).format("FAIL", '[' + reason + ']')
        failures += 1
        FILTER_LOG.info(message)

    def trytier(inst, f):
        """Return ``f(inst)``, or None when the instance has no normalized lines."""
        try:
            return f(inst)
        except NoNormLineException:
            # A missing tier is not a failure by itself; it only becomes one
            # below if the corresponding require_* flag is set.
            return None

    for inst in xc:
        examined += 1
        assert isinstance(inst, Igt)

        lt = trytier(inst, lang)
        gt = trytier(inst, gloss)
        tt = trytier(inst, trans)

        if require_lang and lt is None:
            fail(inst, "LANG")
            continue
        if require_gloss and gt is None:
            fail(inst, "GLOSS")
            continue
        if require_trans and tt is None:
            fail(inst, "TRANS")
            continue

        if require_aln:
            if gt is None:
                fail(inst, "ALIGN-GLOSS")
                continue
            if lt is None:
                fail(inst, "ALIGN-LANG")
                continue

            try:
                word_align(gt, lt)
            except GlossLangAlignException:
                fail(inst, "ALIGN")
                continue

        if require_grammatical:
            # Reset for every instance: a missing tier must not raise
            # UnboundLocalError, nor reuse the judgments found on a
            # previously examined instance.
            grammatical_ll = []
            grammatical_gl = None
            grammatical_tl = []

            if lt:
                grammatical_ll = [l for l in lang_lines(inst) if l.get_attribute(ODIN_JUDGMENT_ATTRIBUTE)]
            if gt:
                grammatical_gl = gloss_line(inst).get_attribute(ODIN_JUDGMENT_ATTRIBUTE)
            if tt:
                grammatical_tl = [l for l in trans_lines(inst) if l.get_attribute(ODIN_JUDGMENT_ATTRIBUTE)]

            # Any judgment attribute on any line disqualifies the instance.
            if grammatical_ll or grammatical_gl or grammatical_tl:
                fail(inst, "UNGRAMMATICAL")
                continue

        if require_gloss_pos:
            # Without a gloss tier there is nothing to look POS tags up on;
            # treat that as a failure instead of dereferencing None.
            if gt is None or pos_tag_tier(inst, gt.id) is None:
                fail(inst, "GLOSS_POS")
                continue

        # Otherwise, attach to the new corpus.
        new_corp.append(inst)

        success_message = filter_string(inst).format("SUCCESS", "")
        successes += 1
        FILTER_LOG.info(success_message)
        inst.sort_tiers()

        # -------------------------------------------
        # Break out of the loop if we've hit the maximum
        # number of good instances.
        # -------------------------------------------
        if max_instances and prev_good_instances + successes >= max_instances:
            break

    return new_corp, examined, successes, failures
コード例 #2
0
ファイル: parsing.py プロジェクト: rgeorgi/intent
def parse_odin_xc(text, require_trans = True, require_gloss = True, require_lang = True, limit = None):
    """
    Read in an ODIN-style text file's contents to create the xigt corpus.

    The text is split into instance chunks (each starting at ``doc_id=`` and
    ending at the first blank line), each chunk is parsed with
    ``parse_odin_inst``, and the instances that meet the requested
    requirements are appended to the returned corpus.

    :param text: The ODIN-style text to parse.
    :param require_trans: Only keep instances that have translation lines.
    :param require_gloss: Only keep instances that have a gloss line.
    :param require_lang: Only keep instances that have a language line.
    :param limit: If not None, stop once this many instances have been parsed
                  (counted whether or not they were kept; instances skipped
                  for alignment errors are not counted).
    :return: The populated XigtCorpus.
    """
    # Initialize the corpus
    xc = XigtCorpus()

    # Replace invalid characters...
    data = replace_invalid_xml(text)

    # Read all the text lines. Raw string so that \s and \S reach the regex
    # engine as-is instead of being (invalid) string escapes.
    inst_txts = re.findall(r'doc_id=[\s\S]+?\n\n', data)

    #=======================================================================
    # Begin parsing...
    #=======================================================================

    parsed = 0
    PARSELOG.info('Beginning parse')
    for inst_num, inst_txt in enumerate(inst_txts):

        # Progress message, keyed on the number of instances parsed so far.
        if parsed % 250 == 0:
            PARSELOG.info('Parsing instance %d...', parsed)

        # Handle the requirement for 1_to_1 alignment.
        try:
            i = parse_odin_inst(inst_txt, corpus=xc, idnum=inst_num)
        except GlossLangAlignException:
            # NOTE(review): assumes PARSELOG is a stdlib logging.Logger;
            # `warning` replaces the deprecated `warn` alias.
            PARSELOG.warning('Gloss and language could not be automatically aligned for instance "%s". Skipping',
                             gen_item_id('i', inst_num))
            continue

        # Try to get the translation line. ---------------------------------
        try:
            hastrans = trans_lines(i)
        except NoTransLineException as ntle:
            PARSELOG.info(ntle)
            hastrans = False

        # Try to get the gloss line. --------------------------------------
        try:
            hasgloss = i.gloss
        except NoGlossLineException as ngle:
            PARSELOG.info(ngle)
            hasgloss = False

        # Try to get the language line. ------------------------------------
        try:
            haslang = i.lang
        except NoLangLineException as nlle:
            PARSELOG.info(nlle)
            haslang = False

        parsed += 1

        # A constraint holds when the line is present or was never required.
        trans_constraint = hastrans or not require_trans
        gloss_constraint = hasgloss or not require_gloss
        lang_constraint = haslang or not require_lang

        if trans_constraint and gloss_constraint and lang_constraint:
            xc.append(i)
        else:
            PARSELOG.info('Requirements for instance "%s" were not satisfied. Skipping', i.id)

        # If we have reached the limit of instances that have been requested,
        # stop processing.
        if limit is not None and limit == parsed:
            break

    # Return the corpus
    return xc