def filter_xc(xc, require_lang=False, require_gloss=False, require_trans=False,
              require_aln=False, require_gloss_pos=False, require_grammatical=False,
              max_instances=0, prev_good_instances=0):
    """
    Copy the instances of `xc` that meet the requested requirements into a
    new XigtCorpus.

    Every instance is checked against the enabled requirements in a fixed
    order (lang, gloss, trans, alignment, grammaticality, gloss POS tags).
    The first requirement that is not met rejects the instance, and the
    reason is written to FILTER_LOG. Accepted instances are appended to the
    new corpus, logged, and have `sort_tiers()` called on them.

    :param xc: iterable of Igt instances to filter
    :param require_lang: reject instances for which `lang(inst)` is unavailable
    :param require_gloss: reject instances for which `gloss(inst)` is unavailable
    :param require_trans: reject instances for which `trans(inst)` is unavailable
    :param require_aln: reject instances whose gloss and lang tiers are missing
                        or cannot be word-aligned
    :param require_gloss_pos: reject instances with no POS tag tier for the gloss tier
    :param require_grammatical: reject instances in which any lang, gloss or
                                trans line carries ODIN_JUDGMENT_ATTRIBUTE
    :param max_instances: stop once `prev_good_instances` plus the number of
                          accepted instances reaches this value; 0 disables the cap
    :param prev_good_instances: accepted-instance count carried over from
                                earlier calls, used only for the cap
    :return: tuple `(new_corpus, examined, successes, failures)`
    """
    new_corp = XigtCorpus()

    examined = 0
    failures = 0
    successes = 0

    def fail(inst, reason):
        """Count a rejected instance and log why it was rejected."""
        nonlocal failures
        message = filter_string(inst).format("FAIL", '[' + reason + ']')
        failures += 1
        FILTER_LOG.info(message)

    def trytier(f, inst):
        """Return `f(inst)`, or None when the normalized line is missing."""
        try:
            return f(inst)
        except NoNormLineException:
            # A missing tier is not a failure by itself; it only matters
            # if one of the require_* flags below asks for that tier.
            return None

    for inst in xc:
        examined += 1
        assert isinstance(inst, Igt)

        lt = trytier(lang, inst)
        gt = trytier(gloss, inst)
        tt = trytier(trans, inst)

        if require_lang and lt is None:
            fail(inst, "LANG")
            continue

        if require_gloss and gt is None:
            fail(inst, "GLOSS")
            continue

        if require_trans and tt is None:
            fail(inst, "TRANS")
            continue

        if require_aln:
            if gt is None:
                fail(inst, "ALIGN-GLOSS")
                continue
            if lt is None:
                fail(inst, "ALIGN-LANG")
                continue
            try:
                word_align(gt, lt)
            except GlossLangAlignException:
                fail(inst, "ALIGN")
                continue

        if require_grammatical:
            # Start from "no judgment found" for every instance. Without this
            # reset a missing tier would leave the name unbound (or holding
            # the previous instance's value) when tested below.
            judged_lang = []
            judged_gloss = None
            judged_trans = []

            if lt:
                judged_lang = [l for l in lang_lines(inst)
                               if l.get_attribute(ODIN_JUDGMENT_ATTRIBUTE)]
            if gt:
                judged_gloss = gloss_line(inst).get_attribute(ODIN_JUDGMENT_ATTRIBUTE)
            if tt:
                judged_trans = [l for l in trans_lines(inst)
                                if l.get_attribute(ODIN_JUDGMENT_ATTRIBUTE)]

            if judged_lang or judged_gloss or judged_trans:
                fail(inst, "UNGRAMMATICAL")
                continue

        if require_gloss_pos:
            # Without a gloss tier there is nothing to look POS tags up on,
            # so reject the instance instead of dereferencing None.
            if gt is None or pos_tag_tier(inst, gt.id) is None:
                fail(inst, "GLOSS_POS")
                continue

        # Otherwise, attach to the new corpus.
        new_corp.append(inst)

        message = filter_string(inst).format("SUCCESS", "")
        successes += 1
        FILTER_LOG.info(message)
        inst.sort_tiers()

        # -------------------------------------------
        # Break out of the loop if we've hit the maximum
        # number of good instances.
        # -------------------------------------------
        if max_instances and prev_good_instances + successes >= max_instances:
            break

    return new_corp, examined, successes, failures
def parse_odin_xc(text, require_trans=True, require_gloss=True, require_lang=True, limit=None):
    """
    Read ODIN-style text and build a XigtCorpus from it.

    The text is cleaned with `replace_invalid_xml` and split into chunks that
    start at ``doc_id=`` and end at the next blank line. Each chunk is parsed
    with `parse_odin_inst`. Instances whose gloss and language lines cannot be
    aligned are skipped. The remaining instances are appended to the corpus
    only if they have every tier that is required.

    :param text: contents of the ODIN-style text file
    :param require_trans: only keep instances that have translation lines
    :param require_gloss: only keep instances that have a gloss
    :param require_lang: only keep instances that have a language line
    :param limit: stop once this many instances have been parsed; instances
                  rejected by the requirements still count, instances skipped
                  for alignment errors do not. None means no limit.
    :return: the XigtCorpus holding the accepted instances
    """
    # Initialize the corpus
    xc = XigtCorpus()

    # Replace invalid characters...
    data = replace_invalid_xml(text)

    # Split the text into one chunk per instance. The pattern is a raw string
    # so that \s and \S reach the regex engine as written.
    # NOTE(review): a final instance that is not followed by a blank line is
    # not matched by this pattern -- confirm inputs always end with one.
    inst_txts = re.findall(r'doc_id=[\s\S]+?\n\n', data)

    #=======================================================================
    # Begin parsing...
    #=======================================================================

    parsed = 0
    PARSELOG.info('Beginning parse')
    for inst_num, inst_txt in enumerate(inst_txts):

        if parsed % 250 == 0:
            PARSELOG.info('Parsing instance %d...' % parsed)

        # Handle the requirement for 1_to_1 alignment.
        try:
            i = parse_odin_inst(inst_txt, corpus=xc, idnum=inst_num)
        except GlossLangAlignException:
            PARSELOG.warning('Gloss and language could not be automatically aligned for instance "%s". Skipping' % gen_item_id('i', inst_num))
            continue

        # Try to get the translation line. ---------------------------------
        try:
            hastrans = trans_lines(i)
        except NoTransLineException as ntle:
            PARSELOG.info(ntle)
            hastrans = False

        # Try to get the gloss line. --------------------------------------
        try:
            hasgloss = i.gloss
        except NoGlossLineException as ngle:
            PARSELOG.info(ngle)
            hasgloss = False

        # Try to get the language line. ------------------------------------
        try:
            haslang = i.lang
        except NoLangLineException as nlle:
            PARSELOG.info(nlle)
            haslang = False

        parsed += 1

        # A requirement is satisfied when it is switched off or the tier exists.
        trans_ok = hastrans or not require_trans
        gloss_ok = hasgloss or not require_gloss
        lang_ok = haslang or not require_lang

        if trans_ok and gloss_ok and lang_ok:
            xc.append(i)
        else:
            PARSELOG.info('Requirements for instance "%s" were not satisfied. Skipping' % i.id)

        # If we have reached the limit of instances that have been requested,
        # stop processing.
        if limit is not None and limit == parsed:
            break

    # Return the corpus
    return xc