def test_ds_cycle(self):
    """
    Check dependency-structure handling for the ds_cycle fixture.

    The tree in the ds_cycle file has "woman" depend both
    on "arriving" and "browse."  The expected tree built below lists
    woman[2] twice: once under began[7] and once under browse[9].
    The stored structure must be structurally equal to that tree, and
    project_ds_tier() must return None for this instance.
    """
    xc = xc_load(ds_cycle)
    inst = xc[0]

    #  1    2       4       5        7   8    9
    # The woman, (after) arriving, began to browse.
    # (The commas count as words, hence the skipping)

    # Raw string: the tree syntax uses "\(" and "\)", which are invalid
    # escape sequences in a normal string literal (SyntaxWarning on
    # Python 3.12+). The runtime value is unchanged.
    tgt_t = DepTree.fromstring(
        r""" (ROOT[0] (began[7] (woman[2] (The[1]) (\(after\)[4] (arriving[5]))) (browse[9] (woman[2]) (to[8]) ) )) """,
        stype=DEPSTR_PTB)

    ds = get_ds(inst, trans(inst))
    self.assertTrue(tgt_t.structurally_eq(ds))

    self.assertIsNone(project_ds_tier(inst))
def set_bilingual_align_test(self):
    """
    Store a hand-built translation/gloss alignment on the test instance
    and then read the manual alignment back.
    """
    index_pairs = [(1, 1), (1, 2), (2, 8), (4, 3), (5, 7), (6, 5)]
    manual_aln = Alignment(index_pairs)

    set_bilingual_alignment(self.igt,
                            trans(self.igt),
                            glosses(self.igt),
                            manual_aln,
                            INTENT_ALN_MANUAL)

    # NOTE(review): the value read back is not compared against the
    # alignment that was stored, so this only verifies that the calls do
    # not raise -- confirm whether an equality assertion was intended.
    get_trans_glosses_alignment(self.igt, INTENT_ALN_MANUAL)
def test_read_proj_ds_tree(self):
    """
    Project the translation-line dependency tree of inst2 onto its
    language-line words and compare the result with the expected tree.
    """
    # Inputs for the projection: source tree, target words, alignment.
    source_tree = get_ds(self.inst2, trans(self.inst2))
    target_words = lang(self.inst2)
    alignment = get_trans_gloss_alignment(self.inst2)

    expected_tree = DepTree.fromstring(
        """ (ROOT[0] (glaubst[2] (Was[1]) (Du[3]) (wer[4]) (angerufen[5] (hat[6])) )) """,
        stype=DEPSTR_PTB)

    projected_tree = project_ds(source_tree, target_words, alignment)
    self.assertTrue(projected_tree.structurally_eq(expected_tree))
def extract_sents_from_inst(inst: Igt, out_src, out_tgt, aln_method=None, no_alignment_heur=True, sent_type=SENT_TYPE_T_G):
    """
    Write one parallel sentence pair for an instance to two streams.

    The source side is always the translation line. The target side is
    the language line (SENT_TYPE_T_L) or the gloss line (SENT_TYPE_T_G).
    Both are lowercased and written one per line, then both streams are
    flushed. Unless ``no_alignment_heur`` is true, the aligned word pairs
    returned by get_trans_aligned_wordpairs() are appended as additional
    one-word "sentences".

    :param inst: instance to extract from
    :param out_src: writable stream for the source (translation) side
    :param out_tgt: writable stream for the target side
    :param aln_method: passed through to get_trans_aligned_wordpairs()
    :param no_alignment_heur: when true, skip the word-pair lines
    :param sent_type: SENT_TYPE_T_G or SENT_TYPE_T_L
    :raises Exception: if ``sent_type`` is neither of the two known types
    """

    def lowered_text(tier):
        # Tier text with token-internal whitespace removed, lowercased.
        return tier_text(tier, remove_whitespace_inside_tokens=True).lower()

    # The source string is always the translation.
    src_str = lowered_text(trans(inst))

    # Pick the accessor for the target side.
    if sent_type == SENT_TYPE_T_L:
        target_tier_of = lang
    elif sent_type == SENT_TYPE_T_G:
        target_tier_of = gloss
    else:
        raise Exception("Invalid sent type")
    tgt_str = lowered_text(target_tier_of(inst))

    # Write the sentence pair out and flush it.
    out_src.write(src_str + '\n')
    out_tgt.write(tgt_str + '\n')
    out_src.flush()
    out_tgt.flush()

    # Nothing more to do unless heuristic word pairs were asked for.
    if no_alignment_heur:
        return

    word_pairs = get_trans_aligned_wordpairs(inst,
                                             aln_method=aln_method,
                                             add_align=True,
                                             sent_type=sent_type)
    for src_word, tgt_word in word_pairs:
        out_src.write(src_word.lower() + '\n')
        out_tgt.write(tgt_word.lower() + '\n')
def enrich(**kwargs):
    """
    Enrich the instances of a XIGT corpus and write the corpus back out.

    Depending on the requested methods, the translation line is POS tagged
    and parsed, the gloss line is classified, gloss/language and
    translation/gloss alignments are added, and POS tags and parses are
    projected. One status line is logged per instance: 'OK', or 'WARN'
    followed by the comma-joined failure codes.

    Keyword arguments read here:

    * ``ARG_OUTFILE``: output path, or an object with a ``write``
      attribute. If absent, a critical message is logged and the process
      exits.
    * ``ARG_INFILE``: path of the XIGT-XML input (opened as UTF-8).
    * ``PARSE_VAR`` / ``POS_VAR`` / ``ALN_VAR``: requested parse, POS and
      alignment methods (each defaults to an empty list).
    * ``'max_parse_length'``: translations longer than this are not
      parsed and are flagged OVER_MAX_LENGTH (default 10).
    * ``ALN_SYM_VAR``: symmetrization for giza alignment (default
      SYMMETRIC_INTERSECT).
    * ``'proj_aln'``: key into ALN_ARG_MAP choosing which alignment to
      project through (default ARG_ALN_ANY).
    * ``'class_path'``: read but currently unused (see note below).

    :raises GizaAlignmentException: re-raised after being logged.
    """
    global classifier

    if ARG_OUTFILE not in kwargs:
        ENRICH_LOG.critical("No output file specified.")
        # NOTE(review): exits with status 0 even though this is an error
        # path -- confirm whether a non-zero status is wanted.
        sys.exit()

    # =============================================================================
    # Set up the alternate classifier path...
    # =============================================================================
    # NOTE(review): class_path is never used below; the classifier is always
    # built from the module-level ``classifier``. Confirm the intent.
    class_path = kwargs.get('class_path')

    # ===========================================================================
    # Set up the different arguments...
    # ===========================================================================
    inpath = kwargs.get(ARG_INFILE)

    parse_args = kwargs.get(PARSE_VAR, [])
    pos_args = kwargs.get(POS_VAR, [])
    aln_args = kwargs.get(ALN_VAR, [])

    max_parse_length = kwargs.get('max_parse_length', 10)

    if not (parse_args or pos_args or aln_args):
        ENRICH_LOG.warning("No enrichment specified. Basic processing only will be performed.")

    # ===========================================================================
    # Sanity check the arguments.
    # ===========================================================================

    # Check that alignment is asked for if projection is asked for.
    if (ARG_POS_PROJ in pos_args or ARG_PARSE_PROJ in parse_args) and (not aln_args):
        # Logger.warn() is a deprecated alias that newer Python versions no
        # longer provide; use warning(), as the call above already does.
        ENRICH_LOG.warning("You have asked for projection methods but have not requested "
                           "alignments to be generated. Projection may fail if alignment not already present in file.")

    # -------------------------------------------
    # Reasons an instance can be flagged (loop-invariant, so defined once).
    # -------------------------------------------
    F_GLOSS_LINE = "NOGLOSS"
    F_LANG_LINE = "NOLANG"
    F_TRANS_LINE = "NOTRANS"
    F_BAD_LINES = "BADLINES"
    F_L_G_ALN = "L_G_ALIGN"
    F_T_G_ALN = "G_T_ALIGN"
    F_NO_TRANS_POS = "NO_POS_TRANS"
    F_PROJECTION = "PROJECTION"
    F_UNKNOWN = "UNKNOWN"
    F_PARSELEN = "OVER_MAX_LENGTH"

    def norm_line_or_none(getter, instance):
        """Return getter(instance), or None if it raises NoNormLineException."""
        try:
            return getter(instance)
        except NoNormLineException:
            return None

    ENRICH_LOG.log(1000, 'Loading input file...')
    # The corpus is loaded in INCREMENTAL mode, so everything that touches
    # ``corp`` stays inside the ``with`` block while the file is still open.
    with open(inpath, 'r', encoding='utf-8') as in_f:
        corp = xigtxml.load(in_f, mode=INCREMENTAL)

        # -------------------------------------------
        # Initialize the English tagger if:
        #   A) "proj" option is selected for pos.
        #   B) "trans" option is given for pos.
        #   C) "heurpos" option is given for alignment.
        # -------------------------------------------
        need_trans_tags = (ARG_POS_PROJ in pos_args
                           or ARG_POS_TRANS in pos_args
                           or ARG_ALN_HEURPOS in aln_args)
        s = None
        if need_trans_tags:
            ENRICH_LOG.log(1000, 'Initializing tagger...')
            tagger = c.getpath('stanford_tagger_trans')

            try:
                s = StanfordPOSTagger(tagger)
            except TaggerError as te:
                ENRICH_LOG.critical(te)
                sys.exit(2)

        # -------------------------------------------
        # Initialize the parser if:
        #   A) "trans" option is given for parse
        #   B) "proj" option is given for parse.
        # -------------------------------------------
        need_trans_parse = ARG_PARSE_TRANS in parse_args or ARG_PARSE_PROJ in parse_args
        if need_trans_parse:
            ENRICH_LOG.log(1000, "Intializing English parser...")
            sp = stanford_parser.StanfordParser()

        # -------------------------------------------
        # Initialize the classifier if:
        #   A) "class" option is given for pos
        #   B) "heurpos" option is given for alignment.
        # -------------------------------------------
        need_gloss_class = ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args
        m = None
        if need_gloss_class:
            ENRICH_LOG.log(1000, "Initializing gloss-line classifier...")
            p = load_posdict()
            m = mallet_maxent.MalletMaxent(classifier)

        # -- 1b) Giza Gloss to Translation alignment --------------------------------------
        if ARG_ALN_GIZA in aln_args or ARG_ALN_GIZAHEUR in aln_args:
            ENRICH_LOG.log(1000, 'Aligning gloss and translation lines using mgiza++...')

            try:
                if ARG_ALN_GIZAHEUR in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=True,
                                   symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
                if ARG_ALN_GIZA in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=False,
                                   symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
            except GizaAlignmentException as gae:
                logging.getLogger('giza').critical(str(gae))
                raise

        # -------------------------------------------
        # Begin iterating through the corpus
        # -------------------------------------------
        for inst in corp:

            feedback_string = 'Instance {:15s}: {{:20s}}{{}}'.format(inst.id)

            # Distinct failure codes collected for this instance.
            reasons = []

            def fail(reason):
                if reason not in reasons:
                    reasons.append(reason)

            try:
                # -------------------------------------------
                # Get the different lines
                # -------------------------------------------
                gl = norm_line_or_none(gloss_line, inst)
                tls = norm_line_or_none(trans_lines, inst)
                lls = norm_line_or_none(lang_lines, inst)

                has_gl = gl is not None
                has_tl = tls is not None
                has_ll = lls is not None
                has_all = has_gl and has_tl and has_ll

                # -------------------------------------------
                # Translation Line
                # -------------------------------------------
                if has_tl:

                    if need_trans_tags:
                        try:
                            tag_trans_pos(inst, s)
                        except CriticalTaggerError as cte:
                            ENRICH_LOG.critical(str(cte))
                            # SystemExit is not an Exception subclass, so the
                            # catch-all handler below does not swallow it.
                            sys.exit(2)

                    if need_trans_parse:
                        if len(trans(inst)) <= max_parse_length:
                            parse_translation_line(inst, sp, pt=True, dt=True)
                        else:
                            fail(F_PARSELEN)

                # 4) POS tag the gloss line --------------------------------------------
                if has_gl and need_gloss_class:
                    classify_gloss_pos(inst, m, posdict=p)

                # -------------------------------------------
                # Try getting alignments.
                # -------------------------------------------
                if has_gl and has_ll:
                    try:
                        add_gloss_lang_alignments(inst)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                if has_gl and has_tl:
                    if ARG_ALN_HEURPOS in aln_args:
                        heur_align_inst(inst, use_pos=True)
                    if ARG_ALN_HEUR in aln_args:
                        heur_align_inst(inst, use_pos=False)

                # -------------------------------------------
                # Now, do the necessary projection tasks.
                # -------------------------------------------

                # Project the classifier tags...
                if has_ll and has_gl and ARG_POS_CLASS in pos_args:
                    try:
                        project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_CLASS)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                # -------------------------------------------
                # Do the trans-to-lang projection...
                # -------------------------------------------
                if has_all:
                    proj_aln_method = ALN_ARG_MAP[kwargs.get('proj_aln', ARG_ALN_ANY)]
                    aln = get_trans_gloss_alignment(inst, aln_method=proj_aln_method)
                    if not aln or len(aln) == 0:
                        fail(F_T_G_ALN)
                    else:
                        # -------------------------------------------
                        # POS Projection
                        # -------------------------------------------
                        if ARG_POS_PROJ in pos_args:
                            trans_tags = trans_tag_tier(inst)

                            if not trans_tags:
                                fail(F_NO_TRANS_POS)
                            else:
                                project_trans_pos_to_gloss(inst)
                                try:
                                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                                except GlossLangAlignException:
                                    fail(F_L_G_ALN)

                        # -------------------------------------------
                        # Parse projection
                        # -------------------------------------------
                        if ARG_PARSE_PROJ in parse_args:
                            try:
                                project_pt_tier(inst, proj_aln_method=proj_aln_method)
                            except PhraseStructureProjectionException:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError:
                                fail(F_T_G_ALN)

                            try:
                                project_ds_tier(inst, proj_aln_method=proj_aln_method)
                            except ProjectionException:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError:
                                fail(F_T_G_ALN)

                # Sort the tiers... ----------------------------------------------------
                inst.sort_tiers()

            except Exception as e:
                # Deliberate best-effort: one bad instance must not abort the
                # whole run; it is flagged as UNKNOWN and processing continues.
                ENRICH_LOG.debug(e)
                fail(F_UNKNOWN)

            # Any recorded reason makes the instance a WARN; otherwise OK.
            inst_status = 'WARN' if reasons else 'OK'

            ENRICH_LOG.info(feedback_string.format(inst_status, ','.join(reasons)))

        ENRICH_LOG.log(1000, 'Writing output file...')

        # The output may be an already-open stream or a path.
        out_target = kwargs.get(ARG_OUTFILE)
        if hasattr(out_target, 'write'):
            xigtxml.dump(out_target, corp)
        else:
            xigtxml.dump(writefile(out_target), corp)

        ENRICH_LOG.log(1000, 'Done.')
        ENRICH_LOG.log(1000, "{} instances written.".format(len(corp)))
def naacl_to_xigt(naacl_path):
    """
    Convert the NAACL format to XIGT.

    Every span of the file matching ``Igt_id ... Q6 ... Answer`` becomes
    one instance. Lines 2-4 of the span are taken as the raw language,
    gloss and translation lines. The Q6 and Q3 sections are parsed as the
    language and English dependency structures, and the Q5 section as the
    manual translation/gloss word alignment.

    :param naacl_path: path of the NAACL-format file to read
    :return: the XigtCorpus holding the converted instances
    """
    # Close the handle deterministically instead of leaking it.
    with open(naacl_path, 'r') as naacl_f:
        content = naacl_f.read()

    # First, collect all the instances.
    instances = re.findall(r'Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        inst = Igt(id='i{}'.format(len(xc)))

        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Now, create the raw tier...
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin', attributes={STATE_ATTRIBUTE: RAW_STATE})
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=lang_raw, attributes={ODIN_TAG_ATTRIBUTE: ODIN_LANG_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=gloss_raw, attributes={ODIN_TAG_ATTRIBUTE: ODIN_GLOSS_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=trans_raw, attributes={ODIN_TAG_ATTRIBUTE: ODIN_TRANS_TAG}))

        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Lang Dependency representation handling...
        lang_ds_str = re.search(r'Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]

        # NOTE(review): parse_method is INTENT_POS_MANUAL here although this
        # creates a dependency tier -- confirm that is the intended constant.
        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError):
            # Best effort: skip a dependency structure that cannot be built.
            pass

        # Eng DS handling...
        eng_ds_str = re.search(r'Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]

        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError, ValueError):
            pass

        # Add Alignment...
        biling_aln_str = re.search(r'Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        # Indices are shifted down by one when the raw line starts with a space.
        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        # Created outside the try so it is always bound for the call below.
        a = Alignment()
        try:
            for line in biling_aln_lines:
                gloss_s, trans_s = line.split()[0:2]

                if '.' in gloss_s:
                    continue

                gloss_i = int(gloss_s)
                # Apply the gloss offset once per row. Doing it inside the
                # loop below shifted the gloss index again for every
                # additional translation token on the same row.
                if gloss_offset:
                    gloss_i -= 1

                for trans_token in trans_s.split(','):
                    trans_i = int(trans_token)
                    if trans_i == 0:
                        continue
                    if trans_offset:
                        trans_i -= 1
                    a.add((trans_i, gloss_i))
        except (ValueError, IndexError):
            # Best effort: a malformed row ends parsing for this instance;
            # the pairs collected so far are kept.
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, aln_method=INTENT_ALN_MANUAL)

    return xc
def line_test(self):
    """
    Check the rendered text of the gloss and translation tiers of the
    test instance.
    """
    expected_gloss = 'I-Nom child-Dat rice-Acc eat-Caus-Pst-Dec'
    expected_trans = 'I made the child eat rice'

    rendered_gloss = tier_text(gloss(self.igt))
    self.assertEqual(rendered_gloss, expected_gloss)

    rendered_trans = tier_text(trans(self.igt))
    self.assertEqual(rendered_trans, expected_trans)
def convert_pml(aln_path, out_path, hindi=True):
    """
    Convert a PML sentence-alignment file into a XIGT-XML corpus.

    The alignment file names two documents (``document_a`` and
    ``document_b``), resolved relative to the directory of ``aln_path``;
    exactly one of them must carry glosses. For every aligned sentence
    pair whose id is present in the IGT data, an instance is built with a
    normalized tier, phrase and word tiers, POS tags, dependency
    structures and manual bilingual alignments. The corpus is then
    written to ``out_path`` as UTF-8.

    :param aln_path: path of the PML alignment file
    :param out_path: path of the XIGT-XML file to write
    :param hindi: when true, IGT data comes from retrieve_hindi() and the
        tokens, POS tags and alignment indices are taken from the PML
        sentences; otherwise retrieve_naacl() supplies the text and word
        ``order`` values supply the indices
    """
    igt_data = retrieve_hindi() if hindi else retrieve_naacl()

    a_root = load_xml(aln_path)
    doc_a = a_root.find(".//reffile[@name='document_a']").get('href')
    doc_b = a_root.find(".//reffile[@name='document_b']").get('href')

    # Both documents are referenced relative to the alignment file.
    aln_dir = os.path.dirname(aln_path)
    doc_a = os.path.join(aln_dir, doc_a)
    doc_b = os.path.join(aln_dir, doc_b)

    # Load the sentences for each document.
    a_sents, a_glossed = load_sents(doc_a)
    b_sents, b_glossed = load_sents(doc_b)

    sent_alignments = a_root.findall(".//body/LM")

    assert (a_glossed and not b_glossed) or (b_glossed and not a_glossed), "Only one file should have glosses"

    def pos_by_order(sent, tokens):
        # One POS tag per token position (1-based order lookup); None
        # where the sentence has no word at that order.
        tags = []
        for order in range(1, len(tokens) + 1):
            found = sent.getorder(order)
            tags.append(None if found is None else found.pos)
        return tags

    xc = XigtCorpus()

    for sent_alignment in sent_alignments:

        # Get the sentence id...
        aln_id = sent_alignment.attrib.get('id')
        a_snt_id = re.search('^.+?-(.*)$', aln_id).group(1)
        if a_snt_id not in igt_data:
            continue

        # Get the text and tokens from the naacl data.
        pre_txt, lang_txt, gloss_txt, trans_txt = igt_data[a_snt_id]
        lang_tokens = lang_txt.split()
        gloss_tokens = gloss_txt.split()
        trans_tokens = trans_txt.split()

        a_snt_ref = sent_alignment.find('./tree_a.rf').text.split('#')[1]
        b_snt_ref = sent_alignment.find('./tree_b.rf').text.split('#')[1]

        word_alignments = sent_alignment.findall('./node_alignments/LM')

        a_snt, a_edges = a_sents[a_snt_ref]
        b_snt, b_edges = b_sents[b_snt_ref]

        assert isinstance(a_snt, Sentence)
        assert isinstance(b_snt, Sentence)

        # Skip sentences if they are not found for whatever reason
        if not (a_snt and b_snt):
            continue

        # -------------------------------------------
        # Start constructing the IGT Instance.
        # -------------------------------------------

        # Whichever document is glossed supplies the gloss/language side.
        if a_glossed:
            trans_snt, trans_indices = b_snt, b_edges
            gloss_snt, gloss_indices = a_snt, a_edges
        else:
            trans_snt, trans_indices = a_snt, a_edges
            gloss_snt, gloss_indices = b_snt, b_edges

        # Hindi stuff: text, tokens and tags come from the PML sentences.
        if hindi:
            lang_tokens = [w.text for w in gloss_snt]
            lang_postags = [w.pos for w in gloss_snt]
            lang_txt = ' '.join(lang_tokens)

            trans_tokens = [w.text for w in trans_snt]
            trans_postags = [w.pos for w in trans_snt]
            trans_txt = ' '.join(trans_tokens)

            gloss_tokens = [w.gloss if w.gloss else 'NULL' for w in gloss_snt]
            gloss_postags = lang_postags
            gloss_txt = ' '.join(gloss_tokens)

        inst = Igt(id=re.sub('s-', 'igt', a_snt_ref))

        norm_tier = Tier(type=ODIN_TIER_TYPE, id=NORM_ID, attributes={STATE_ATTRIBUTE: NORM_STATE})
        lang_item = Item(id='n1', attributes={ODIN_TAG_ATTRIBUTE: ODIN_LANG_TAG}, text=lang_txt)
        gloss_item = Item(id='n2', attributes={ODIN_TAG_ATTRIBUTE: ODIN_GLOSS_TAG}, text=gloss_txt)
        trans_item = Item(id='n3', attributes={ODIN_TAG_ATTRIBUTE: ODIN_TRANS_TAG}, text=trans_txt)
        norm_tier.extend([lang_item, gloss_item, trans_item])
        inst.append(norm_tier)

        # -------------------------------------------
        # Handle the phrase tiers
        # -------------------------------------------
        generate_lang_phrase_tier(inst)
        generate_trans_phrase_tier(inst)

        # -------------------------------------------
        # Now, handle the translation words.
        # -------------------------------------------
        trans_words = create_word_tier(ODIN_TRANS_TAG, trans_tokens, trans_phrase(inst)[0])
        inst.append(trans_words)

        if not hindi:
            trans_postags = pos_by_order(trans_snt, trans_tokens)
        add_pos_tags(inst, trans_words.id, trans_postags, tag_method=INTENT_POS_MANUAL)

        # -------------------------------------------
        # Handle the words tiers...
        # -------------------------------------------
        lang_words = create_word_tier(ODIN_LANG_TAG, lang_tokens, lang_phrase(inst)[0])
        gloss_words = create_word_tier(ODIN_GLOSS_TAG, gloss_tokens, gloss_item)
        inst.extend([lang_words, gloss_words])

        # Quickly set the alignment for the gloss words.
        for lang_word, gloss_word in zip(lang_words, gloss_words):
            gloss_word.alignment = lang_word.id

        if not hindi:
            lang_postags = pos_by_order(gloss_snt, gloss_tokens)
            gloss_postags = lang_postags

        add_pos_tags(inst, lang_words.id, lang_postags, tag_method=INTENT_POS_MANUAL)
        add_pos_tags(inst, gloss_words.id, gloss_postags, tag_method=INTENT_POS_MANUAL)

        create_dt_tier(inst, assemble_ds(gloss_snt, gloss_indices), lang_words, INTENT_DS_MANUAL)
        create_dt_tier(inst, assemble_ds(trans_snt, trans_indices), trans_words, INTENT_DS_MANUAL)

        # -------------------------------------------
        # Now, the word alignments.
        # -------------------------------------------
        a = Alignment()
        for word_alignment in word_alignments:
            a_ref = word_alignment.find('./a.rf').text.split('#')[1]
            b_ref = word_alignment.find('./b.rf').text.split('#')[1]

            a_word = a_snt.getid(a_ref)
            b_word = b_snt.getid(b_ref)

            if a_word is None or b_word is None:
                continue

            if hindi:
                # 1-based position of the word within its sentence.
                a_idx = a_snt.index(a_word) + 1
                b_idx = b_snt.index(b_word) + 1
            else:
                a_idx = a_word.order
                b_idx = b_word.order

            # Pairs are stored as (translation index, language index),
            # whichever document holds the glossed side.
            if a_glossed:
                pair = (b_idx, a_idx)
            else:
                pair = (a_idx, b_idx)
            a.add(pair)

        set_bilingual_alignment(inst, trans(inst), lang(inst), a, INTENT_ALN_MANUAL)
        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, INTENT_ALN_MANUAL)

        xc.append(inst)

    with open(out_path, 'w', encoding='utf-8') as f:
        xigtxml.dump(f, xc)
def test_read_ds_tree(self):
    """
    The dependency structure read from inst1's translation line must be
    structurally equal to the expected tree.
    """
    stored_tree = get_ds(self.inst1, trans(self.inst1))

    expected_tree = DepTree.fromstring(
        """(ROOT[0] (found[2] (Someone[1]) (them[3]) (boring[4])))""",
        stype=DEPSTR_PTB)

    trees_match = expected_tree.structurally_eq(stored_tree)
    self.assertTrue(trees_match)