def raw_txt_to_xc(txt):
    """
    Read raw IGT text, treating blank lines as instance boundaries,
    and build a corpus from the resulting instances.

    :rtype: XigtCorpus
    """
    print("Creating XIGT corpus from raw text...")
    xc = XigtCorpus()

    PARSELOG.debug("Replacing invalid XML...")
    data = replace_invalid_xml(txt)

    # Split the text into instances on blank lines. Guarding on
    # cur_lines avoids emitting empty instances for consecutive
    # blank lines.
    instances = []
    cur_lines = []

    for line in data.split('\n'):
        if not line.strip():
            if cur_lines:
                instances.append('\n'.join(cur_lines))
                cur_lines = []
        else:
            cur_lines.append(line)

    if cur_lines:
        instances.append('\n'.join(cur_lines))

    # Parse each chunk of lines into an instance and add it to the corpus.
    for instance in instances:
        i = raw_txt_to_inst(instance, corpus=xc)
        xc.append(i)

    print("{} instances parsed.".format(len(xc)))
    return xc
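# A minimal usage sketch for raw_txt_to_xc (the file path is
# hypothetical, not part of the module): read a raw text file of IGT
# instances separated by blank lines and build a corpus.
#
#     with open('raw_igt.txt', 'r', encoding='utf-8') as f:
#         xc = raw_txt_to_xc(f.read())
#     print(len(xc))  # number of parsed instances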
def corpus_to_xigt(corp: Corpus):
    """
    Given an INTENT2 Corpus object, return its representation in
    xigtxml format.
    """
    xc = XigtCorpus()
    EXPORT_LOG.info('Preparing to export INTENT2 Corpus to Xigt')
    for inst in corp:
        xigt_inst = instance_to_xigt(inst)
        # Trial-serialize the single instance first; if it fails, log
        # the error and skip it rather than breaking the whole corpus.
        try:
            dumps(XigtCorpus(igts=[xigt_inst]))
            xc.append(xigt_inst)
        except (TypeError, XigtError) as te:
            EXPORT_LOG.error('Error in serializing instance "{}": {}'.format(inst.id, te))
    EXPORT_LOG.info('Corpus successfully converted. Returning string for writing.')
    return dumps(xc)
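# A hedged usage sketch: serialize an INTENT2 corpus and write the
# resulting xigtxml string to disk. `my_corpus` and the output path are
# hypothetical.
#
#     xigt_str = corpus_to_xigt(my_corpus)
#     with open('corpus.xml', 'w', encoding='utf-8') as f:
#         f.write(xigt_str)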
def parse_odin_xc(text, require_trans=True, require_gloss=True,
                  require_lang=True, limit=None):
    """
    Read in an ODIN-style text file to create the Xigt corpus.
    """
    # Initialize the corpus
    xc = XigtCorpus()

    # Replace invalid characters...
    data = replace_invalid_xml(text)

    # Read all the text lines; instances start with a "doc_id=" header
    # and end at a blank line.
    inst_txts = re.findall(r'doc_id=[\s\S]+?\n\n', data)

    #=======================================================================
    # Begin parsing...
    #=======================================================================

    parsed = 0
    PARSELOG.info('Beginning parse')
    for inst_num, inst_txt in enumerate(inst_txts):

        if parsed % 250 == 0:
            PARSELOG.info('Parsing instance %d...' % parsed)

        # Handle the requirement for 1-to-1 gloss/language alignment.
        try:
            i = parse_odin_inst(inst_txt, corpus=xc, idnum=inst_num)
        except GlossLangAlignException:
            PARSELOG.warning('Gloss and language could not be automatically aligned for instance "%s". Skipping.' % gen_item_id('i', inst_num))
            continue

        # Try to get the translation line. ---------------------------------
        try:
            hastrans = trans_lines(i)
        except NoTransLineException as ntle:
            PARSELOG.info(ntle)
            hastrans = False

        # Try to get the gloss line. ---------------------------------------
        try:
            hasgloss = i.gloss
        except NoGlossLineException as ngle:
            PARSELOG.info(ngle)
            hasgloss = False

        # Try to get the language line. ------------------------------------
        try:
            haslang = i.lang
        except NoLangLineException as nlle:
            PARSELOG.info(nlle)
            haslang = False

        parsed += 1

        # A required line type satisfies its constraint only if it is
        # present; an optional one always does.
        trans_constraint = hastrans or not require_trans
        gloss_constraint = hasgloss or not require_gloss
        lang_constraint = haslang or not require_lang

        if trans_constraint and gloss_constraint and lang_constraint:
            xc.append(i)
        else:
            PARSELOG.info('Requirements for instance "%s" were not satisfied. Skipping.' % i.id)

        # If we have reached the limit of instances that have been
        # requested, stop processing.
        if limit is not None and limit == parsed:
            break

    # Return the corpus
    return xc
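# A usage sketch, assuming an ODIN text dump is on disk (the path is
# hypothetical): parse up to 500 instances that have both a gloss and a
# language line, while allowing missing translation lines.
#
#     with open('odin_dump.txt', 'r', encoding='utf-8') as f:
#         xc = parse_odin_xc(f.read(), require_trans=False, limit=500)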
def convert_pml(aln_path, out_path, hindi=True):

    if hindi:
        igt_data = retrieve_hindi()
    else:
        igt_data = retrieve_naacl()

    a_root = load_xml(aln_path)
    doc_a = a_root.find(".//reffile[@name='document_a']").get('href')
    doc_b = a_root.find(".//reffile[@name='document_b']").get('href')

    # The document hrefs are relative to the alignment file.
    doc_a = os.path.join(os.path.dirname(aln_path), doc_a)
    doc_b = os.path.join(os.path.dirname(aln_path), doc_b)

    # Load the sentences for each document.
    a_sents, a_glossed = load_sents(doc_a)
    b_sents, b_glossed = load_sents(doc_b)

    sent_alignments = a_root.findall(".//body/LM")

    assert (a_glossed and not b_glossed) or (b_glossed and not a_glossed), \
        "Only one file should have glosses"

    xc = XigtCorpus()

    for sent_alignment in sent_alignments:

        # Get the sentence id...
        aln_id = sent_alignment.attrib.get('id')
        a_snt_id = re.search('^.+?-(.*)$', aln_id).group(1)
        if a_snt_id not in igt_data:
            continue

        # Get the text and tokens from the NAACL data.
        pre_txt, lang_txt, gloss_txt, trans_txt = igt_data[a_snt_id]
        lang_tokens = lang_txt.split()
        gloss_tokens = gloss_txt.split()
        trans_tokens = trans_txt.split()

        a_snt_ref = sent_alignment.find('./tree_a.rf').text.split('#')[1]
        b_snt_ref = sent_alignment.find('./tree_b.rf').text.split('#')[1]

        word_alignments = sent_alignment.findall('./node_alignments/LM')

        a_snt, a_edges = a_sents[a_snt_ref]
        b_snt, b_edges = b_sents[b_snt_ref]
        assert isinstance(a_snt, Sentence)
        assert isinstance(b_snt, Sentence)

        # -------------------------------------------
        # Skip sentences if they are not found for whatever reason
        # -------------------------------------------
        if not a_snt or not b_snt:
            continue

        # -------------------------------------------
        # Start constructing the IGT Instance.
        # -------------------------------------------
        trans_snt, trans_indices = a_snt, a_edges
        gloss_snt, gloss_indices = b_snt, b_edges
        if a_glossed:
            trans_snt, trans_indices = b_snt, b_edges
            gloss_snt, gloss_indices = a_snt, a_edges

        # Hindi data carries its own tokens, POS tags, and glosses on
        # the Sentence objects, so rebuild the lines from those rather
        # than from the NAACL text.
        if hindi:
            lang_tokens = [w.text for w in gloss_snt]
            lang_postags = [w.pos for w in gloss_snt]
            lang_txt = ' '.join(lang_tokens)

            trans_tokens = [w.text for w in trans_snt]
            trans_postags = [w.pos for w in trans_snt]
            trans_txt = ' '.join(trans_tokens)

            gloss_tokens = [w.gloss if w.gloss else 'NULL' for w in gloss_snt]
            gloss_postags = lang_postags
            gloss_txt = ' '.join(gloss_tokens)

        inst = Igt(id=re.sub('s-', 'igt', a_snt_ref))
        nt = Tier(type=ODIN_TIER_TYPE, id=NORM_ID,
                  attributes={STATE_ATTRIBUTE: NORM_STATE})
        ll = Item(id='n1', attributes={ODIN_TAG_ATTRIBUTE: ODIN_LANG_TAG}, text=lang_txt)
        gl = Item(id='n2', attributes={ODIN_TAG_ATTRIBUTE: ODIN_GLOSS_TAG}, text=gloss_txt)
        tl = Item(id='n3', attributes={ODIN_TAG_ATTRIBUTE: ODIN_TRANS_TAG}, text=trans_txt)
        nt.extend([ll, gl, tl])
        inst.append(nt)

        # -------------------------------------------
        # Handle the phrase tiers
        # -------------------------------------------
        generate_lang_phrase_tier(inst)
        generate_trans_phrase_tier(inst)

        def process_postags(sent, tokens):
            """Look up the POS tag for each token by its 1-based order."""
            postags = []
            for i, token in enumerate(tokens):
                word = sent.getorder(i + 1)
                if word is None:
                    postags.append(None)
                else:
                    postags.append(word.pos)
            return postags

        # -------------------------------------------
        # Now, handle the translation words.
        # -------------------------------------------
        tt = create_word_tier(ODIN_TRANS_TAG, trans_tokens, trans_phrase(inst)[0])
        inst.append(tt)
        if not hindi:
            trans_postags = process_postags(trans_snt, trans_tokens)

        add_pos_tags(inst, tt.id, trans_postags, tag_method=INTENT_POS_MANUAL)

        # -------------------------------------------
        # Handle the words tiers...
        # -------------------------------------------
        wt = create_word_tier(ODIN_LANG_TAG, lang_tokens, lang_phrase(inst)[0])
        gwt = create_word_tier(ODIN_GLOSS_TAG, gloss_tokens, gl)
        inst.extend([wt, gwt])

        # Quickly set the alignment for the gloss words.
        for w, gw in zip(wt, gwt):
            gw.alignment = w.id

        if not hindi:
            lang_postags = process_postags(gloss_snt, gloss_tokens)
            gloss_postags = lang_postags

        add_pos_tags(inst, wt.id, lang_postags, tag_method=INTENT_POS_MANUAL)
        add_pos_tags(inst, gwt.id, gloss_postags, tag_method=INTENT_POS_MANUAL)

        create_dt_tier(inst, assemble_ds(gloss_snt, gloss_indices), wt, INTENT_DS_MANUAL)
        create_dt_tier(inst, assemble_ds(trans_snt, trans_indices), tt, INTENT_DS_MANUAL)

        # -------------------------------------------
        # Now, the word alignments.
        # -------------------------------------------
        a = Alignment()
        for word_alignment in word_alignments:

            a_ref = word_alignment.find('./a.rf').text.split('#')[1]
            b_ref = word_alignment.find('./b.rf').text.split('#')[1]

            a_word = a_snt.getid(a_ref)
            b_word = b_snt.getid(b_ref)

            if a_word is None or b_word is None:
                continue

            if not hindi:
                a_idx = a_word.order
                b_idx = b_word.order
            else:
                a_idx = a_snt.index(a_word) + 1
                b_idx = b_snt.index(b_word) + 1

            # Make sure the indices land on the correct side, depending
            # on which document carries the glosses.
            if a_glossed:
                trans_idx = b_idx
                lang_idx = a_idx
            else:
                trans_idx = a_idx
                lang_idx = b_idx

            a.add((trans_idx, lang_idx))

        set_bilingual_alignment(inst, trans(inst), lang(inst), a, INTENT_ALN_MANUAL)
        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, INTENT_ALN_MANUAL)
        xc.append(inst)

    with open(out_path, 'w', encoding='utf-8') as f:
        xigtxml.dump(f, xc)
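# A usage sketch for convert_pml (both paths are hypothetical): convert
# a PML alignment file and its two referenced documents into a Xigt
# corpus written to disk.
#
#     convert_pml('data/hindi/alignment.pml', 'out/hindi.xml', hindi=True)
#     convert_pml('data/naacl/alignment.pml', 'out/naacl.xml', hindi=False)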