def create_words_tier(cur_item, word_id, word_type, aln_attribute=SEGMENTATION, tokenizer=sentence_tokenizer):
    """
    Create a words tier from an ODIN line type item.

    :param cur_item: Either a phrase item or a line item to tokenize and create words from.
    :type cur_item: RGItem
    :param word_id: The ID for this tier.
    :type word_id: str
    :param word_type: Tier type for this tier.
    :type word_type: str
    :rtype: RGWordTier
    """
    # Guard against the edge case in which the line is defined but empty
    # (e.g. an empty gloss line).
    value = cur_item.value()
    if value is None or not value.strip():
        tokens = []
    else:
        # Tokenize the words in this phrase...
        tokens = tokenize_item(cur_item, tokenizer=tokenizer)

    # The new word tier aligns back to the tier containing cur_item.
    word_tier = Tier(id=word_id, type=word_type,
                     attributes={aln_attribute: cur_item.tier.id},
                     igt=cur_item.igt)

    # Each token becomes a word item whose alignment expression points at
    # the (start, stop) span of the source item it was cut from.
    for token in tokens:
        word = Item(id=gen_item_id(word_tier.id, len(word_tier)),
                    attributes={aln_attribute: create_aln_expr(cur_item.id, token.start, token.stop)},
                    tier=word_tier)
        word_tier.append(word)

    return word_tier
def generate_phrase_tier(inst, tag, id, type) -> Tier:
    """
    Retrieve the phrase tier for the given tag, creating it (with the
    provided id and type) from the normalized lines if it doesn't exist.
    """
    has_tag = lambda x: tag in odin_tags(x)
    phrase_tier = xigt_find(inst, type=type, others=[has_tag])

    if phrase_tier is None:
        normal_tier = generate_normal_tier(inst)

        # Create the phrase tier
        phrase_tier = Tier(id=id, type=type, content=normal_tier.id)

        for normal_line in retrieve_normal_lines(inst, tag):
            # Carry the judgment attribute (if any) from the normalized
            # line over to the new phrase item.
            attrs = {}
            judgment = normal_line.attributes.get(ODIN_JUDGMENT_ATTRIBUTE)
            if judgment is not None:
                attrs[ODIN_JUDGMENT_ATTRIBUTE] = judgment

            # Finally, create the phrase item and add it to the tier.
            phrase_tier.append(Item(id=ask_item_id(phrase_tier),
                                    content=normal_line.id,
                                    attributes=attrs))

        inst.append(phrase_tier)

    return phrase_tier
def test_remove(self):
    tier = Tier(items=[Item(id='i1'), Item(id='i2')])
    assert len(tier) == 2
    # Removing the first item shifts 'i2' to position 0 and drops
    # 'i1' from the id index.
    first = tier[0]
    tier.remove(first)
    assert len(tier) == 1
    assert tier[0].id == 'i2'
    with pytest.raises(KeyError):
        tier['i1']
def words_to_morph_tier(tier, type, id, aln_attribute):
    """
    Tokenize each word in *tier* into morphemes and collect them in a
    new tier that refers back to the word tier.

    :param tier: Word tier to segment.
    :type tier: Tier
    :param type: Type for the new morpheme tier.
    :param id: ID for the new morpheme tier.
    :param aln_attribute: Name of the alignment attribute to use.
    """
    morph_tier = Tier(id=id, attributes={aln_attribute: tier.id}, type=type)

    # Go through each word...
    for word in tier:
        tokens = tokenize_item(word, morpheme_tokenizer)
        for token in tokens:
            # A single-morpheme word can reference the word id directly;
            # otherwise use an indexed span expression.
            if len(tokens) == 1:
                aln = word.id
            else:
                aln = create_aln_expr(word.id, token.start, token.stop)
            morph_tier.append(Item(id=gen_item_id(morph_tier.id, len(morph_tier)),
                                   attributes={aln_attribute: aln}))

    return morph_tier
def test_get_attribute(self):
    tier = Tier(id='t', attributes={'one': 1, 'two': 2})
    parent = Igt(tiers=[tier], attributes={'three': 3})
    # Attributes on the tier itself resolve directly.
    assert tier.get_attribute('one') == 1
    assert tier.get_attribute('two') == 2
    # Parent attributes are only visible with inherit=True.
    assert tier.get_attribute('three') is None
    assert tier.get_attribute('three', inherit=True) == 3
    assert tier.get_attribute('three', default=4) == 4
def test_get_attribute(self):
    tier = Tier(id='t', attributes={'one': 1, 'two': 2})
    parent = Igt(tiers=[tier], attributes={'three': 3})
    # Attributes on the tier itself resolve directly.
    self.assertEqual(tier.get_attribute('one'), 1)
    self.assertEqual(tier.get_attribute('two'), 2)
    # Parent attributes are only visible with inherit=True.
    self.assertIs(tier.get_attribute('three'), None)
    self.assertEqual(tier.get_attribute('three', inherit=True), 3)
    self.assertEqual(tier.get_attribute('three', default=4), 4)
def setUp(self):
    # Two shared fixtures: t1 is a bare tier, t2 is fully populated.
    self.t1 = Tier()
    meta = Metadata(type='meta', metas=[Meta(text='meta')])
    self.t2 = Tier(
        id='t',
        type='basic',
        attributes={'attr': 'val'},
        metadata=[meta],
        items=[Item(id='t1'), Item(id='t2')],
    )
def create_word_tier(tag, words, src_item=None):
    """
    Create a word tier for the given ODIN tag from a list of word strings.

    If *src_item* is given, each word is encoded as an alignment
    expression (a span) into that item's text; otherwise each word is
    stored as literal item text.

    :param tag: One of ODIN_TRANS_TAG, ODIN_GLOSS_TAG, ODIN_LANG_TAG.
    :type words: list[str]
    :type src_item: xigt.model.Item
    :raises ValueError: if *tag* is not one of the recognized ODIN tags.
    """
    if tag == ODIN_TRANS_TAG:
        wt = Tier(id=TRANS_WORD_ID, type=TRANS_WORD_TYPE, segmentation=TRANS_PHRASE_ID)
        aln_attr = SEGMENTATION
    elif tag == ODIN_GLOSS_TAG:
        wt = Tier(id=GLOSS_WORD_ID, type=GLOSS_WORD_TYPE, content=NORM_ID, alignment=LANG_WORD_ID)
        aln_attr = CONTENT
    elif tag == ODIN_LANG_TAG:
        wt = Tier(id=LANG_WORD_ID, type=LANG_WORD_TYPE, segmentation=LANG_PHRASE_ID)
        aln_attr = SEGMENTATION
    else:
        # BUGFIX: an unrecognized tag previously fell through and raised
        # a confusing NameError on the unbound 'wt' below; fail fast
        # with a clear message instead.
        raise ValueError('Unrecognized ODIN tag: {!r}'.format(tag))

    # When segmenting a source item, walk a sliding window over its text
    # (like a "pop()" of the consumed prefix) so repeated words index
    # successive occurrences; 'offset' converts window-relative indices
    # back to absolute positions in the original text.
    src_text = None
    offset = 0
    if src_item:
        src_text = src_item.value()

    for w in words:
        if src_item:
            # Index the word against the source line rather than storing
            # it as literal text.
            start = src_text.index(w)
            stop = start + len(w)
            src_text = src_text[stop:]
            cur_range = (start + offset, stop + offset)
            offset += stop
            i = Item(id=ask_item_id(wt),
                     attributes={aln_attr: create_aln_expr(src_item.id, *cur_range)})
        else:
            i = Item(id=ask_item_id(wt), text=w)
        wt.append(i)

    if tag == ODIN_GLOSS_TAG:
        add_word_level_info(wt, INTENT_GLOSS_WORD)

    return wt
def test_append(self):
    tier = Tier()
    # Only Items may be appended; every other model object is rejected.
    for bad in (Tier(), Igt(), XigtCorpus(), Metadata(), Meta()):
        self.assertRaises(XigtStructureError, tier.append, bad)
    self.assertEqual(len(tier), 0)
    tier.append(Item(id='t1'))
    self.assertEqual(len(tier), 1)
    # Duplicate ids are rejected.
    self.assertRaises(XigtError, tier.append, Item(id='t1'))
    tier.append(Item(id='t2'))
    self.assertEqual(len(tier), 2)
    self.assertEqual(tier[0].id, 't1')
    self.assertEqual(tier[1].id, 't2')
def test_insert(self):
    tier = Tier()
    assert len(tier) == 0
    tier.insert(0, Item(id='t1'))
    assert len(tier) == 1
    # Duplicate ids are rejected even on insert.
    with pytest.raises(XigtError):
        tier.insert(0, Item(id='t1'))
    tier.insert(0, Item(id='t2'))
    # An out-of-range index clamps to the end, like list.insert().
    tier.insert(100, Item(id='t3'))
    assert len(tier) == 3
    assert [item.id for item in tier] == ['t2', 't1', 't3']
def create_text_tier_from_lines(inst, lines, id_base, state):
    """
    Build an ODIN text tier from a list of line dicts.

    Each line must be a dict with at least the keys 'text' and 'tag';
    'labels', 'judgment', and 'lineno' are optional.

    :type lines: list[dict]
    """
    # 1) The parent tier that will hold the line items.
    tier = Tier(id=gen_tier_id(inst, id_base),
                type=ODIN_TYPE,
                attributes={STATE_ATTRIBUTE: state})

    # 2) One item per line.
    for line in lines:
        if not hasattr(line, 'get') or 'text' not in line or 'tag' not in line:
            raise Exception("When constructing tier from lines, must be a list of dicts with keys 'text' and 'tag'.")

        # The tag string is the primary tag plus any secondary labels,
        # joined with '+'.
        tag_parts = []
        if line.get('tag') is not None:
            tag_parts.append(line.get('tag'))
        labels = line.get('labels')
        if labels is not None and labels:
            tag_parts.append(labels)

        attrs = {ODIN_TAG_ATTRIBUTE: '+'.join(tag_parts)}
        if line.get('judgment') is not None:
            attrs[ODIN_JUDGMENT_ATTRIBUTE] = line['judgment']
        # Record the original line number when present.
        if line.get('lineno'):
            attrs['line'] = line.get('lineno', '')

        tier.append(Item(id=gen_item_id(tier.id, len(tier)),
                         attributes=attrs,
                         text=line.get('text')))

    return tier
def test_clear(self):
    tier = Tier()
    tier.extend([Item(id='t1'), Item(id='t2'), Item(id='t3')])
    assert len(tier) == 3
    tier.clear()
    # Both positional and id-based lookups come up empty afterwards.
    assert len(tier) == 0
    assert tier.get(0) is None
    assert tier.get('t1') is None
def test_clear(self):
    tier = Tier()
    tier.extend([Item(id='t1'), Item(id='t2'), Item(id='t3')])
    self.assertEqual(len(tier), 3)
    tier.clear()
    # Both positional and id-based lookups come up empty afterwards.
    self.assertEqual(len(tier), 0)
    self.assertIs(tier.get(0), None)
    self.assertIs(tier.get('t1'), None)
def make_igt_raw_tier(block, options):
    """Build the raw ODIN tier ('r') for one IGT block of line dicts."""
    items = []
    for idx, linedata in enumerate(block.get('lines', []), start=1):
        # Scrub characters that are illegal in XML before storing.
        text = replace_invalid_xml_chars(linedata.get('content', ''),
                                         options['replacement_char'])
        # Everything except the content becomes item attributes.
        attrs = linedata.copy()
        del attrs['content']
        items.append(Item(id='r{}'.format(idx), attributes=attrs, text=text))
    return Tier(id='r', type='odin', attributes={'state': 'raw'}, items=items)
def make_igt_raw_tier(block, options):
    """Build the raw ODIN tier ('r') from (attributes, text) line pairs."""
    items = [
        Item(id='r{}'.format(n), attributes=attrs, text=text)
        for n, (attrs, text) in enumerate(block.get('lines', []), start=1)
    ]
    return Tier(id='r', type='odin', attributes={'state': 'raw'}, items=items)
def add_normalized_tier(igt, options):
    """Append a normalized ODIN tier ('n') derived from the cleaned
    tier when present, otherwise from the raw tier."""
    source = igt.get('c', default=igt['r'])
    igt.append(Tier(
        id='n',
        type='odin',
        alignment=source.id,
        attributes={'state': 'normalized'},
        items=normalize_items(source.items),
    ))
def make_phrase_tier(tier_id, aln_tokens):
    """Create a 'phrases' tier whose single item joins every token from
    *aln_tokens* (pairs whose second element is a token sequence)."""
    tokens = [tok for aln in aln_tokens for tok in aln[1]]
    phrase = Item(id='{}1'.format(tier_id), text=' '.join(tokens))
    return Tier(id=tier_id, type='phrases', items=[phrase])
def add_cleaned_tier(igt, options):
    """Append a cleaned ODIN tier ('c') derived from the raw tier."""
    raw = igt['r']
    igt.append(Tier(
        id='c',
        type='odin',
        alignment=raw.id,
        attributes={'state': 'cleaned'},
        items=clean_items(raw.items),
    ))
def test_resolve_ref(self):
    """Walk resolve_ref() through each failure mode in turn, then the
    success cases; each step adds the piece of structure the previous
    assertion showed was missing."""
    # item has no reference attribute
    b1 = Item(id='b1')
    self.assertRaises(KeyError, b1.resolve_ref, 'alignment')
    # has a reference attribute, but is not contained by a tier
    b1.alignment = 'a1'
    self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
    # item in tier, but tier has no reference attribute
    t_b = Tier(id='b', items=[b1])
    self.assertRaises(KeyError, b1.resolve_ref, 'alignment')
    # tier has reference attribute, but is not contained by an Igt
    t_b.alignment = 'a'
    self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
    # item in IGT, but referred tier doesn't exist
    igt = Igt(tiers=[t_b])
    self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
    # referred tier exists, but has no item referred by item's alignment
    t_a = Tier(id='a')
    igt.append(t_a)
    self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
    # referred item exists, but has no value (which resolves to '')
    a1 = Item(id='a1')
    t_a.append(a1)
    self.assertEqual(b1.resolve_ref('alignment'), '')
    # referred item has a value
    a1.text = 'text'
    self.assertEqual(b1.resolve_ref('alignment'), 'text')
    # stored item tests (fixtures built in setUp)
    self.assertRaises(KeyError, self.i1.resolve_ref, 'alignment')
    self.assertRaises(KeyError, self.i2.resolve_ref, 'alignment')
    self.assertEqual(self.i_ac.resolve_ref('alignment'), 'text')
    self.assertEqual(self.i_ac.resolve_ref('content'), 'te')
    self.assertEqual(self.i_s.resolve_ref('segmentation'), 'xt')
    self.assertEqual(self.i_t.resolve_ref('content'), 'text')
def default_decode_tier(elem):
    """Decode a <tier> XML element into a Tier object, clearing the
    element afterwards to release the parsed XML."""
    ns, tag = _qname_split(elem.tag)
    assert tag == 'tier'
    decoded = Tier(
        id=elem.get('id'),
        type=elem.get('type'),
        attributes=get_attributes(elem, ignore=('id', 'type')),
        metadata=[decode_metadata(md) for md in elem.findall('metadata')],
        items=[decode_item(item) for item in elem.findall('item')],
        namespace=ns,
        nsmap=elem.attrib.nsmap,
    )
    # Free the element's children once everything has been extracted.
    elem.clear()
    return decoded
def test_extend(self):
    tier = Tier()
    self.assertEqual(len(tier), 0)
    tier.extend([Item(id='t1')])
    self.assertEqual(len(tier), 1)
    # Extending with an empty iterable is a no-op.
    tier.extend([])
    self.assertEqual(len(tier), 1)
    tier.extend([Item(id='t2'), Item(id='t3')])
    self.assertEqual(len(tier), 3)
    for pos, expected in enumerate(['t1', 't2', 't3']):
        self.assertEqual(tier[pos].id, expected)
def test_insert(self):
    tier = Tier()
    self.assertEqual(len(tier), 0)
    tier.insert(0, Item(id='t1'))
    self.assertEqual(len(tier), 1)
    # Duplicate ids are rejected even on insert.
    self.assertRaises(XigtError, tier.insert, 0, Item(id='t1'))
    tier.insert(0, Item(id='t2'))
    # An out-of-range index clamps to the end, like list.insert().
    tier.insert(100, Item(id='t3'))
    self.assertEqual(len(tier), 3)
    for pos, expected in enumerate(['t2', 't1', 't3']):
        self.assertEqual(tier[pos].id, expected)
def test_extend(self):
    tier = Tier()
    assert len(tier) == 0
    tier.extend([Item(id='t1')])
    assert len(tier) == 1
    # Extending with an empty iterable is a no-op.
    tier.extend([])
    assert len(tier) == 1
    tier.extend([Item(id='t2'), Item(id='t3')])
    assert len(tier) == 3
    assert [item.id for item in tier] == ['t1', 't2', 't3']
def test_get(self):
    tier = Tier(items=[Item(id='a1', content='1'), Item(id='a2', content='2')])
    # lookup by id (dictionary-style key)
    self.assertEqual(tier.get('a1').content, '1')
    self.assertEqual(tier.get('a2').content, '2')
    # lookup by position (list-style index)
    self.assertEqual(tier.get(0).content, '1')
    self.assertEqual(tier.get(1).content, '2')
    # missing entries fall back to the default value
    self.assertEqual(tier.get('a3'), None)
    self.assertEqual(tier.get('a3', 'z'), 'z')
    self.assertEqual(tier.get(2), None)
    self.assertEqual(tier.get(2, 'z'), 'z')
def add_normalized_tier(igt, base_tier):
    """Append a normalized ODIN tier to *igt*, using the first free
    conventional ID; log a warning and do nothing if all are taken."""
    candidates = ('n', 'on', 'normalized', 'odin-normalized')
    norm_id = next((c for c in candidates if igt.get(c) is None), None)

    if norm_id is None:
        logging.warning('No preset ID for normalized tier was available '
                        'for IGT with id: {}'.format(str(igt.id)))
        return

    igt.append(Tier(id=norm_id,
                    type='odin',
                    alignment=base_tier.id,
                    attributes={'state': 'normalized'},
                    items=normalize_items(base_tier, norm_id)))
def add_cleaned_tier(igt, raw_tier):
    """Append a cleaned ODIN tier to *igt*, using the first free
    conventional ID; log a warning and do nothing if all are taken."""
    candidates = ('c', 'oc', 'cleaned', 'odin-cleaned')
    clean_id = next((c for c in candidates if igt.get(c) is None), None)

    if clean_id is None:
        logging.warning(
            'No preset ID for cleaned tier was available for IGT with id: {}'.
            format(str(igt.id)))
        return

    igt.append(Tier(id=clean_id,
                    type='odin',
                    alignment=raw_tier.id,
                    attributes={'state': 'cleaned'},
                    items=clean_items(raw_tier, clean_id)))
def make_tier(tier_type, tier_id, aligned_tokens, algn_tier):
    """Build a tier of token items, optionally aligned item-by-item to
    *algn_tier*; a [(None, None)] sentinel yields an empty tier."""
    attrs = OrderedDict()
    items = []
    counter = 1  # item indices start at 1

    if aligned_tokens == [(None, None)]:
        pass  # sentinel: no tokens at all
    elif algn_tier is not None:
        attrs['alignment'] = algn_tier.id
        for tgt_item, (tgt_tok, src_toks) in zip_longest(algn_tier.items, aligned_tokens):
            assert tgt_tok == tgt_item.text  # FIXME is this necessary?
            for tok in src_toks:
                items.append(Item(id='{}{}'.format(tier_id, counter),
                                  text=tok,
                                  attributes={'alignment': tgt_item.id}))
                counter += 1
    else:
        for _, src_toks in aligned_tokens:
            for tok in src_toks:
                items.append(Item(id='{}{}'.format(tier_id, counter), text=tok))
                counter += 1

    return Tier(id=tier_id, type=tier_type, items=items, attributes=attrs)
def make_tier(tier_type, tier_id, refattr, aln_tokens, algn_tier):
    """Build a tier of token items, linking each to its target item in
    *algn_tier* via *refattr*; a [(None, None)] sentinel yields an
    empty tier."""
    attrs = OrderedDict()
    items = []
    counter = 1  # item indices start at 1

    if aln_tokens == [(None, None)]:
        pass  # sentinel: no tokens at all
    elif refattr is not None and algn_tier is not None:
        attrs[refattr] = algn_tier.id
        for tgt_item, (tgt_tok, src_toks) in zip_longest(algn_tier.items, aln_tokens):
            for tok in src_toks:
                items.append(Item(id='{}{}'.format(tier_id, counter),
                                  text=tok,
                                  attributes={refattr: tgt_item.id}))
                counter += 1
    else:
        for _, src_toks in aln_tokens:
            for tok in src_toks:
                items.append(Item(id='{}{}'.format(tier_id, counter), text=tok))
                counter += 1

    return Tier(id=tier_id, type=tier_type, items=items, attributes=attrs)
def generate_clean_tier(inst, merge=False, generate=True, force_generate=False):
    """
    If the clean odin tier exists, return it. Otherwise, create it.

    :param inst: IGT instance to search (and possibly modify).
    :param merge: not referenced in this body — TODO confirm whether it
        is still needed by callers.
    :param generate: create the tier if it is missing.
    :param force_generate: discard any existing clean tier and rebuild it.
    """
    # -------------------------------------------
    # Search for the clean tier
    # -------------------------------------------
    clean_tier = xigt_find(inst, type=ODIN_TIER_TYPE,
                           attributes={STATE_ATTRIBUTE:CLEAN_STATE})

    # Remove the clean tier if we are regenerating.
    if clean_tier is not None and force_generate:
        inst.remove(clean_tier)

    # -------------------------------------------
    # If we want to force regenerate the tier, or
    # it is not found and we want to generate it
    # freshly.
    # -------------------------------------------
    if force_generate or ((clean_tier is None) and generate):
        # Otherwise, we will make our own:
        raw_tier = get_raw_tier(inst)

        # Initialize the clean tier, aligned back to the raw tier...
        clean_tier = Tier(id = CLEAN_ID, type=ODIN_TIER_TYPE,
                          attributes={STATE_ATTRIBUTE:CLEAN_STATE,
                                      ALIGNMENT:raw_tier.id})

        # Gather the different tags used in this tier, grouping raw lines
        # by their primary tag.
        # Note that we don't want to discard non-L,G,T tiers yet.
        line_tags = DefaultOrderedDict(list)
        for l in raw_tier:
            tags = l.attributes['tag'].split('+')
            primary = tags[0]
            others = tags[1:]  # NOTE(review): collected but never used below
            line_tags[primary].append(l)

        # Now, the line_tags should be indexed by the primary
        # tag (L, G, T, etc...) with the +'s after it...

        # Now, go through and merge if needed.
        for primary_tag in line_tags.keys():
            lines = line_tags[primary_tag]

            # If there is only one line for the given tag,
            # simply return the first line.
            if len(lines) == 1:
                text = lines[0].value()
                new_tag = lines[0].attributes[ODIN_TAG_ATTRIBUTE]
                align_id = lines[0].id
                item_judgment = lines[0].attributes.get(ODIN_JUDGMENT_ATTRIBUTE)

            # If there are multiple lines for a given tag,
            # concatenate them to a single line.
            elif len(lines) > 1:
                TIER_LOG.info('Corruption detected in instance %s: %s' % (inst.id, [l.attributes['tag'] for l in lines]))
                for l in lines:
                    TIER_LOG.debug('BEFORE: %s' % l)

                # The new text should be the concatenation of the multiple lines...
                text = concat_lines([l.value() for l in lines if l.value() is not None])
                TIER_LOG.debug('AFTER: %s' % text)
                new_tag = primary_tag
                align_id = ','.join([l.id for l in lines])

                # Keep the first judgment found among the merged lines.
                item_judgment = None
                for l in lines:
                    j = l.attributes.get(ODIN_JUDGMENT_ATTRIBUTE)
                    if j is not None:
                        item_judgment = j
                        break

            # Set up the attributes for the new line
            item_attributes = {ODIN_TAG_ATTRIBUTE: new_tag}

            # If we have a judgment, add it to the attributes.
            # Otherwise, don't add it.
            if item_judgment is not None:
                item_attributes[ODIN_JUDGMENT_ATTRIBUTE] = item_judgment

            item = Item(id=ask_item_id(clean_tier),
                        alignment=align_id,
                        text=text,
                        attributes=item_attributes)
            # NOTE(review): other tiers in this codebase use .append();
            # confirm that this Tier subclass really provides .add().
            clean_tier.add(item)

        inst.append(clean_tier)
        return clean_tier

    # -------------------------------------------
    # Finally, if the tier exists
    # -------------------------------------------
    elif clean_tier is not None:
        return clean_tier

    # -------------------------------------------
    # Otherwise, just return None
    # -------------------------------------------
    else:
        return None
class TestTier(unittest.TestCase):
    """Unit tests for the Tier model class."""

    def setUp(self):
        # t1: bare tier; t2: fully-populated tier shared by the tests.
        self.t1 = Tier()
        self.t2 = Tier(
            id='t',
            type='basic',
            attributes={'attr':'val'},
            metadata=[Metadata(type='meta', metas=[Meta(text='meta')])],
            items=[Item(id='t1'), Item(id='t2')]
        )

    def test_init(self):
        self.assertRaises(ValueError, Tier, id='1')  # invalid id
        # don't allow multiple items with the same ID
        self.assertRaises(XigtError, Tier,
                          items=[Item(id='i1'), Item(id='i1')])

    def test_id(self):
        self.assertIs(self.t1.id, None)
        self.assertEqual(self.t2.id, 't')

    def test_type(self):
        self.assertIs(self.t1.type, None)
        self.assertEqual(self.t2.type, 'basic')

    def test_items(self):
        self.assertEqual(len(self.t1._list), 0)
        self.assertEqual(self.t1.items, [])
        self.assertEqual(len(self.t2.items), 2)
        # contained Items should now have their tier specified
        for i in self.t2.items:
            self.assertIs(i.tier, self.t2)

    def test_parents(self):
        # Tiers start unattached to any Igt or corpus.
        self.assertIs(self.t1.igt, None)
        self.assertIs(self.t1.corpus, None)
        self.assertIs(self.t2.igt, None)
        self.assertIs(self.t2.corpus, None)

    def test_metadata(self):
        self.assertEqual(len(self.t1.metadata), 0)
        self.assertEqual(self.t2.metadata[0].type, 'meta')
        self.assertEqual(len(self.t2.metadata[0].metas), 1)
        self.assertEqual(self.t2.metadata[0][0].text, 'meta')

    def test_attributes(self):
        self.assertEqual(self.t1.attributes, dict())
        self.assertEqual(self.t2.attributes, {'attr':'val'})

    def test_reference_attributes(self):
        # segmentation cannot co-occur with alignment or content
        self.assertRaises(XigtError, Tier, alignment='a1', segmentation='b1')
        self.assertRaises(XigtError, Tier, content='a1', segmentation='b1')
        # unset reference attributes default to None
        self.assertIs(self.t1.alignment, None)
        self.assertIs(self.t1.content, None)
        self.assertIs(self.t1.segmentation, None)
        self.assertIs(self.t2.alignment, None)
        self.assertIs(self.t2.content, None)
        self.assertIs(self.t2.segmentation, None)

    def test_get(self):
        # get() accepts either a positional index or an item id, with an
        # optional default for misses.
        self.assertIs(self.t1.get(0), None)
        self.assertIs(self.t1.get('t'), None)
        self.assertEqual(self.t1.get('t', default=1), 1)
        self.assertEqual(self.t2.get(0).id, 't1')
        self.assertIs(self.t2.get(2), None)
        self.assertEqual(self.t2.get('t1').id, 't1')
        self.assertEqual(
            self.t2.get('t1', default=Item(id='x')).id, 't1'
        )

    def test_append(self):
        t = Tier()
        # only Items may be appended; other model objects are rejected
        self.assertRaises(XigtStructureError, t.append, Tier())
        self.assertRaises(XigtStructureError, t.append, Igt())
        self.assertRaises(XigtStructureError, t.append, XigtCorpus())
        self.assertRaises(XigtStructureError, t.append, Metadata())
        self.assertRaises(XigtStructureError, t.append, Meta())
        self.assertEqual(len(t), 0)
        t.append(Item(id='t1'))
        self.assertEqual(len(t), 1)
        # duplicate ids are rejected
        self.assertRaises(XigtError, t.append, Item(id='t1'))
        t.append(Item(id='t2'))
        self.assertEqual(len(t), 2)
        self.assertEqual(t[0].id, 't1')
        self.assertEqual(t[1].id, 't2')

    def test_insert(self):
        t = Tier()
        self.assertEqual(len(t), 0)
        t.insert(0, Item(id='t1'))
        self.assertEqual(len(t), 1)
        # duplicate ids are rejected even on insert
        self.assertRaises(XigtError, t.insert, 0, Item(id='t1'))
        t.insert(0, Item(id='t2'))
        # an out-of-range index clamps to the end, like list.insert()
        t.insert(100, Item(id='t3'))
        self.assertEqual(len(t), 3)
        self.assertEqual(t[0].id, 't2')
        self.assertEqual(t[1].id, 't1')
        self.assertEqual(t[2].id, 't3')

    def test_extend(self):
        t = Tier()
        self.assertEqual(len(t), 0)
        t.extend([Item(id='t1')])
        self.assertEqual(len(t), 1)
        # extending with an empty iterable is a no-op
        t.extend([])
        self.assertEqual(len(t), 1)
        t.extend([Item(id='t2'), Item(id='t3')])
        self.assertEqual(len(t), 3)
        self.assertEqual(t[0].id, 't1')
        self.assertEqual(t[1].id, 't2')
        self.assertEqual(t[2].id, 't3')

    def test_clear(self):
        t = Tier()
        t.extend([Item(id='t1'), Item(id='t2'), Item(id='t3')])
        self.assertEqual(len(t), 3)
        t.clear()
        # both positional and id-based lookups come up empty afterwards
        self.assertEqual(len(t), 0)
        self.assertIs(t.get(0), None)
        self.assertIs(t.get('t1'), None)

    def test_get_attribute(self):
        t = Tier(id='t', attributes={'one': 1, 'two': 2})
        igt = Igt(tiers=[t], attributes={'three': 3})
        # own attributes resolve directly
        self.assertEqual(t.get_attribute('one'), 1)
        self.assertEqual(t.get_attribute('two'), 2)
        # parent attributes are only visible with inherit=True
        self.assertIs(t.get_attribute('three'), None)
        self.assertEqual(t.get_attribute('three', inherit=True), 3)
        self.assertEqual(t.get_attribute('three', default=4), 4)
def convert_pml(aln_path, out_path, hindi=True):
    """
    Convert a PML alignment file (plus its two referenced documents)
    into a XIGT corpus and dump it as XIGT-XML to *out_path*.

    :param aln_path: Path to the PML alignment file.
    :param out_path: Path to write the resulting XIGT-XML file.
    :param hindi: When True, take tokens/POS/glosses from the parsed
        sentences (Hindi data); otherwise use the NAACL IGT data.
    """
    if hindi:
        igt_data = retrieve_hindi()
    else:
        igt_data = retrieve_naacl()

    a_root = load_xml(aln_path)
    # The alignment file refers to its two documents by relative href.
    doc_a = a_root.find(".//reffile[@name='document_a']").get('href')
    doc_b = a_root.find(".//reffile[@name='document_b']").get('href')
    doc_a = os.path.join(os.path.join(os.path.dirname(aln_path), doc_a))
    doc_b = os.path.join(os.path.join(os.path.dirname(aln_path), doc_b))

    # Load the sentences for each document.
    a_sents, a_glossed = load_sents(doc_a)
    b_sents, b_glossed = load_sents(doc_b)

    sent_alignments = a_root.findall(".//body/LM")

    assert (a_glossed and not b_glossed) or (b_glossed and not a_glossed), "Only one file should have glosses"

    xc = XigtCorpus()

    for sent_alignment in sent_alignments:
        # Get the sentence id...
        aln_id = sent_alignment.attrib.get('id')
        a_snt_id = re.search('^.+?-(.*)$', aln_id).group(1)
        if a_snt_id not in igt_data:
            continue

        # Get the text and tokens from the naacl data.
        pre_txt, lang_txt, gloss_txt, trans_txt = igt_data[a_snt_id]
        lang_tokens = lang_txt.split()
        gloss_tokens = gloss_txt.split()
        trans_tokens = trans_txt.split()

        # Sentence references are of the form "file#id".
        a_snt_ref = sent_alignment.find('./tree_a.rf').text.split('#')[1]
        b_snt_ref = sent_alignment.find('./tree_b.rf').text.split('#')[1]

        word_alignments = sent_alignment.findall('./node_alignments/LM')

        a_snt, a_edges = a_sents[a_snt_ref]
        b_snt, b_edges = b_sents[b_snt_ref]
        assert isinstance(a_snt, Sentence)
        assert isinstance(b_snt, Sentence)

        # -------------------------------------------
        # Skip sentences if they are not found for whatever reason
        # -------------------------------------------
        if not a_snt or not b_snt:
            continue

        # -------------------------------------------
        # Start constructing the IGT Instance.
        # -------------------------------------------
        # Whichever document carries the glosses is the gloss side;
        # the other is the translation side.
        trans_snt, trans_indices = a_snt, a_edges
        gloss_snt, gloss_indices = b_snt, b_edges
        if a_glossed:
            trans_snt, trans_indices = b_snt, b_edges
            gloss_snt, gloss_indices = a_snt, a_edges

        # Hindi stuff: rebuild tokens/POS/glosses from the parsed
        # sentences instead of the NAACL text.
        if hindi:
            lang_tokens = [w.text for w in gloss_snt]
            lang_postags = [w.pos for w in gloss_snt]
            lang_txt = ' '.join(lang_tokens)

            trans_tokens = [w.text for w in trans_snt]
            trans_postags = [w.pos for w in trans_snt]
            trans_txt = ' '.join(trans_tokens)

            gloss_tokens = [w.gloss if w.gloss else 'NULL' for w in gloss_snt]
            gloss_postags = lang_postags
            gloss_txt = ' '.join(gloss_tokens)

        inst = Igt(id=re.sub('s-', 'igt', a_snt_ref))

        # Build the normalized ODIN tier with the L/G/T lines.
        nt = Tier(type=ODIN_TIER_TYPE, id=NORM_ID,
                  attributes={STATE_ATTRIBUTE:NORM_STATE})
        ll = Item(id='n1', attributes={ODIN_TAG_ATTRIBUTE:ODIN_LANG_TAG}, text=lang_txt)
        gl = Item(id='n2', attributes={ODIN_TAG_ATTRIBUTE:ODIN_GLOSS_TAG}, text=gloss_txt)
        tl = Item(id='n3', attributes={ODIN_TAG_ATTRIBUTE:ODIN_TRANS_TAG}, text=trans_txt)
        nt.extend([ll,gl,tl])
        inst.append(nt)

        # -------------------------------------------
        # Handle the phrase tiers
        # -------------------------------------------
        generate_lang_phrase_tier(inst)
        generate_trans_phrase_tier(inst)

        def process_postags(sent, tokens):
            # Map each token position to its word's POS tag (None when
            # the sentence has no word at that position).
            postags = []
            for i, token in enumerate(tokens):
                word = sent.getorder(i+1)
                if word is None:
                    postags.append(None)
                else:
                    postags.append(word.pos)
            return postags

        # -------------------------------------------
        # Now, handle the translation words.
        # -------------------------------------------
        tt = create_word_tier(ODIN_TRANS_TAG, trans_tokens, trans_phrase(inst)[0])
        inst.append(tt)

        # In the non-Hindi case the POS tags still need to be computed;
        # the Hindi branch above already set trans_postags.
        if not hindi:
            trans_postags = process_postags(trans_snt, trans_tokens)
        add_pos_tags(inst, tt.id, trans_postags, tag_method=INTENT_POS_MANUAL)

        # -------------------------------------------
        # Handle the words tiers...
        # -------------------------------------------
        wt = create_word_tier(ODIN_LANG_TAG, lang_tokens, lang_phrase(inst)[0])
        gwt= create_word_tier(ODIN_GLOSS_TAG, gloss_tokens, gl)
        inst.extend([wt, gwt])

        # Quickly set the alignment for the gloss words.
        for w, gw in zip(wt, gwt):
            gw.alignment = w.id

        # As above: Hindi POS tags were computed in the hindi branch.
        if not hindi:
            lang_postags = process_postags(gloss_snt, gloss_tokens)
            gloss_postags = lang_postags
        add_pos_tags(inst, wt.id, lang_postags, tag_method=INTENT_POS_MANUAL)
        add_pos_tags(inst, gwt.id, gloss_postags, tag_method=INTENT_POS_MANUAL)

        # Dependency-structure tiers for the gloss and translation sides.
        create_dt_tier(inst, assemble_ds(gloss_snt, gloss_indices), wt, INTENT_DS_MANUAL)
        create_dt_tier(inst, assemble_ds(trans_snt, trans_indices), tt, INTENT_DS_MANUAL)

        # -------------------------------------------
        # Now, the word alignments.
        # -------------------------------------------
        a = Alignment()
        for word_alignment in word_alignments:
            a_ref = word_alignment.find('./a.rf').text.split('#')[1]
            b_ref = word_alignment.find('./b.rf').text.split('#')[1]

            a_word = a_snt.getid(a_ref)
            b_word = b_snt.getid(b_ref)

            if a_word is None or b_word is None:
                continue

            # Hindi sentences lack explicit word order; fall back to the
            # word's list position (1-based).
            if not hindi:
                a_idx = a_word.order
                b_idx = b_word.order
            else:
                a_idx = a_snt.index(a_word)+1
                b_idx = b_snt.index(b_word)+1

            # Make sure the gloss is in the
            # (trans, lang) orientation expected by Alignment.
            if a_glossed:
                trans_idx = b_idx
                lang_idx = a_idx
            else:
                trans_idx = a_idx
                lang_idx = b_idx

            a.add((trans_idx, lang_idx))

        set_bilingual_alignment(inst, trans(inst), lang(inst), a, INTENT_ALN_MANUAL)
        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, INTENT_ALN_MANUAL)

        xc.append(inst)

    with open(out_path, 'w', encoding='utf-8') as f:
        xigtxml.dump(f, xc)
def naacl_to_xigt(naacl_path):
    """
    Convert the NAACL format to XIGT.

    :param naacl_path: Path to a NAACL-format text file.
    :return: A XigtCorpus holding the converted instances.
    """
    # FIX: read via a context manager so the file handle is closed.
    with open(naacl_path, 'r') as f:
        content = f.read()

    # First, collect all the instances.
    # (Raw strings avoid invalid-escape warnings for \s / \S.)
    instances = re.findall(r'Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        # id = re.search('Igt_id=([\S]+)', instance_txt).group(1)
        inst = Igt(id='i{}'.format(len(xc)))
        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Now, create the raw tier...
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin',
                        attributes={STATE_ATTRIBUTE: RAW_STATE})
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=lang_raw,
                             attributes={ODIN_TAG_ATTRIBUTE: ODIN_LANG_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=gloss_raw,
                             attributes={ODIN_TAG_ATTRIBUTE: ODIN_GLOSS_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=trans_raw,
                             attributes={ODIN_TAG_ATTRIBUTE: ODIN_TRANS_TAG}))
        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Lang Dependency representation handling...
        # Parsing is best-effort: a malformed block skips this tier
        # rather than aborting the whole conversion.
        lang_ds_str = re.search(r'Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]

        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError):
            pass

        # Eng DS handling...
        eng_ds_str = re.search(r'Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]

        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError, ValueError):
            pass

        # Add Alignment...
        biling_aln_str = re.search(r'Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        # A leading space on the raw line means its indices are 1 off.
        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        a = Alignment()
        try:
            for line in biling_aln_lines:
                gloss_s, trans_s = line.split()[0:2]
                # Skip fractional (non-word) gloss indices.
                if '.' in gloss_s:
                    continue
                gloss_i = int(gloss_s)
                # BUGFIX: apply the gloss offset once per line; the old
                # code decremented gloss_i inside the token loop, so a
                # line aligned to several translation tokens had its
                # gloss index decremented repeatedly.
                if gloss_offset:
                    gloss_i -= 1

                for trans_token in trans_s.split(','):
                    trans_i = int(trans_token)
                    if trans_i == 0:  # 0 marks "unaligned"
                        continue
                    if trans_offset:
                        trans_i -= 1
                    a.add((trans_i, gloss_i))
        except (ValueError, IndexError):
            # FIX: was a bare 'except:'; malformed alignment lines keep
            # whatever pairs were already parsed.
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), a,
                                aln_method=INTENT_ALN_MANUAL)

    return xc
def test_append(self):
    tier = Tier()
    # Only Items may be appended; every other model object is rejected.
    for bad in (Tier(), Igt(), XigtCorpus(), Metadata(), Meta()):
        with pytest.raises(XigtStructureError):
            tier.append(bad)
    assert len(tier) == 0
    tier.append(Item(id='t1'))
    assert len(tier) == 1
    # Duplicate ids are rejected.
    with pytest.raises(XigtError):
        tier.append(Item(id='t1'))
    tier.append(Item(id='t2'))
    assert len(tier) == 2
    assert tier[0].id == 't1'
    assert tier[1].id == 't2'