def generate_phrase_tier(inst, tag, id, type) -> Tier:
    """
    Return the phrase tier for ``tag``, creating it when none exists yet.

    The instance is first searched with ``xigt_find`` for an element of the
    requested ``type`` whose ODIN tags contain ``tag``. When the search comes
    back empty, a new tier is built whose ``content`` refers to the normalized
    tier, one item is added per line returned by ``retrieve_normal_lines``,
    and the new tier is appended to ``inst``.

    :param inst: Instance that is searched and, if needed, extended.
    :param tag: ODIN tag the phrase tier must carry.
    :param id: ID assigned to the tier if it has to be created.
    :param type: Tier type to search for and to create.
    :return: The tier that was found, or the newly created one.
    """
    def carries_tag(candidate):
        return tag in odin_tags(candidate)

    phrase_tier = xigt_find(inst, type=type, others=[carries_tag])
    if phrase_tier is not None:
        return phrase_tier

    # Nothing found: build the phrase tier on top of the normalized tier.
    norm_tier = generate_normal_tier(inst)
    phrase_tier = Tier(id=id, type=type, content=norm_tier.id)

    for line in retrieve_normal_lines(inst, tag):
        # A judgment recorded on the line is carried over to the phrase item;
        # lines without one get an empty attribute dict.
        judgment = line.attributes.get(ODIN_JUDGMENT_ATTRIBUTE)
        item_attrs = {} if judgment is None else {ODIN_JUDGMENT_ATTRIBUTE: judgment}

        phrase_tier.append(Item(id=ask_item_id(phrase_tier),
                                content=line.id,
                                attributes=item_attrs))

    inst.append(phrase_tier)
    return phrase_tier
def words_to_morph_tier(tier, type, id, aln_attribute):
    """
    Build a morpheme tier out of the words in ``tier``.

    Every word is tokenized with ``morpheme_tokenizer`` and each resulting
    morph becomes one item of the new tier. The new tier and its items point
    back at their source through ``aln_attribute``.

    :param tier: Tier whose items are the words to split.
    :type tier: Tier
    :param type: Tier type of the tier being created.
    :param id: ID of the tier being created.
    :param aln_attribute: Name of the reference attribute set on the new tier
                          (to ``tier.id``) and on each of its items.
    :return: The newly created tier (it is not attached to anything here).
    """
    morph_tier = Tier(id=id, attributes={aln_attribute: tier.id}, type=type)

    for word in tier:
        pieces = tokenize_item(word, morpheme_tokenizer)

        for piece in pieces:
            # A word that yields exactly one morph is referenced by its plain
            # id; otherwise the morph references its own span of the word.
            reference = (word.id if len(pieces) == 1
                         else create_aln_expr(word.id, piece.start, piece.stop))

            morph_tier.append(Item(id=gen_item_id(morph_tier.id, len(morph_tier)),
                                   attributes={aln_attribute: reference}))

    return morph_tier
def create_words_tier(cur_item, word_id, word_type, aln_attribute=SEGMENTATION, tokenizer=sentence_tokenizer):
    """
    Tokenize ``cur_item`` and return a new words tier referencing it.

    :param cur_item: The phrase or line item whose value is tokenized.
    :param word_id: ID for the tier being created.
    :type word_id: str
    :param word_type: Tier type for the tier being created.
    :type word_type: str
    :param aln_attribute: Name of the reference attribute set on the new tier
                          (to the ID of ``cur_item``'s tier) and on every word.
    :param tokenizer: Tokenizer handed to ``tokenize_item``.
    :return: The new tier, holding one item per token; it has no items when
             the value of ``cur_item`` is missing or only whitespace.
    """
    text = cur_item.value()

    # A line can be present but empty (e.g. a blank gloss line); in that case
    # there is nothing to tokenize.
    if text is not None and text.strip():
        tokens = tokenize_item(cur_item, tokenizer=tokenizer)
    else:
        tokens = []

    word_tier = Tier(id=word_id,
                     type=word_type,
                     attributes={aln_attribute: cur_item.tier.id},
                     igt=cur_item.igt)

    for token in tokens:
        # Each word is a reference to its span within the source item.
        item_id = gen_item_id(word_tier.id, len(word_tier))
        span_ref = create_aln_expr(cur_item.id, token.start, token.stop)
        word_tier.append(Item(id=item_id,
                              attributes={aln_attribute: span_ref},
                              tier=word_tier))

    return word_tier
def create_word_tier(tag, words, src_item=None):
    """
    Create the words tier for the given ODIN tag from a list of word strings.

    Without ``src_item`` each word becomes an item carrying the word as its
    ``text``. With ``src_item`` each word instead becomes a reference to the
    span it occupies in the value of ``src_item``; the words are located left
    to right, so they must occur in the source text in the order given.

    :param tag: One of ODIN_TRANS_TAG, ODIN_GLOSS_TAG or ODIN_LANG_TAG; selects
                the tier's id, type and reference attributes.
    :param words: The word strings, in order.
    :type words: list[str]
    :param src_item: Optional item the words are drawn from.
    :type src_item: xigt.model.Item
    :return: The newly created words tier.
    :raises ValueError: If ``tag`` is not one of the three supported tags.
        (Assuming ``src_item.value()`` is a ``str``, ``str.index`` also raises
        ValueError when a word cannot be found in the remaining source text.)
    """
    if tag == ODIN_TRANS_TAG:
        wt = Tier(id=TRANS_WORD_ID, type=TRANS_WORD_TYPE, segmentation=TRANS_PHRASE_ID)
        aln_attr = SEGMENTATION
    elif tag == ODIN_GLOSS_TAG:
        wt = Tier(id=GLOSS_WORD_ID, type=GLOSS_WORD_TYPE, content=NORM_ID, alignment=LANG_WORD_ID)
        aln_attr = CONTENT
    elif tag == ODIN_LANG_TAG:
        wt = Tier(id=LANG_WORD_ID, type=LANG_WORD_TYPE, segmentation=LANG_PHRASE_ID)
        aln_attr = SEGMENTATION
    else:
        # Previously an unknown tag left ``wt`` unbound and failed further
        # down with an UnboundLocalError; fail early with a clear message.
        raise ValueError('Unsupported tag for word tier creation: {!r}'.format(tag))

    # When segmenting a source item, the not-yet-consumed tail of its text is
    # kept in ``remaining`` and ``consumed`` counts the characters already cut
    # off the front, so spans can be reported relative to the full text.
    remaining = src_item.value() if src_item else None
    consumed = 0

    for w in words:
        if src_item:
            # Find the word in the remaining text, then slide the window past
            # it so that a repeated word matches its next occurrence.
            start = remaining.index(w)
            stop = start + len(w)
            remaining = remaining[stop:]
            cur_range = (start + consumed, stop + consumed)
            consumed += stop

            item = Item(id=ask_item_id(wt),
                        attributes={aln_attr: create_aln_expr(src_item.id, *cur_range)})
        else:
            item = Item(id=ask_item_id(wt), text=w)

        wt.append(item)

    if tag == ODIN_GLOSS_TAG:
        add_word_level_info(wt, INTENT_GLOSS_WORD)

    return wt
def test_append(self):
    """Tier.append rejects non-Item children and duplicate item ids."""
    tier = Tier()

    # None of these objects may be appended to a tier.
    for invalid_child in (Tier(), Igt(), XigtCorpus(), Metadata(), Meta()):
        with self.assertRaises(XigtStructureError):
            tier.append(invalid_child)
    self.assertEqual(len(tier), 0)

    tier.append(Item(id='t1'))
    self.assertEqual(len(tier), 1)

    # A second item reusing an existing id is refused.
    with self.assertRaises(XigtError):
        tier.append(Item(id='t1'))

    tier.append(Item(id='t2'))
    self.assertEqual(len(tier), 2)

    # Items are kept in insertion order.
    self.assertEqual(tier[0].id, 't1')
    self.assertEqual(tier[1].id, 't2')
def create_text_tier_from_lines(inst, lines, id_base, state):
    """
    Build an ODIN-type text tier holding one item per entry of ``lines``.

    Each entry must be dict-like with the keys 'text' and 'tag'. The optional
    keys 'labels', 'judgment' and 'lineno' are read as well: a non-empty
    'labels' value is joined to the tag with '+', 'judgment' is stored as the
    judgment attribute, and a truthy 'lineno' is stored under 'line'.

    :param inst: Instance passed to ``gen_tier_id`` to pick the tier's ID.
    :param lines: The line descriptions to turn into items.
    :type lines: list[dict]
    :param id_base: Base string passed to ``gen_tier_id``.
    :param state: Value stored in the tier's state attribute.
    :return: The newly created tier (it is not appended to ``inst`` here).
    :raises Exception: If an entry is not dict-like or lacks 'text' or 'tag'.
    """
    # 1) The tier that will hold the line items.
    tier = Tier(id=gen_tier_id(inst, id_base),
                type=ODIN_TYPE,
                attributes={STATE_ATTRIBUTE: state})

    # 2) One item per line description.
    for line in lines:
        well_formed = hasattr(line, 'get') and 'text' in line and 'tag' in line
        if not well_formed:
            raise Exception("When constructing tier from lines, must be a list of dicts with keys 'text' and 'tag'.")

        # The tag attribute is the tag followed by any labels, '+'-separated.
        tag_parts = []
        tag = line.get('tag')
        if tag is not None:
            tag_parts.append(tag)
        labels = line.get('labels')
        if labels:
            tag_parts.append(labels)

        attrs = {ODIN_TAG_ATTRIBUTE: '+'.join(tag_parts)}

        judgment = line.get('judgment')
        if judgment is not None:
            attrs[ODIN_JUDGMENT_ATTRIBUTE] = judgment

        # Only a truthy line number is recorded.
        lineno = line.get('lineno')
        if lineno:
            attrs['line'] = lineno

        tier.append(Item(id=gen_item_id(tier.id, len(tier)),
                         attributes=attrs,
                         text=line.get('text')))

    return tier
def test_resolve_ref(self):
    """Walk resolve_ref through each failure stage until it succeeds."""
    # The item does not have the reference attribute at all.
    b1 = Item(id='b1')
    with self.assertRaises(KeyError):
        b1.resolve_ref('alignment')

    # The attribute is set, but the item is not inside a tier.
    b1.alignment = 'a1'
    with self.assertRaises(XigtStructureError):
        b1.resolve_ref('alignment')

    # The item is inside a tier, but the tier lacks the reference attribute.
    t_b = Tier(id='b', items=[b1])
    with self.assertRaises(KeyError):
        b1.resolve_ref('alignment')

    # The tier has the attribute, but is not inside an Igt.
    t_b.alignment = 'a'
    with self.assertRaises(XigtStructureError):
        b1.resolve_ref('alignment')

    # The tier is inside an Igt, but the referenced tier is missing.
    igt = Igt(tiers=[t_b])
    with self.assertRaises(XigtStructureError):
        b1.resolve_ref('alignment')

    # The referenced tier exists, but not the referenced item.
    t_a = Tier(id='a')
    igt.append(t_a)
    with self.assertRaises(XigtStructureError):
        b1.resolve_ref('alignment')

    # The referenced item exists without a value: resolves to ''.
    a1 = Item(id='a1')
    t_a.append(a1)
    self.assertEqual(b1.resolve_ref('alignment'), '')

    # The referenced item has a value.
    a1.text = 'text'
    self.assertEqual(b1.resolve_ref('alignment'), 'text')

    # Items stored on the test case.
    for unaligned in (self.i1, self.i2):
        with self.assertRaises(KeyError):
            unaligned.resolve_ref('alignment')

    stored_cases = (
        (self.i_ac, 'alignment', 'text'),
        (self.i_ac, 'content', 'te'),
        (self.i_s, 'segmentation', 'xt'),
        (self.i_t, 'content', 'text'),
    )
    for item, refattr, expected in stored_cases:
        self.assertEqual(item.resolve_ref(refattr), expected)
def naacl_to_xigt(naacl_path):
    """
    Convert a file in the NAACL format to a XIGT corpus.

    Every span of the file matching ``Igt_id ... Q6 ... Answer`` is treated as
    one instance. For each instance the three lines after the first one are
    taken as the language, gloss and translation lines and stored in a raw
    ODIN tier; normalized tiers are generated without cleaning; then the
    dependency structures (Q6 for the language line, Q3 for the translation)
    and the bilingual alignment (Q5) are added.

    Dependency parsing and alignment parsing are best-effort: malformed
    sections are skipped rather than aborting the conversion.

    :param naacl_path: Path of the NAACL-format file to read.
    :return: A XigtCorpus with one Igt per instance found in the file.
    :raises AttributeError: If an instance lacks a delimited Q6, Q3 or Q5
        section (``re.search`` returns None in that case).
    """
    # Read through a context manager so the file handle is always closed.
    with open(naacl_path, 'r') as naacl_file:
        content = naacl_file.read()

    # First, collect all the instances.
    instances = re.findall(r'Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        # Instances are numbered in order of appearance; the Igt_id from the
        # file is not reused.
        inst = Igt(id='i{}'.format(len(xc)))

        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Now, create the raw tier...
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin', attributes={STATE_ATTRIBUTE: RAW_STATE})
        raw_lines = ((lang_raw, ODIN_LANG_TAG),
                     (gloss_raw, ODIN_GLOSS_TAG),
                     (trans_raw, ODIN_TRANS_TAG))
        for raw_text, odin_tag in raw_lines:
            raw_tier.append(Item(id=ask_item_id(raw_tier),
                                 text=raw_text,
                                 attributes={ODIN_TAG_ATTRIBUTE: odin_tag}))

        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Language dependency structure (Q6 section). The slice bounds
        # presumably strip the section's header and footer lines -- TODO
        # confirm against the NAACL file layout.
        lang_ds_str = re.search(r'Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]

        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError):
            # Best-effort: an unparseable structure is simply left out.
            pass

        # English dependency structure (Q3 section).
        eng_ds_str = re.search(r'Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]

        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError, ValueError):
            # Best-effort: an unparseable structure is simply left out.
            pass

        # Bilingual alignment (Q5 section).
        biling_aln_str = re.search(r'Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        # When a raw line starts with a space, the indices given for that
        # line are shifted down by one.
        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        # Created outside the try block so that a usable (possibly partial)
        # alignment for THIS instance always exists afterwards.
        a = Alignment()
        try:
            for line in biling_aln_lines:
                gloss_s, trans_s = line.split()[0:2]

                # Gloss indices containing a '.' are skipped.
                if '.' in gloss_s:
                    continue

                gloss_i = int(gloss_s)
                # Shift the gloss index once per line. It used to be
                # decremented inside the loop below, i.e. once per aligned
                # translation token, which skewed one-to-many alignments.
                if gloss_offset:
                    gloss_i -= 1

                for trans_token in trans_s.split(','):
                    trans_i = int(trans_token)
                    # Index 0 marks "no alignment".
                    if trans_i == 0:
                        continue
                    if trans_offset:
                        trans_i -= 1
                    a.add((trans_i, gloss_i))
        except Exception:
            # Best-effort: on a malformed line keep the pairs collected so
            # far. Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, aln_method=INTENT_ALN_MANUAL)

    return xc
def test_append(self):
    """Tier.append rejects non-Item children and duplicate item ids."""
    tier = Tier()

    # None of these objects may be appended to a tier.
    for invalid_child in (Tier(), Igt(), XigtCorpus(), Metadata(), Meta()):
        with pytest.raises(XigtStructureError):
            tier.append(invalid_child)
    assert len(tier) == 0

    tier.append(Item(id='t1'))
    assert len(tier) == 1

    # A second item reusing an existing id is refused.
    with pytest.raises(XigtError):
        tier.append(Item(id='t1'))

    tier.append(Item(id='t2'))
    assert len(tier) == 2

    # Items are kept in insertion order.
    assert [tier[0].id, tier[1].id] == ['t1', 't2']