def create_words_tier_from_string(string):
    """
    Build a words tier out of a plain string.

    The string is split by ``tokenize_string`` using ``whitespace_tokenizer``,
    and every resulting token becomes one item of a new tier of type
    ``WORDS_TYPE``.

    :param string: the text to tokenize
    :return: the new tier; it is not appended to any instance here
    """
    token_list = tokenize_string(string, tokenizer=whitespace_tokenizer)
    words_tier = Tier(type=WORDS_TYPE)
    for tok in token_list:
        # The id is requested from the tier before the item is appended, so
        # this has to stay a sequential loop.
        words_tier.append(Item(id=ask_item_id(words_tier), text=tok.value()))
    return words_tier
def parse_odin_inst(string, corpus=None, idnum=None):
    """
    Parse ODIN-style text and create an IGT instance from it.

    The text is searched for a header of the form
    ``doc_id=<id> <start> <stop> <tag types>`` and for any number of lines of
    the form ``line=<n> tag=<tag>:<text>``. The header values are stored as
    attributes of the instance, and each matched line becomes an item of a
    raw tier.

    :param string: the ODIN-formatted text of a single instance
    :param corpus: optional corpus that is asked for the instance id when
        ``idnum`` is not given
    :param idnum: optional number from which the instance id is generated;
        takes precedence over ``corpus``
    :return: the new ``Igt``, after ``basic_processing`` has been run on it
    """
    # Raw strings: '\S' and '\s' are regex escapes, not string escapes.
    doc_re = re.search(r'doc_id=(\S+)\s([0-9]+)\s([0-9]+)\s(.*)\n', string)
    # NOTE: if the header is missing, ``doc_re`` is None and the next line
    # raises AttributeError.
    docid, lnstart, lnstop, tagtypes = doc_re.groups()

    # Pick the instance id: explicit number first, then the given corpus,
    # otherwise count against a fresh corpus created locally.
    if idnum is not None:
        inst_id = gen_item_id('i', idnum)
    elif corpus:
        inst_id = corpus.askIgtId()
    else:
        corpus = XigtCorpus()
        inst_id = 'i{}'.format(len(corpus))

    inst = Igt(id=inst_id,
               attributes={'doc-id': docid,
                           'line-range': '%s %s' % (lnstart, lnstop),
                           'tag-types': tagtypes})

    # Now, find all the lines.
    lines = re.findall(r'line=([0-9]+)\stag=(\S+):(.*)\n?', string)

    # Create a raw tier holding one item per matched line.
    rt = Tier(id=RAW_ID, type=ODIN_TIER_TYPE,
              attributes={STATE_ATTRIBUTE: RAW_STATE}, igt=inst)
    for lineno, linetag, linetxt in lines:
        line_item = Item(id=ask_item_id(rt), text=linetxt,
                         attributes={'tag': linetag, 'line': lineno},
                         tier=rt)
        rt.append(line_item)

    inst.append(rt)
    basic_processing(inst)
    return inst
def xigt_add_dependencies(xigt_inst: Igt, phrase: Phrase, method: str):
    """
    Render a phrase's dependency structure as a 'dependencies' tier.

    One item is written per dependency link, ordered by the index of the
    link's child. Each item carries the child's id as ``dep``, the parent's
    id as ``head`` when the link has a parent, and the link type as its text
    when the link has one.

    :param xigt_inst: the instance that receives the tier
    :param phrase: the phrase whose ``dependency_structure`` is serialized
    :param method: value recorded under ``DATA_METHOD_KEY`` on the tier
    :return: None; nothing is added if the phrase has no dependency structure
    """
    links = phrase.dependency_structure
    if not links:
        return

    tier_id = generate_tier_id(xigt_inst, 'dependencies', phrase.id)
    tier_attrs = {
        'dep': phrase.id,
        'head': phrase.id,
        DATA_PROV_KEY: INTENT2_DATA_PROV,
        DATA_METHOD_KEY: method,
        DATA_TIME_KEY: add_timestamp(),
    }
    tier = Tier(type='dependencies', id=tier_id, attributes=tier_attrs)

    ordered_links = sorted(links, key=lambda link: link.child.index)
    for number, link in enumerate(ordered_links, start=1):
        item = Item(id='{}_dep{}'.format(tier_id, number),
                    attributes={'dep': link.child.id})
        # A link without a parent gets no 'head' attribute.
        if link.parent:
            item.attributes['head'] = link.parent.id
        if link.type:
            item.text = link.type
        tier.append(item)

    # Only attach the tier if it evaluates as true after filling.
    if tier:
        xigt_inst.append(tier)
def xigt_add_bilingual_alignment(xigt_inst: Igt, trans: Phrase, method):
    """
    Encode a translation phrase's alignments as bilingual-alignments tiers.

    Two tiers are built: one linking translation words to the gloss subwords
    found in each word's ``alignments``, and one linking translation words to
    their ``aligned_lang_words``. A tier is appended to the instance only if
    it is non-empty.

    :param xigt_inst: the instance that receives the tiers
    :param trans: the translation phrase whose words are iterated
    :param method: value recorded under ``DATA_METHOD_KEY`` on the
        translation-to-gloss tier
    """
    tier_type = 'bilingual-alignments'

    gloss_tier_id = generate_tier_id(xigt_inst, tier_type,
                                     TRANS_WORD_ID, GLOSS_SUBWORD_ID)
    gloss_tier = Tier(id=gloss_tier_id, type=tier_type,
                      attributes={'source': TRANS_WORD_ID,
                                  'target': GLOSS_SUBWORD_ID,
                                  DATA_PROV_KEY: INTENT2_DATA_PROV,
                                  DATA_METHOD_KEY: method,
                                  DATA_TIME_KEY: add_timestamp()})

    lang_tier_id = generate_tier_id(xigt_inst, tier_type,
                                    TRANS_WORD_ID, LANG_WORD_ID)
    # NOTE(review): unlike the gloss tier above, this tier records no
    # DATA_METHOD_KEY -- confirm that this is intended.
    lang_tier = Tier(id=lang_tier_id, type=tier_type,
                     attributes={'source': TRANS_WORD_ID,
                                 'target': LANG_WORD_ID,
                                 DATA_PROV_KEY: INTENT2_DATA_PROV,
                                 DATA_TIME_KEY: add_timestamp()})

    for trans_word in trans:  # type: TransWord
        # Translation word -> gloss subword links; other alignment targets
        # are skipped.
        for target in trans_word.alignments:
            if not isinstance(target, SubWord):
                continue
            assert trans_word.id is not None
            assert target.id is not None, target
            # Item ids are numbered from 1 in insertion order.
            gloss_item_id = '{}_{}'.format(gloss_tier_id, len(gloss_tier) + 1)
            gloss_tier.append(Item(id=gloss_item_id,
                                   attributes={'source': trans_word.id,
                                               'target': target.id}))

        # Translation word -> language word links.
        for lang_word in trans_word.aligned_lang_words:
            lang_item_id = '{}_{}'.format(lang_tier_id, len(lang_tier) + 1)
            lang_tier.append(Item(id=lang_item_id,
                                  attributes={'source': trans_word.id,
                                              'target': lang_word.id}))

    # Only append a tier if it's not empty.
    if gloss_tier:
        xigt_inst.append(gloss_tier)
    if lang_tier:
        xigt_inst.append(lang_tier)
def xigt_add_pos(xigt_inst: Igt, tokens: List[Union[Word, SubWord]], tgt_id: str, method):
    """
    Add a 'pos' tier for a list of tagged words or subwords.

    Every token with a truthy ``pos`` yields one item whose text is the tag
    and whose alignment is the token's id. Item ids are numbered by the
    token's 1-based position in ``tokens``, so untagged tokens leave gaps in
    the numbering.

    :param xigt_inst: the instance that receives the tier
    :param tokens: the words or subwords to read tags from
    :param tgt_id: id of the tier the pos tier aligns with
    :param method: value recorded under ``DATA_METHOD_KEY`` on the tier
    :return: the pos tier; it is appended to ``xigt_inst`` only when it has
        items
    """
    tier_id = generate_tier_id(xigt_inst, 'pos', tgt_id)
    provenance = {
        DATA_PROV_KEY: INTENT2_DATA_PROV,
        DATA_METHOD_KEY: method,
        DATA_TIME_KEY: add_timestamp(),
    }
    tier = Tier(type='pos', id=tier_id, alignment=tgt_id, attributes=provenance)

    for position, token in enumerate(tokens, start=1):
        if not token.pos:
            continue
        item_id = '{}_{}'.format(tier_id, position)
        tier.append(Item(text=token.pos, id=item_id, alignment=token.id))

    if tier.items:
        xigt_inst.append(tier)
    return tier
def raw_txt_to_inst(string, corpus=None, idnum=None):
    """
    Create an IGT instance from raw text, one text line per IGT line.

    Three lines are read as language, gloss, translation (L-G-T). Four lines
    are read as native orthography (tagged as language plus ``+FR``),
    language, gloss, translation. The lines are stored as items of a raw
    tier; ``basic_processing`` is not run here.

    :param string: the raw text, with lines separated by ``\\n``
    :param corpus: optional corpus that is asked for the instance id when
        ``idnum`` is not given
    :param idnum: optional number from which the instance id is generated;
        takes precedence over ``corpus``
    :return: the new ``Igt`` containing the raw tier
    :raises RawTextParseError: if there are fewer than three or more than
        four lines, or if any line is blank
    """
    lines = string.split('\n')
    line_count = len(lines)

    # Validate everything up front, before an id is requested or any object
    # is built.
    if line_count < 3:
        raise RawTextParseError("Three lines are assumed for raw text. Instead got {}".format(line_count))

    if line_count == 3:
        linetags = (ODIN_LANG_TAG, ODIN_GLOSS_TAG, ODIN_TRANS_TAG)
    elif line_count == 4:
        # If we have four lines, assume that the first is native orthography.
        linetags = (ODIN_LANG_TAG + '+FR', ODIN_LANG_TAG,
                    ODIN_GLOSS_TAG, ODIN_TRANS_TAG)
    else:
        raise RawTextParseError("Unknown number of lines...")

    for linetag, line in zip(linetags, lines):
        if not line.strip():
            raise RawTextParseError("The {} line is empty: {}".format(linetag, line))

    # Pick the instance id: explicit number first, then the given corpus,
    # otherwise count against a fresh corpus created locally.
    if idnum is not None:
        inst_id = gen_item_id('i', idnum)
    elif corpus:
        inst_id = corpus.askIgtId()
    else:
        corpus = XigtCorpus()
        inst_id = 'i{}'.format(len(corpus))

    inst = Igt(id=inst_id)
    rt = Tier(id=RAW_ID, type=ODIN_TIER_TYPE,
              attributes={STATE_ATTRIBUTE: RAW_STATE}, igt=inst)
    for linetag, line in zip(linetags, lines):
        rt.append(Item(id=ask_item_id(rt), text=line,
                       attributes={'tag': linetag}))

    inst.append(rt)
    return inst
def tier_to_xigt(igt: Igt, phrase: Phrase, phrase_type: str):
    """
    Serialize a phrase into phrase, word and subword tiers on an Igt.

    Which tiers are created, and their types, ids, alignment and
    segmentation targets, is looked up with ``get_xigt_str`` for the given
    ``phrase_type``; a tier whose lookup is falsy is skipped.

    :param igt: the instance that receives the tiers
    :param phrase: the phrase to serialize; nothing is done if it is empty
    :param phrase_type: key selecting the tier definitions for this phrase
    :return: None
    """
    # -- 0) Do nothing if the phrase is empty.
    if not phrase:
        return

    # -- 1) Create the phrase tier if one is expected.
    phrase_dict = get_xigt_str([phrase_type, PHRASE_KEY])
    if phrase_dict:
        phrase_item = Item(id=phrase_dict[ID_KEY] + '1', text=phrase.hyphenated)
        igt.append(Tier(type=phrase_dict[TYPE_KEY],
                        id=phrase_dict[ID_KEY],
                        items=[phrase_item]))

    # -- 1a) Create the word tier.
    word_dict = get_xigt_str([phrase_type, WORDS_KEY])
    word_tier = None
    if word_dict:
        word_tier = Tier(type=word_dict[TYPE_KEY],
                         id=word_dict[ID_KEY],
                         alignment=word_dict.get(ALN_KEY),
                         segmentation=word_dict.get(SEG_KEY))

    # -- 1b) Create the subword tier. (Only add segmentation if we're also
    #        creating a word tier; e.g. lang, not gloss.)
    sw_dict = get_xigt_str([phrase_type, SUBWORDS_KEY])
    subword_tier = None
    if sw_dict:
        subword_tier = Tier(type=sw_dict[TYPE_KEY], id=sw_dict[ID_KEY])
        if word_dict:
            subword_tier.segmentation = word_dict[ID_KEY]

    # Subwords are numbered across the whole phrase, not per word.
    num_subwords = 0

    # -- 2) Iterate through the words.
    for word_i, word in enumerate(phrase):  # type: int, Word
        word_id = None
        if word_dict:
            # Use the word's own id if it has one, else generate one from
            # the tier id and the 1-based word position.
            word_id = word.id if word.id else '{}{}'.format(
                word_dict[ID_KEY], word_i + 1)
            alignments = [aligned.id for aligned in word.aligned_words()]
            word_tier.append(Item(
                text=word.hyphenated,
                id=word_id,
                alignment=','.join(alignments) if alignments else None))

        # -- 3) Iterate through subwords.
        if not sw_dict:
            continue
        for subword in word:
            subword_id = subword.id if subword.id else '{}{}'.format(
                sw_dict[ID_KEY], num_subwords + 1)
            subword_item = Item(text=subword.hyphenated, id=subword_id)

            if subword.alignments and sw_dict.get(ALN_KEY):
                # Only id-bearing SubWord targets are written to the item.
                aligned_ids = [a.id for a in subword.alignments
                               if a.id and isinstance(a, SubWord)]
                # Leave the alignment unset, not '', when no usable target
                # remains; this matches how word items are handled above.
                if aligned_ids:
                    subword_item.alignment = ','.join(aligned_ids)
                subword_tier.alignment = sw_dict[ALN_KEY]

            if word_dict:
                subword_item.segmentation = word_id

            subword_tier.append(subword_item)
            num_subwords += 1  # Make sure to increment the index

    # Add our tiers to the parent Igt instance.
    if word_dict:
        igt.append(word_tier)
    if sw_dict:
        igt.append(subword_tier)