def create_words_tier(cur_item, word_id, word_type,
                      aln_attribute=SEGMENTATION, tokenizer=sentence_tokenizer):
    """
    Create a words tier from an ODIN line type item.

    :param cur_item: Either a phrase item or a line item to tokenize and create words from.
    :type cur_item: RGItem
    :param word_id: The ID for this tier.
    :type word_id: str
    :param word_type: Tier type for this tier.
    :type word_type: str

    :rtype: RGWordTier
    """

    # For the edge case in which the gloss line is defined, but empty.
    if cur_item.value() is None or not cur_item.value().strip():
        words = []
    else:
        # Tokenize the words in this phrase...
        words = tokenize_item(cur_item, tokenizer=tokenizer)

    # Create a new word tier to hold the tokenized words...
    wt = Tier(id=word_id, type=word_type,
              attributes={aln_attribute: cur_item.tier.id},
              igt=cur_item.igt)

    for w in words:
        # Create a new word that is a segmentation of this tier.
        rw = Item(id=gen_item_id(wt.id, len(wt)),
                  attributes={aln_attribute: create_aln_expr(cur_item.id, w.start, w.stop)},
                  tier=wt)
        wt.append(rw)

    return wt

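# Usage sketch (hypothetical, not original code): given an IGT instance
# `inst` that already contains a phrase item, a words tier could be built
# and attached like so. The IDs ('p1', 'w') and the 'words' tier type are
# placeholders, assuming the defaults SEGMENTATION and sentence_tokenizer
# defined elsewhere in this package.
#
#   phrase = inst.find(id='p1')
#   wt = create_words_tier(phrase, word_id='w', word_type='words')
#   inst.append(wt)
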
def words_to_morph_tier(tier, type, id, aln_attribute):
    """
    Create a morpheme-level tier by tokenizing each word in a words tier.

    :param tier: Words tier whose items will be tokenized into morphemes.
    :type tier: Tier
    :param type: Tier type for the new morpheme tier.
    :param id: The ID for the new morpheme tier.
    :param aln_attribute: Attribute (e.g. segmentation) used to align morphemes to words.
    """
    mt = Tier(id=id, attributes={aln_attribute: tier.id}, type=type)

    # Go through each word...
    for word in tier:
        morphs = tokenize_item(word, morpheme_tokenizer)

        for morph in morphs:
            # If there is only one morph in the tokenization, don't bother
            # with the indexing, just use the id.
            if len(morphs) == 1:
                aln_str = word.id
            else:
                aln_str = create_aln_expr(word.id, morph.start, morph.stop)

            rm = Item(id=gen_item_id(mt.id, len(mt)),
                      attributes={aln_attribute: aln_str})
            mt.append(rm)

    return mt

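# Usage sketch (hypothetical): derive a morphemes tier from the words tier
# built above, aligned back to its words via SEGMENTATION. The tier type
# 'morphemes' and ID 'm' are placeholders.
#
#   mt = words_to_morph_tier(wt, 'morphemes', 'm', SEGMENTATION)
#   inst.append(mt)
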
def _process_file(f):
    # c: POS tag  -> word counts
    # d: word     -> POS tag counts
    # m: gloss gram -> POS tag counts
    c = TwoLevelCountDict()
    d = TwoLevelCountDict()
    m = TwoLevelCountDict()

    print("Processing file {}".format(f))
    xc = xc_load(f)

    for inst in xc:
        LOG.info("Now on instance {}".format(inst.id))

        # Search for the gloss POS tier, if it exists.
        gpos = inst.find(alignment=GLOSS_WORD_ID, type=POS_TIER_TYPE)

        # If a gloss POS tier was found...
        if gpos:
            # Iterate through the projected tags.
            for gp in gpos:
                word = gp.igt.find(id=gp.attributes[ALIGNMENT])
                grams = tokenize_item(word, morpheme_tokenizer)

                # Add the (gram, POSTag) pair as something that was encountered.
                for gram in grams:
                    m.add(gram.content.lower(), gp.value())

                # Count the (POSTag, word) pair once per word, not per gram.
                c.add(gp.value(), word.value().lower())
                d.add(word.value().lower(), gp.value())

    return (c, d, m)

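# Usage sketch (hypothetical): collect the three co-occurrence count
# dictionaries from a xigt-xml corpus file. The path 'corpus.xml' is a
# placeholder.
#
#   tag_to_words, word_to_tags, gram_to_tags = _process_file('corpus.xml')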