Example #1
0
    def test_w_metadata(self):
        """
        Without an alignment attribute, the word-level status of the tier
        should follow whatever intent was most recently recorded on it via
        add_word_level_info.
        """
        tier = self.gw
        tier.alignment = None

        # Nothing has been recorded yet, so it is not reported as word-level.
        self.assertFalse(is_word_level_gloss(tier))

        # Record each intent in turn and check the reported level flips with it.
        steps = (
            (INTENT_GLOSS_WORD, self.assertTrue),
            (INTENT_GLOSS_MORPH, self.assertFalse),
        )
        for intent, check in steps:
            add_word_level_info(tier, intent)
            check(is_word_level_gloss(tier))
Example #2
0
def create_word_tier(tag, words, src_item=None):
    """
    Build a new word tier of the kind selected by ``tag`` from a list of word strings.

    If ``src_item`` is given, the created items carry no text of their own; each
    one instead references the character span of ``src_item`` at which the word
    is found. The source text is scanned left to right, so a repeated word maps
    to successive occurrences. If no ``src_item`` is given, each item simply
    stores the word as its text.

    Gloss tiers are additionally marked as word-level via ``add_word_level_info``.

    :param tag: one of ODIN_TRANS_TAG, ODIN_GLOSS_TAG or ODIN_LANG_TAG.
    :param words: the word strings, in the order they occur in the source line.
    :type words: list[str]
    :param src_item: optional item whose value the words are segmenting.
    :type src_item: xigt.model.Item
    :return: the newly created tier (it is not appended to any instance here).
    :raises ValueError: if ``tag`` is not one of the supported tags, or if a word
        cannot be found in the not-yet-consumed part of the source text.
    """

    if tag == ODIN_TRANS_TAG:
        wt = Tier(id=TRANS_WORD_ID, type=TRANS_WORD_TYPE, segmentation=TRANS_PHRASE_ID)
        aln_attr = SEGMENTATION
    elif tag == ODIN_GLOSS_TAG:
        wt = Tier(id=GLOSS_WORD_ID, type=GLOSS_WORD_TYPE, content=NORM_ID, alignment=LANG_WORD_ID)
        aln_attr = CONTENT
    elif tag == ODIN_LANG_TAG:
        wt = Tier(id=LANG_WORD_ID, type=LANG_WORD_TYPE, segmentation=LANG_PHRASE_ID)
        aln_attr = SEGMENTATION
    else:
        # Fail here with a clear message instead of hitting an unbound
        # local further down when the tier is first used.
        raise ValueError("Unsupported tag for word tier creation: {!r}".format(tag))

    # When segmenting a source item, keep the full text and a cursor marking
    # where the previous word ended, so each search resumes from there.
    src_text = src_item.value() if src_item else None
    cursor = 0

    for w in words:
        if src_item:
            # Searching from the cursor yields absolute offsets directly and
            # avoids re-slicing the remaining text for every word.
            start = src_text.index(w, cursor)
            stop = start + len(w)
            cursor = stop
            i = Item(id=ask_item_id(wt),
                     attributes={aln_attr: create_aln_expr(src_item.id, start, stop)})
        else:
            i = Item(id=ask_item_id(wt), text=w)

        wt.append(i)

    if tag == ODIN_GLOSS_TAG:
        add_word_level_info(wt, INTENT_GLOSS_WORD)

    return wt
Example #3
0
def generate_gloss_words(inst, create=True):
    """
    Return the word-level gloss tier of an IGT instance, optionally creating it.

    1. If a word-level gloss tier already exists, return it. When that tier has
       an ``alignment`` attribute, its word-level metadata is removed first,
       because the alignment already indicates the type.
    2. If it does not exist and ``create`` is true, tokenize the normalized
       gloss line with ``whitespace_tokenizer``, mark the new tier as
       word-level, append it to ``inst`` and return it.
    3. If it does not exist and ``create`` is false, return None.

    :param inst: Instance which to create the tiers from.
    :type inst: RGIgt
    :param create: whether to build the tier when it is not already present.
    :type create: bool
    :rtype: RGWordTier
    :raises EmptyGlossException: if there is no normalized gloss line, or its
        value is None or contains only whitespace.
    """

    # 1. Look for an existing words tier that aligns with the normalized tier...
    gloss_tier = xigt_find(inst, type=GLOSS_WORD_TYPE,
                           # Add the "others" to find only the "glosses" tiers that
                           # are at the word level...

                           # TODO FIXME: Find more elegant solution
                           others=[lambda x: is_word_level_gloss(x),
                                   lambda x: ODIN_GLOSS_TAG in odin_tags(x)])

    if gloss_tier is not None:
        # If we have alignment, we can remove the metadata, because
        # that indicates the type for us.
        if gloss_tier.alignment is not None:
            remove_word_level_info(gloss_tier)
        return gloss_tier

    if not create:
        return None

    # 2. No existing tier: build one from the normalized gloss line.
    # The result is not needed here; the call is kept because the original
    # made it before retrieving the lines (presumably to ensure the
    # normalized tier exists -- TODO confirm).
    generate_normal_tier(inst)

    # Treat "no gloss line at all" the same as an empty one rather than
    # failing with an IndexError on the [0] lookup.
    gloss_lines = retrieve_normal_lines(inst, ODIN_GLOSS_TAG)
    gloss_line_item = gloss_lines[0] if gloss_lines else None

    gloss_text = gloss_line_item.value() if gloss_line_item is not None else None
    if gloss_text is None or not gloss_text.strip():
        raise EmptyGlossException()

    gloss_tier = create_words_tier(gloss_line_item, GLOSS_WORD_ID,
                                   GLOSS_WORD_TYPE, aln_attribute=CONTENT,
                                   tokenizer=whitespace_tokenizer)

    # Set the "gloss type" to the "word-level"
    add_word_level_info(gloss_tier, INTENT_GLOSS_WORD)
    inst.append(gloss_tier)
    return gloss_tier
Example #4
0
def glosses(inst) -> Tier:
    """
    Return the sub-word-level gloss tier of ``inst``, creating it if missing.

    A tier of type GLOSS_MORPH_TYPE that is not flagged as a word-level gloss
    is looked up first. If none is found, one is derived from the result of
    ``gloss(inst)`` using CONTENT (not SEGMENTATION) as the aligning
    attribute, flagged with INTENT_GLOSS_MORPH, and appended to ``inst``.

    In either case, if the tier has an ``alignment`` attribute, the level
    metadata is removed before the tier is returned.
    """

    def _not_word_level(tier):
        # Keeps the search from picking up the gloss-word tier by accident.
        return not is_word_level_gloss(tier)

    morph_tier = xigt_find(inst, type=GLOSS_MORPH_TYPE, others=[_not_word_level])

    if morph_tier is None:
        word_tier = gloss(inst)
        morph_tier = words_to_morph_tier(word_tier, GLOSS_MORPH_TYPE,
                                         GLOSS_MORPH_ID, CONTENT)

        # Record that this freshly built tier is not a word-level gloss.
        add_word_level_info(morph_tier, INTENT_GLOSS_MORPH)
        inst.append(morph_tier)

    # With alignment present the metadata attribute is dropped.
    if morph_tier.alignment is not None:
        remove_word_level_info(morph_tier)

    return morph_tier