Beispiel #1
0
def create_words_tier(cur_item, word_id, word_type, aln_attribute = SEGMENTATION, tokenizer=sentence_tokenizer):
    """
    Tokenize an item and build a words tier whose items point back into it.

    :param cur_item: The phrase or line item whose value is tokenized.
    :param word_id: The ID given to the new tier.
    :type word_id: str
    :param word_type: The tier type given to the new tier.
    :type word_type: str
    :param aln_attribute: Name of the reference attribute set on the tier
                          (pointing at ``cur_item``'s tier) and on each word
                          item (pointing at a span of ``cur_item``).
    :param tokenizer: Tokenizer forwarded to ``tokenize_item``.

    :return: The newly created tier; it is empty when the item has no
             value or only whitespace.
    """
    text = cur_item.value()

    # A line can be present but blank; in that case there is nothing to tokenize.
    if text is not None and text.strip():
        tokens = tokenize_item(cur_item, tokenizer=tokenizer)
    else:
        tokens = []

    word_tier = Tier(id=word_id,
                     type=word_type,
                     attributes={aln_attribute: cur_item.tier.id},
                     igt=cur_item.igt)

    for token in tokens:
        # Ids are derived from the number of words already in the tier.
        item_id = gen_item_id(word_tier.id, len(word_tier))
        span_ref = create_aln_expr(cur_item.id, token.start, token.stop)
        word_tier.append(Item(id=item_id,
                              attributes={aln_attribute: span_ref},
                              tier=word_tier))

    return word_tier
Beispiel #2
0
def generate_phrase_tier(inst, tag, id, type) -> Tier:
    """
    Return the phrase tier of the given type for ``tag``, creating it if absent.

    An existing tier is looked up with ``xigt_find`` using ``type`` and a
    predicate that checks ``tag`` against ``odin_tags``. If none is found, a
    new tier is built with one item per line returned by
    ``retrieve_normal_lines(inst, tag)``, appended to ``inst`` and returned.
    """

    def carries_tag(candidate):
        return tag in odin_tags(candidate)

    phrase_tier = xigt_find(inst, type=type, others=[carries_tag])
    if phrase_tier is not None:
        return phrase_tier

    # Nothing found: build the phrase tier on top of the normalized tier.
    normal_tier = generate_normal_tier(inst)
    phrase_tier = Tier(id=id, type=type, content=normal_tier.id)

    for normal_line in retrieve_normal_lines(inst, tag):
        # Carry the line's judgment attribute over to the phrase item, if set.
        judgment = normal_line.attributes.get(ODIN_JUDGMENT_ATTRIBUTE)
        if judgment is None:
            item_attrs = {}
        else:
            item_attrs = {ODIN_JUDGMENT_ATTRIBUTE: judgment}

        phrase_item = Item(id=ask_item_id(phrase_tier),
                           content=normal_line.id,
                           attributes=item_attrs)
        phrase_tier.append(phrase_item)

    inst.append(phrase_tier)
    return phrase_tier
Beispiel #3
0
 def test_remove(self):
     """Removing the first item leaves the second at index 0 and drops the id lookup."""
     tier = Tier(items=[Item(id='i1'), Item(id='i2')])
     assert len(tier) == 2

     tier.remove(tier[0])

     assert len(tier) == 1
     assert tier[0].id == 'i2'
     # The removed id must no longer resolve.
     with pytest.raises(KeyError):
         tier['i1']
Beispiel #4
0
def words_to_morph_tier(tier, type, id, aln_attribute):
    """
    Build a morpheme tier from a words tier.

    :param tier: The words tier to split into morphemes.
    :type tier: Tier
    :param type: Tier type for the new morpheme tier.
    :param id: ID for the new morpheme tier.
    :param aln_attribute: Reference attribute name used both on the new tier
                          (pointing at ``tier``) and on each morph item.
    :return: The new morpheme tier.
    """
    morph_tier = Tier(id=id, attributes={aln_attribute: tier.id}, type=type)

    for word in tier:
        pieces = tokenize_item(word, morpheme_tokenizer)

        for piece in pieces:
            # A word that yields exactly one morph is referenced by its bare id;
            # otherwise the morph references its span inside the word.
            reference = (word.id if len(pieces) == 1
                         else create_aln_expr(word.id, piece.start, piece.stop))

            morph_item = Item(id=gen_item_id(morph_tier.id, len(morph_tier)),
                              attributes={aln_attribute: reference})
            morph_tier.append(morph_item)

    return morph_tier
Beispiel #5
0
 def test_get_attribute(self):
     """Own attributes resolve directly; the Igt's attribute needs inherit=True."""
     tier = Tier(id='t', attributes={'one': 1, 'two': 2})
     # The tier is placed in an Igt so that 'three' exists on the parent only.
     parent = Igt(tiers=[tier], attributes={'three': 3})

     for name, expected in (('one', 1), ('two', 2)):
         assert tier.get_attribute(name) == expected

     assert tier.get_attribute('three') is None
     assert tier.get_attribute('three', inherit=True) == 3
     assert tier.get_attribute('three', default=4) == 4
Beispiel #6
0
 def test_get_attribute(self):
     """Own attributes resolve directly; the Igt's attribute needs inherit=True."""
     tier = Tier(id='t', attributes={'one': 1, 'two': 2})
     # The tier is placed in an Igt so that 'three' exists on the parent only.
     parent = Igt(tiers=[tier], attributes={'three': 3})

     for name, expected in (('one', 1), ('two', 2)):
         self.assertEqual(tier.get_attribute(name), expected)

     self.assertIsNone(tier.get_attribute('three'))
     self.assertEqual(tier.get_attribute('three', inherit=True), 3)
     self.assertEqual(tier.get_attribute('three', default=4), 4)
Beispiel #7
0
    def setUp(self):
        """Create an empty tier (``t1``) and a fully populated tier (``t2``)."""
        self.t1 = Tier()

        meta_block = Metadata(type='meta', metas=[Meta(text='meta')])
        fixture_items = [Item(id='t1'), Item(id='t2')]
        self.t2 = Tier(id='t',
                       type='basic',
                       attributes={'attr': 'val'},
                       metadata=[meta_block],
                       items=fixture_items)
Beispiel #8
0
def create_word_tier(tag, words, src_item=None):
    """
    Build the words tier for a translation, gloss or language line.

    When ``src_item`` is given, each word item references the character span
    of that word inside the source item's value; words are located left to
    right, each search starting after the previous match. Without a source
    item, each word item simply carries the word as its text.

    :param tag: ``ODIN_TRANS_TAG``, ``ODIN_GLOSS_TAG`` or ``ODIN_LANG_TAG``;
                selects the tier id/type and the reference attribute used.
    :param words: The word strings, in order.
    :type words: list[str]
    :param src_item: Optional item the words are drawn from.
    :type src_item: xigt.model.Item
    :return: The new words tier. For the gloss tag, ``add_word_level_info``
             is applied to it before returning.
    :raises ValueError: If ``tag`` is not one of the three supported tags.
             NOTE(review): a word that cannot be found in the remaining source
             text also surfaces as an error from ``index`` (ValueError for
             ``str``) -- confirm callers expect that.
    """
    if tag == ODIN_TRANS_TAG:
        wt = Tier(id=TRANS_WORD_ID, type=TRANS_WORD_TYPE, segmentation=TRANS_PHRASE_ID)
        aln_attr = SEGMENTATION
    elif tag == ODIN_GLOSS_TAG:
        wt = Tier(id=GLOSS_WORD_ID, type=GLOSS_WORD_TYPE, content=NORM_ID, alignment=LANG_WORD_ID)
        aln_attr = CONTENT
    elif tag == ODIN_LANG_TAG:
        wt = Tier(id=LANG_WORD_ID, type=LANG_WORD_TYPE, segmentation=LANG_PHRASE_ID)
        aln_attr = SEGMENTATION
    else:
        # Previously an unknown tag fell through and failed later with an
        # UnboundLocalError on ``wt``; fail early with a meaningful message.
        raise ValueError('Unsupported tag for word tier creation: {!r}'.format(tag))

    # When segmenting a source item, keep the not-yet-consumed remainder of
    # its text plus the number of characters already consumed, so positions
    # found in the remainder can be mapped back to the full string.
    src_text = None
    offset = 0
    if src_item:
        src_text = src_item.value()

    for w in words:
        if src_item:
            # Sliding window: find the word in the remainder, then drop
            # everything up to and including it.
            start = src_text.index(w)
            stop = start + len(w)
            src_text = src_text[stop:]
            cur_range = (start + offset, stop + offset)
            offset += stop

            i = Item(id=ask_item_id(wt),
                     attributes={aln_attr: create_aln_expr(src_item.id, *cur_range)})
        else:
            i = Item(id=ask_item_id(wt), text=w)

        wt.append(i)

    if tag == ODIN_GLOSS_TAG:
        add_word_level_info(wt, INTENT_GLOSS_WORD)

    return wt
Beispiel #9
0
 def test_append(self):
     """Only Items may be appended, ids must be unique, and order is preserved."""
     tier = Tier()

     # Every non-Item xigt object is rejected as a structural error.
     for wrong_kind in (Tier(), Igt(), XigtCorpus(), Metadata(), Meta()):
         with self.assertRaises(XigtStructureError):
             tier.append(wrong_kind)
     self.assertEqual(len(tier), 0)

     tier.append(Item(id='t1'))
     self.assertEqual(len(tier), 1)

     # A second item with the same id is refused.
     with self.assertRaises(XigtError):
         tier.append(Item(id='t1'))

     tier.append(Item(id='t2'))
     self.assertEqual(len(tier), 2)
     self.assertEqual(tier[0].id, 't1')
     self.assertEqual(tier[1].id, 't2')
Beispiel #10
0
 def test_insert(self):
     """Insert honours the index, accepts an oversized index, and rejects duplicate ids."""
     tier = Tier()
     assert len(tier) == 0

     tier.insert(0, Item(id='t1'))
     assert len(tier) == 1

     with pytest.raises(XigtError):
         tier.insert(0, Item(id='t1'))

     tier.insert(0, Item(id='t2'))
     # An index far past the end places the item last.
     tier.insert(100, Item(id='t3'))

     assert len(tier) == 3
     for position, expected_id in enumerate(('t2', 't1', 't3')):
         assert tier[position].id == expected_id
Beispiel #11
0
def create_text_tier_from_lines(inst, lines, id_base, state):
    """
    Build an ODIN-type text tier from a list of line dictionaries.

    Each line must offer ``get`` and contain the keys ``'text'`` and ``'tag'``.
    The optional keys ``'labels'``, ``'judgment'`` and ``'lineno'`` are folded
    into the item's attributes when present.

    :param inst: Instance passed to ``gen_tier_id`` to obtain the tier id.
    :param lines: The line descriptions.
    :type lines: list[dict]
    :param id_base: Base string passed to ``gen_tier_id``.
    :param state: Value stored under ``STATE_ATTRIBUTE`` on the tier.
    :return: The new tier (it is not appended to ``inst`` here).
    :raises Exception: If a line is not dict-like or lacks 'text'/'tag'.
    """
    tier = Tier(id=gen_tier_id(inst, id_base),
                type=ODIN_TYPE,
                attributes={STATE_ATTRIBUTE: state})

    for line in lines:
        well_formed = hasattr(line, 'get') and 'text' in line and 'tag' in line
        if not well_formed:
            raise Exception("When constructing tier from lines, must be a list of dicts with keys 'text' and 'tag'.")

        # The tag attribute is the primary tag followed by any labels, '+'-joined.
        tag_parts = []
        primary_tag = line.get('tag')
        if primary_tag is not None:
            tag_parts.append(primary_tag)
        labels = line.get('labels')
        if labels:
            tag_parts.append(labels)

        attributes = {ODIN_TAG_ATTRIBUTE: '+'.join(tag_parts)}

        judgment = line.get('judgment')
        if judgment is not None:
            attributes[ODIN_JUDGMENT_ATTRIBUTE] = judgment

        # Only a truthy line number is recorded.
        line_number = line.get('lineno')
        if line_number:
            attributes['line'] = line_number

        tier.append(Item(id=gen_item_id(tier.id, len(tier)),
                         attributes=attributes,
                         text=line.get('text')))

    return tier
Beispiel #12
0
 def test_clear(self):
     """After clear() the tier is empty and neither index nor id lookups succeed."""
     tier = Tier()
     tier.extend([Item(id=item_id) for item_id in ('t1', 't2', 't3')])
     assert len(tier) == 3

     tier.clear()

     assert len(tier) == 0
     for key in (0, 't1'):
         assert tier.get(key) is None
Beispiel #13
0
 def test_clear(self):
     """After clear() the tier is empty and neither index nor id lookups succeed."""
     tier = Tier()
     tier.extend([Item(id=item_id) for item_id in ('t1', 't2', 't3')])
     self.assertEqual(len(tier), 3)

     tier.clear()

     self.assertEqual(len(tier), 0)
     for key in (0, 't1'):
         self.assertIsNone(tier.get(key))
Beispiel #14
0
def make_igt_raw_tier(block, options):
    """
    Build the raw ODIN tier (id ``'r'``) from a block's line dictionaries.

    Each entry of ``block['lines']`` becomes an item ``r1``, ``r2``, ...: its
    ``'content'`` value (with invalid XML characters replaced by
    ``options['replacement_char']``) is the item text, and every other key of
    the entry becomes an item attribute.

    :param block: Mapping with an optional ``'lines'`` list of dictionaries.
    :param options: Mapping providing ``'replacement_char'``.
    :return: The raw tier; empty when the block has no lines.
    """
    items = []
    for j, linedata in enumerate(block.get('lines', []), start=1):
        text = replace_invalid_xml_chars(linedata.get('content', ''),
                                         options['replacement_char'])
        attrs = linedata.copy()
        # A line without 'content' is already tolerated when reading the text
        # above, so it must not blow up here either.
        attrs.pop('content', None)
        items.append(Item(id='r{}'.format(j), attributes=attrs, text=text))
    tier = Tier(id='r', type='odin', attributes={'state': 'raw'}, items=items)
    return tier
Beispiel #15
0
def make_igt_raw_tier(block, options):
    """
    Build the raw ODIN tier (id ``'r'``) from ``(attributes, text)`` pairs.

    Each pair in ``block['lines']`` becomes an item ``r1``, ``r2``, ... in
    order. ``options`` is accepted but not used by this function.
    """
    raw_items = []
    for position, (attrs, text) in enumerate(block.get('lines', []), start=1):
        raw_items.append(
            Item(id='r{}'.format(position), attributes=attrs, text=text)
        )

    return Tier(id='r', type='odin', attributes={'state': 'raw'}, items=raw_items)
Beispiel #16
0
def add_normalized_tier(igt, options):
    """
    Append a normalized ODIN tier (id ``'n'``) to ``igt``.

    The tier is derived from the cleaned tier ``'c'`` when the IGT has one,
    otherwise from the raw tier ``'r'``. ``options`` is accepted but not used
    by this function.

    :raises KeyError: If neither a ``'c'`` nor an ``'r'`` tier exists
                      (raised by the ``igt['r']`` lookup).
    """
    # Look the raw tier up only when it is actually needed; passing
    # ``igt['r']`` as the default argument would evaluate it eagerly and
    # fail on an IGT that has a cleaned tier but no raw tier.
    orig_tier = igt.get('c')
    if orig_tier is None:
        orig_tier = igt['r']

    norm_items = normalize_items(orig_tier.items)
    tier = Tier(
        id='n',
        type='odin',
        alignment=orig_tier.id,
        attributes={'state': 'normalized'},
        items=norm_items
    )
    igt.append(tier)
Beispiel #17
0
def make_phrase_tier(tier_id, aln_tokens):
    """
    Build a 'phrases' tier holding a single item with all tokens joined by spaces.

    :param tier_id: Id of the tier; the single item gets the id ``<tier_id>1``.
    :param aln_tokens: Sequence of entries whose element at index 1 is an
                       iterable of token strings.
    """
    tokens = []
    for aln in aln_tokens:
        tokens.extend(aln[1])

    phrase_item = Item(id='{}1'.format(tier_id), text=' '.join(tokens))
    return Tier(id=tier_id, type='phrases', items=[phrase_item])
Beispiel #18
0
def add_cleaned_tier(igt, options):
    """
    Append a cleaned ODIN tier (id ``'c'``) derived from the raw tier ``'r'``.

    ``options`` is accepted but not used by this function.
    """
    source = igt['r']
    igt.append(
        Tier(id='c',
             type='odin',
             alignment=source.id,
             attributes={'state': 'cleaned'},
             items=clean_items(source.items))
    )
Beispiel #19
0
    def test_resolve_ref(self):
        """
        Walk ``Item.resolve_ref`` through each failure mode up to success.

        The first half builds the structure step by step (item -> tier -> igt
        -> referred tier -> referred item -> referred text); every step
        depends on the state left by the previous one, so statement order
        matters. The second half checks fixtures stored on ``self``
        (``i1``, ``i2``, ``i_ac``, ``i_s``, ``i_t``), which are created
        outside this method.
        """
        # item has no reference attribute
        b1 = Item(id='b1')
        self.assertRaises(KeyError, b1.resolve_ref, 'alignment')
        # has a reference attribute, but is not contained by a tier
        b1.alignment = 'a1'
        self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
        # item in tier, but tier has no reference attribute
        t_b = Tier(id='b', items=[b1])
        self.assertRaises(KeyError, b1.resolve_ref, 'alignment')
        # tier has reference attribute, but is not contained by an Igt
        t_b.alignment = 'a'
        self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
        # item in IGT, but referred tier doesn't exist
        igt = Igt(tiers=[t_b])
        self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
        # referred tier exists, but has no item referred by item's alignment
        t_a = Tier(id='a')
        igt.append(t_a)
        self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
        # referred item exists, but has no value (which resolves to '')
        a1 = Item(id='a1')
        t_a.append(a1)
        self.assertEqual(b1.resolve_ref('alignment'), '')
        # referred item has a value
        a1.text = 'text'
        self.assertEqual(b1.resolve_ref('alignment'), 'text')

        # stored item tests
        self.assertRaises(KeyError, self.i1.resolve_ref, 'alignment')

        self.assertRaises(KeyError, self.i2.resolve_ref, 'alignment')

        self.assertEqual(self.i_ac.resolve_ref('alignment'), 'text')
        self.assertEqual(self.i_ac.resolve_ref('content'), 'te')

        self.assertEqual(self.i_s.resolve_ref('segmentation'), 'xt')

        self.assertEqual(self.i_t.resolve_ref('content'), 'text')
Beispiel #20
0
def default_decode_tier(elem):
    """
    Decode a ``<tier>`` XML element into a ``Tier`` and clear the element.

    The element's ``id`` and ``type`` become the tier's id and type, its
    remaining attributes are collected with ``get_attributes``, and its
    ``metadata`` and ``item`` children are decoded with ``decode_metadata``
    and ``decode_item``. The element is cleared once the tier has been built.
    """
    namespace, local_name = _qname_split(elem.tag)
    assert local_name == 'tier'

    tier_id = elem.get('id')
    tier_type = elem.get('type')
    extra_attributes = get_attributes(elem, ignore=('id', 'type'))
    metadata = [decode_metadata(child) for child in elem.findall('metadata')]
    items = [decode_item(child) for child in elem.findall('item')]

    decoded = Tier(id=tier_id,
                   type=tier_type,
                   attributes=extra_attributes,
                   metadata=metadata,
                   items=items,
                   namespace=namespace,
                   nsmap=elem.attrib.nsmap)

    # Clear the element only after everything has been read from it.
    elem.clear()
    return decoded
Beispiel #21
0
 def test_extend(self):
     """extend() appends in order and an empty extension is a no-op."""
     tier = Tier()
     self.assertEqual(len(tier), 0)

     tier.extend([Item(id='t1')])
     self.assertEqual(len(tier), 1)

     tier.extend([])
     self.assertEqual(len(tier), 1)

     tier.extend([Item(id='t2'), Item(id='t3')])
     self.assertEqual(len(tier), 3)
     for position, expected_id in enumerate(('t1', 't2', 't3')):
         self.assertEqual(tier[position].id, expected_id)
Beispiel #22
0
 def test_insert(self):
     """Insert honours the index, accepts an oversized index, and rejects duplicate ids."""
     tier = Tier()
     self.assertEqual(len(tier), 0)

     tier.insert(0, Item(id='t1'))
     self.assertEqual(len(tier), 1)

     with self.assertRaises(XigtError):
         tier.insert(0, Item(id='t1'))

     tier.insert(0, Item(id='t2'))
     # An index far past the end places the item last.
     tier.insert(100, Item(id='t3'))

     self.assertEqual(len(tier), 3)
     for position, expected_id in enumerate(('t2', 't1', 't3')):
         self.assertEqual(tier[position].id, expected_id)
Beispiel #23
0
 def test_extend(self):
     """extend() appends in order and an empty extension is a no-op."""
     tier = Tier()
     assert len(tier) == 0

     tier.extend([Item(id='t1')])
     assert len(tier) == 1

     tier.extend([])
     assert len(tier) == 1

     tier.extend([Item(id='t2'), Item(id='t3')])
     assert len(tier) == 3
     for position, expected_id in enumerate(('t1', 't2', 't3')):
         assert tier[position].id == expected_id
Beispiel #24
0
 def test_get(self):
     """get() accepts ids and indices and falls back to the default when missing."""
     tier = Tier(items=[Item(id='a1', content='1'),
                        Item(id='a2', content='2')])

     # Lookup by id (dictionary style) and by position (list style).
     for key, expected_content in (('a1', '1'), ('a2', '2'), (0, '1'), (1, '2')):
         self.assertEqual(tier.get(key).content, expected_content)

     # Missing id and out-of-range index: None, or the supplied default.
     for missing_key in ('a3', 2):
         self.assertEqual(tier.get(missing_key), None)
         self.assertEqual(tier.get(missing_key, 'z'), 'z')
Beispiel #25
0
def add_normalized_tier(igt, base_tier):
    """
    Append a normalized ODIN tier derived from ``base_tier`` to ``igt``.

    The tier id is the first of 'n', 'on', 'normalized', 'odin-normalized'
    that is not already used in ``igt``. If all four are taken, a warning is
    logged and nothing is added.
    """
    preset_ids = ('n', 'on', 'normalized', 'odin-normalized')
    # First preset id for which the IGT has no tier yet, if any.
    norm_id = next((candidate for candidate in preset_ids
                    if igt.get(candidate) is None), None)

    if norm_id is None:
        logging.warning('No preset ID for normalized tier was available '
                        'for IGT with id: {}'.format(str(igt.id)))
        return

    norm_items = normalize_items(base_tier, norm_id)
    igt.append(Tier(id=norm_id,
                    type='odin',
                    alignment=base_tier.id,
                    attributes={'state': 'normalized'},
                    items=norm_items))
Beispiel #26
0
def add_cleaned_tier(igt, raw_tier):
    """
    Append a cleaned ODIN tier derived from ``raw_tier`` to ``igt``.

    The tier id is the first of 'c', 'oc', 'cleaned', 'odin-cleaned' that is
    not already used in ``igt``. If all four are taken, a warning is logged
    and nothing is added.
    """
    preset_ids = ('c', 'oc', 'cleaned', 'odin-cleaned')
    # First preset id for which the IGT has no tier yet, if any.
    clean_id = next((candidate for candidate in preset_ids
                     if igt.get(candidate) is None), None)

    if clean_id is None:
        logging.warning(
            'No preset ID for cleaned tier was available for IGT with id: {}'.
            format(str(igt.id)))
        return

    cleaned_items = clean_items(raw_tier, clean_id)
    igt.append(Tier(id=clean_id,
                    type='odin',
                    alignment=raw_tier.id,
                    attributes={'state': 'cleaned'},
                    items=cleaned_items))
Beispiel #27
0
def make_tier(tier_type, tier_id, aligned_tokens, algn_tier):
    """
    Build a tier whose items are the source tokens of ``aligned_tokens``.

    :param tier_type: Type of the new tier.
    :param tier_id: Id of the new tier; items are numbered ``<tier_id>1``,
                    ``<tier_id>2``, ...
    :param aligned_tokens: Sequence of ``(target_token, source_tokens)``
                           pairs; ``[(None, None)]`` yields an empty tier.
    :param algn_tier: Optional tier to align to. When given, its items are
                      paired positionally with ``aligned_tokens`` and every
                      new item carries an ``alignment`` attribute.
    """
    attrs = OrderedDict()
    items = []

    def next_item_id():
        # Numbering starts at 1 and follows the number of items created so far.
        return '{}{}'.format(tier_id, len(items) + 1)

    if aligned_tokens != [(None, None)]:
        if algn_tier is not None:
            attrs['alignment'] = algn_tier.id
            for tgt_item, src_data in zip_longest(algn_tier.items, aligned_tokens):
                tgt_tok, src_toks = src_data
                assert tgt_tok == tgt_item.text  # FIXME is this necessary?
                for token in src_toks:
                    items.append(Item(id=next_item_id(),
                                      text=token,
                                      attributes={'alignment': tgt_item.id}))
        else:
            for _tgt, src_toks in aligned_tokens:
                for token in src_toks:
                    items.append(Item(id=next_item_id(), text=token))

    return Tier(id=tier_id, type=tier_type, items=items, attributes=attrs)
Beispiel #28
0
def make_tier(tier_type, tier_id, refattr, aln_tokens, algn_tier):
    """
    Build a tier whose items are the source tokens of ``aln_tokens``.

    :param tier_type: Type of the new tier.
    :param tier_id: Id of the new tier; items are numbered ``<tier_id>1``,
                    ``<tier_id>2``, ...
    :param refattr: Name of the reference attribute linking the new tier and
                    its items to ``algn_tier``; ``None`` disables linking.
    :param aln_tokens: Sequence of ``(target_token, source_tokens)`` pairs;
                       ``[(None, None)]`` yields an empty tier.
    :param algn_tier: Optional tier whose items are paired positionally with
                      ``aln_tokens``.
    """
    attrs = OrderedDict()
    items = []

    def next_item_id():
        # Numbering starts at 1 and follows the number of items created so far.
        return '{}{}'.format(tier_id, len(items) + 1)

    if aln_tokens != [(None, None)]:
        if refattr is not None and algn_tier is not None:
            attrs[refattr] = algn_tier.id
            for tgt_item, src_data in zip_longest(algn_tier.items, aln_tokens):
                tgt_tok, src_toks = src_data
                for token in src_toks:
                    items.append(Item(id=next_item_id(),
                                      text=token,
                                      attributes={refattr: tgt_item.id}))
        else:
            for _tgt, src_toks in aln_tokens:
                for token in src_toks:
                    items.append(Item(id=next_item_id(), text=token))

    return Tier(id=tier_id, type=tier_type, items=items, attributes=attrs)
Beispiel #29
0
def generate_clean_tier(inst, merge=False, generate=True, force_generate=False):
    """
    Return the instance's clean ODIN tier, building it from the raw tier if needed.

    :param inst: The instance to search and, when building, append to.
    :param merge: Not referenced anywhere in this function body.
    :param generate: Build the tier when none is found.
    :param force_generate: Remove any existing clean tier and rebuild it.
    :return: The existing or newly built clean tier, or ``None`` when none
             exists and ``generate`` is false.

    When building, raw lines are grouped by the part of their 'tag' attribute
    before the first '+'. A group with one line is copied as-is; a group with
    several lines is concatenated into one item that takes the primary tag,
    a comma-joined alignment, and the first judgment found among the lines.
    """
    clean_tier = xigt_find(inst, type=ODIN_TIER_TYPE, attributes={STATE_ATTRIBUTE:CLEAN_STATE})

    # When regenerating, the stale tier has to go first.
    if force_generate and clean_tier is not None:
        inst.remove(clean_tier)

    should_build = force_generate or (clean_tier is None and generate)
    if not should_build:
        # Either the existing tier, or None if there is none and we may not build.
        return clean_tier

    raw_tier = get_raw_tier(inst)
    clean_tier = Tier(id=CLEAN_ID,
                      type=ODIN_TIER_TYPE,
                      attributes={STATE_ATTRIBUTE: CLEAN_STATE,
                                  ALIGNMENT: raw_tier.id})

    # Group raw lines by primary tag (L, G, T, ...), keeping first-seen order.
    # Lines with other primary tags are kept as well at this stage.
    line_tags = DefaultOrderedDict(list)
    for raw_line in raw_tier:
        primary = raw_line.attributes['tag'].split('+')[0]
        line_tags[primary].append(raw_line)

    for primary_tag in line_tags.keys():
        lines = line_tags[primary_tag]

        if len(lines) == 1:
            # Single line for this tag: carry it over unchanged.
            only_line = lines[0]
            text = only_line.value()
            new_tag = only_line.attributes[ODIN_TAG_ATTRIBUTE]
            align_id = only_line.id
            item_judgment = only_line.attributes.get(ODIN_JUDGMENT_ATTRIBUTE)

        elif len(lines) > 1:
            # Several lines share the tag: merge them into one.
            TIER_LOG.info('Corruption detected in instance %s: %s' % (inst.id, [l.attributes['tag'] for l in lines]))
            for l in lines:
                TIER_LOG.debug('BEFORE: %s' % l)

            line_values = [l.value() for l in lines]
            text = concat_lines([v for v in line_values if v is not None])
            TIER_LOG.debug('AFTER: %s' % text)
            new_tag = primary_tag
            align_id = ','.join([l.id for l in lines])

            # The first line that carries a judgment decides it.
            judgments = (l.attributes.get(ODIN_JUDGMENT_ATTRIBUTE) for l in lines)
            item_judgment = next((j for j in judgments if j is not None), None)

        item_attributes = {ODIN_TAG_ATTRIBUTE: new_tag}
        # Only record a judgment when there is one.
        if item_judgment is not None:
            item_attributes[ODIN_JUDGMENT_ATTRIBUTE] = item_judgment

        clean_tier.add(Item(id=ask_item_id(clean_tier),
                            alignment=align_id,
                            text=text,
                            attributes=item_attributes))

    inst.append(clean_tier)
    return clean_tier
Beispiel #30
0
class TestTier(unittest.TestCase):
    """Tests for ``Tier`` construction, accessors and list-style mutation."""

    def setUp(self):
        """Create an empty tier (``t1``) and a fully populated tier (``t2``)."""
        self.t1 = Tier()

        meta_block = Metadata(type='meta', metas=[Meta(text='meta')])
        self.t2 = Tier(id='t',
                       type='basic',
                       attributes={'attr': 'val'},
                       metadata=[meta_block],
                       items=[Item(id='t1'), Item(id='t2')])

    def test_init(self):
        """The id '1' is rejected, and so are items sharing an id."""
        with self.assertRaises(ValueError):
            Tier(id='1')

        clashing_items = [Item(id='i1'), Item(id='i1')]
        with self.assertRaises(XigtError):
            Tier(items=clashing_items)

    def test_id(self):
        """An unset id is None; a given id is returned as-is."""
        self.assertIsNone(self.t1.id)
        self.assertEqual(self.t2.id, 't')

    def test_type(self):
        """An unset type is None; a given type is returned as-is."""
        self.assertIsNone(self.t1.type)
        self.assertEqual(self.t2.type, 'basic')

    def test_items(self):
        """Items are exposed as a list and point back to their tier."""
        self.assertEqual(len(self.t1._list), 0)
        self.assertEqual(self.t1.items, [])

        populated = self.t2.items
        self.assertEqual(len(populated), 2)
        for item in populated:
            self.assertIs(item.tier, self.t2)

    def test_parents(self):
        """Tiers that were never added to an Igt have no igt and no corpus."""
        for tier in (self.t1, self.t2):
            self.assertIsNone(tier.igt)
            self.assertIsNone(tier.corpus)

    def test_metadata(self):
        """Metadata defaults to empty and is otherwise kept as given."""
        self.assertEqual(len(self.t1.metadata), 0)

        first_block = self.t2.metadata[0]
        self.assertEqual(first_block.type, 'meta')
        self.assertEqual(len(first_block.metas), 1)
        self.assertEqual(first_block[0].text, 'meta')

    def test_attributes(self):
        """Attributes default to an empty dict and are otherwise kept as given."""
        self.assertEqual(self.t1.attributes, {})
        self.assertEqual(self.t2.attributes, {'attr': 'val'})

    def test_reference_attributes(self):
        """Segmentation excludes alignment/content; unset references are None."""
        for partner in ('alignment', 'content'):
            with self.assertRaises(XigtError):
                Tier(**{partner: 'a1', 'segmentation': 'b1'})

        for tier in (self.t1, self.t2):
            for reference in ('alignment', 'content', 'segmentation'):
                self.assertIsNone(getattr(tier, reference))

    def test_get(self):
        """get() works by index and by id and honours the default."""
        empty, populated = self.t1, self.t2

        self.assertIsNone(empty.get(0))
        self.assertIsNone(empty.get('t'))
        self.assertEqual(empty.get('t', default=1), 1)

        self.assertEqual(populated.get(0).id, 't1')
        self.assertIsNone(populated.get(2))
        self.assertEqual(populated.get('t1').id, 't1')
        # The default is ignored when the key is found.
        found = populated.get('t1', default=Item(id='x'))
        self.assertEqual(found.id, 't1')

    def test_append(self):
        """Only Items may be appended, ids must be unique, and order is preserved."""
        tier = Tier()
        for wrong_kind in (Tier(), Igt(), XigtCorpus(), Metadata(), Meta()):
            with self.assertRaises(XigtStructureError):
                tier.append(wrong_kind)
        self.assertEqual(len(tier), 0)

        tier.append(Item(id='t1'))
        self.assertEqual(len(tier), 1)
        with self.assertRaises(XigtError):
            tier.append(Item(id='t1'))

        tier.append(Item(id='t2'))
        self.assertEqual(len(tier), 2)
        self.assertEqual(tier[0].id, 't1')
        self.assertEqual(tier[1].id, 't2')

    def test_insert(self):
        """Insert honours the index, accepts an oversized index, rejects duplicates."""
        tier = Tier()
        self.assertEqual(len(tier), 0)

        tier.insert(0, Item(id='t1'))
        self.assertEqual(len(tier), 1)
        with self.assertRaises(XigtError):
            tier.insert(0, Item(id='t1'))

        tier.insert(0, Item(id='t2'))
        # An index far past the end places the item last.
        tier.insert(100, Item(id='t3'))
        self.assertEqual(len(tier), 3)
        for position, expected_id in enumerate(('t2', 't1', 't3')):
            self.assertEqual(tier[position].id, expected_id)

    def test_extend(self):
        """extend() appends in order and an empty extension is a no-op."""
        tier = Tier()
        self.assertEqual(len(tier), 0)

        tier.extend([Item(id='t1')])
        self.assertEqual(len(tier), 1)

        tier.extend([])
        self.assertEqual(len(tier), 1)

        tier.extend([Item(id='t2'), Item(id='t3')])
        self.assertEqual(len(tier), 3)
        for position, expected_id in enumerate(('t1', 't2', 't3')):
            self.assertEqual(tier[position].id, expected_id)

    def test_clear(self):
        """After clear() the tier is empty and lookups return None."""
        tier = Tier()
        tier.extend([Item(id=item_id) for item_id in ('t1', 't2', 't3')])
        self.assertEqual(len(tier), 3)

        tier.clear()

        self.assertEqual(len(tier), 0)
        for key in (0, 't1'):
            self.assertIsNone(tier.get(key))

    def test_get_attribute(self):
        """Own attributes resolve directly; the Igt's attribute needs inherit=True."""
        tier = Tier(id='t', attributes={'one': 1, 'two': 2})
        # The tier is placed in an Igt so that 'three' exists on the parent only.
        parent = Igt(tiers=[tier], attributes={'three': 3})

        for name, expected in (('one', 1), ('two', 2)):
            self.assertEqual(tier.get_attribute(name), expected)

        self.assertIsNone(tier.get_attribute('three'))
        self.assertEqual(tier.get_attribute('three', inherit=True), 3)
        self.assertEqual(tier.get_attribute('three', default=4), 4)
Beispiel #31
0
def convert_pml(aln_path, out_path, hindi=True):
    """
    Convert a PML sentence/word alignment file into a XIGT corpus file.

    The alignment file at ``aln_path`` names two documents (``document_a`` and
    ``document_b``), resolved relative to the alignment file's directory.
    Exactly one of the two must carry glosses. For every aligned sentence
    pair whose id is present in the reference IGT data, an ``Igt`` is built
    with a normalized tier (language, gloss and translation lines), phrase
    tiers, word tiers, POS tags, dependency structures and bilingual word
    alignments. The resulting corpus is written to ``out_path``.

    :param aln_path: Path of the PML alignment XML file.
    :param out_path: Path the XIGT-XML corpus is written to (UTF-8).
    :param hindi: If true, reference data comes from ``retrieve_hindi()`` and
                  tokens/POS tags are taken from the loaded sentences;
                  otherwise ``retrieve_naacl()`` is used and tokens come from
                  the reference text.
    :raises AssertionError: If both or neither document is glossed, or a
                            loaded sentence is not a ``Sentence``.
    """

    # Reference IGT data, keyed by sentence id (see the lookup in the loop).
    if hindi:
        igt_data = retrieve_hindi()
    else:
        igt_data = retrieve_naacl()

    a_root = load_xml(aln_path)
    doc_a  = a_root.find(".//reffile[@name='document_a']").get('href')
    doc_b  = a_root.find(".//reffile[@name='document_b']").get('href')



    # The hrefs are resolved relative to the directory of the alignment file.
    doc_a = os.path.join(os.path.join(os.path.dirname(aln_path), doc_a))
    doc_b  = os.path.join(os.path.join(os.path.dirname(aln_path), doc_b))

    # Load the sentences for each document.
    a_sents, a_glossed = load_sents(doc_a)
    b_sents, b_glossed = load_sents(doc_b)



    sent_alignments = a_root.findall(".//body/LM")

    assert (a_glossed and not b_glossed) or (b_glossed and not a_glossed), "Only one file should have glosses"

    xc = XigtCorpus()

    for sent_alignment in sent_alignments:

        # Get the sentence id: everything after the first '-' of the alignment id.
        aln_id = sent_alignment.attrib.get('id')
        a_snt_id = re.search('^.+?-(.*)$', aln_id).group(1)
        if a_snt_id not in igt_data:
            continue

        # Get the text and tokens from the reference IGT data.
        pre_txt, lang_txt, gloss_txt, trans_txt = igt_data[a_snt_id]
        lang_tokens = lang_txt.split()
        gloss_tokens = gloss_txt.split()
        trans_tokens = trans_txt.split()

        # The references have the form '<something>#<sentence ref>'.
        a_snt_ref = sent_alignment.find('./tree_a.rf').text.split('#')[1]
        b_snt_ref = sent_alignment.find('./tree_b.rf').text.split('#')[1]

        word_alignments = sent_alignment.findall('./node_alignments/LM')

        a_snt, a_edges = a_sents[a_snt_ref]
        b_snt, b_edges = b_sents[b_snt_ref]

        assert isinstance(a_snt, Sentence)
        assert isinstance(b_snt, Sentence)
        # -------------------------------------------
        # Skip sentences if they are not found for whatever reason
        # NOTE(review): the lookups and isinstance asserts above run first, so
        # this only skips sentences that are present but falsy -- confirm intent.
        # -------------------------------------------
        if not a_snt or not b_snt:
            continue

        # -------------------------------------------
        # Start constructing the IGT Instance.
        # -------------------------------------------

        # By default document A is the translation side and B the gloss side;
        # swap when A is the glossed document.
        trans_snt, trans_indices = a_snt, a_edges
        gloss_snt, gloss_indices = b_snt, b_edges
        if a_glossed:
            trans_snt, trans_indices = b_snt, b_edges
            gloss_snt, gloss_indices = a_snt, a_edges

        # For Hindi, override the reference text with tokens, POS tags and
        # glosses taken directly from the loaded sentences.
        if hindi:
            lang_tokens = [w.text for w in gloss_snt]
            lang_postags   = [w.pos  for w in gloss_snt]
            lang_txt    = ' '.join(lang_tokens)

            trans_tokens = [w.text for w in trans_snt]
            trans_postags   = [w.pos  for w in trans_snt]
            trans_txt    = ' '.join(trans_tokens)

            # Words without a gloss get the placeholder 'NULL'.
            gloss_tokens  = [w.gloss if w.gloss else 'NULL' for w in gloss_snt]
            gloss_postags = lang_postags
            gloss_txt     = ' '.join(gloss_tokens)



        # The instance id is the sentence reference with 's-' replaced by 'igt'.
        inst = Igt(id=re.sub('s-', 'igt', a_snt_ref))
        # Normalized tier holding the language, gloss and translation lines.
        nt   = Tier(type=ODIN_TIER_TYPE, id=NORM_ID, attributes={STATE_ATTRIBUTE:NORM_STATE})
        ll   = Item(id='n1', attributes={ODIN_TAG_ATTRIBUTE:ODIN_LANG_TAG}, text=lang_txt)
        gl   = Item(id='n2', attributes={ODIN_TAG_ATTRIBUTE:ODIN_GLOSS_TAG}, text=gloss_txt)
        tl   = Item(id='n3', attributes={ODIN_TAG_ATTRIBUTE:ODIN_TRANS_TAG}, text=trans_txt)
        nt.extend([ll,gl,tl])
        inst.append(nt)


        # -------------------------------------------
        # Handle the phrase tiers
        # -------------------------------------------
        generate_lang_phrase_tier(inst)
        generate_trans_phrase_tier(inst)

        # Look up a POS tag per token by 1-based order in the sentence;
        # tokens with no matching word get None.
        def process_postags(sent, tokens):
            postags = []
            for i, token in enumerate(tokens):
                word = sent.getorder(i+1)
                if word is None:
                    postags.append(None)
                else:
                    postags.append(word.pos)
            return postags

        # -------------------------------------------
        # Now, handle the translation words.
        # -------------------------------------------
        tt = create_word_tier(ODIN_TRANS_TAG, trans_tokens, trans_phrase(inst)[0])
        inst.append(tt)

        # In the Hindi case trans_postags was already set above.
        if not hindi:
            trans_postags = process_postags(trans_snt, trans_tokens)

        add_pos_tags(inst, tt.id, trans_postags, tag_method=INTENT_POS_MANUAL)


        # -------------------------------------------
        # Handle the words tiers...
        # -------------------------------------------
        wt = create_word_tier(ODIN_LANG_TAG, lang_tokens, lang_phrase(inst)[0])
        gwt= create_word_tier(ODIN_GLOSS_TAG, gloss_tokens, gl)
        inst.extend([wt, gwt])
        # Quickly set the alignment for the gloss words (pairs words positionally).
        for w, gw in zip(wt, gwt):
            gw.alignment = w.id


        # In the Hindi case lang_postags/gloss_postags were already set above.
        if not hindi:
            lang_postags = process_postags(gloss_snt, gloss_tokens)
            gloss_postags = lang_postags

        add_pos_tags(inst, wt.id, lang_postags, tag_method=INTENT_POS_MANUAL)
        add_pos_tags(inst, gwt.id, gloss_postags, tag_method=INTENT_POS_MANUAL)

        # Dependency structures for the language words and the translation words.
        create_dt_tier(inst, assemble_ds(gloss_snt, gloss_indices), wt, INTENT_DS_MANUAL)
        create_dt_tier(inst, assemble_ds(trans_snt, trans_indices), tt, INTENT_DS_MANUAL)



        # -------------------------------------------
        # Now, the word alignments.
        # -------------------------------------------
        a = Alignment()
        for word_alignment in word_alignments:
            a_ref = word_alignment.find('./a.rf').text.split('#')[1]
            b_ref = word_alignment.find('./b.rf').text.split('#')[1]

            a_word = a_snt.getid(a_ref)
            b_word = b_snt.getid(b_ref)

            # Skip alignments whose words cannot be resolved.
            if a_word is None or b_word is None:
                continue

            # Indices are 1-based: the word's own order, or (for Hindi) its
            # position in the sentence plus one.
            if not hindi:
                a_idx  = a_word.order
                b_idx  = b_word.order
            else:
                a_idx  = a_snt.index(a_word)+1
                b_idx  = b_snt.index(b_word)+1

            # Orient the pair as (translation index, language index),
            # depending on which document is the glossed one.
            if a_glossed:
                trans_idx = b_idx
                lang_idx  = a_idx
            else:
                trans_idx = a_idx
                lang_idx  = b_idx

            a.add((trans_idx, lang_idx))


        # The same alignment is applied to translation<->language and translation<->gloss.
        set_bilingual_alignment(inst, trans(inst), lang(inst), a, INTENT_ALN_MANUAL)
        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, INTENT_ALN_MANUAL)

        xc.append(inst)

    with open(out_path, 'w', encoding='utf-8') as f:
        xigtxml.dump(f, xc)
Beispiel #32
0
def naacl_to_xigt(naacl_path):
    """
    Convert the NAACL format to XIGT.

    Every block of the input matching ``Igt_id ... Q6 ... Answer`` becomes one
    Igt instance holding a raw ODIN tier (language, gloss and translation
    lines), the normalized tiers generated from it without cleaning, and --
    when they can be parsed -- dependency tiers for the language and
    translation lines plus a manual translation/gloss word alignment.

    :param naacl_path: Path to the NAACL-format text file to read.
    :return: The corpus containing one instance per matched block.
    :rtype: XigtCorpus
    """
    # Use a context manager so the handle is closed deterministically; read as
    # UTF-8, the same encoding this module uses when writing XIGT output.
    with open(naacl_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # First, collect all the instances.
    instances = re.findall(r'Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        inst = Igt(id='i{}'.format(len(xc)))

        # Lines 1-3 of the block (after the "Igt_id" line) are the raw
        # language, gloss and translation lines.
        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Now, create the raw tier...
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin', attributes={STATE_ATTRIBUTE:RAW_STATE})
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=lang_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_LANG_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=gloss_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_GLOSS_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=trans_raw, attributes={ODIN_TAG_ATTRIBUTE:ODIN_TRANS_TAG}))

        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Lang Dependency representation handling...
        # The slice drops the leading/trailing lines of the Q6 section
        # (presumably headers/footers -- verify against the data format).
        lang_ds_str = re.search(r'Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]

        # NOTE(review): parse_method is INTENT_POS_MANUAL here, while other
        # create_dt_tier calls in this file pass INTENT_DS_MANUAL -- confirm
        # which constant is intended.
        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError):
            # Best effort: an unparseable dependency structure is skipped.
            pass

        # Eng DS handling...
        eng_ds_str = re.search(r'Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]

        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError, ValueError):
            # Best effort: an unparseable dependency structure is skipped.
            pass

        # Add Alignment...
        biling_aln_str = re.search(r'Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        # A line starting with a space shifts its indices down by one.
        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        # Created outside the try so it is always bound for the call below.
        a = Alignment()
        try:
            for line in biling_aln_lines:
                gloss_s, trans_s = line.split()[0:2]

                # Gloss indices containing a '.' are skipped.
                if '.' in gloss_s:
                    continue

                gloss_i = int(gloss_s)
                # Apply the gloss offset exactly once per line; doing it inside
                # the token loop would shift the index again for every
                # additional translation token.
                if gloss_offset:
                    gloss_i -= 1

                for trans_token in trans_s.split(','):
                    trans_i = int(trans_token)
                    # A translation index of 0 yields no alignment pair.
                    if trans_i == 0:
                        continue
                    if trans_offset:
                        trans_i -= 1
                    a.add((trans_i, gloss_i))
        except ValueError:
            # A malformed line (too few fields or a non-integer index) ends
            # alignment parsing for this instance; pairs gathered so far are kept.
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, aln_method=INTENT_ALN_MANUAL)

    return xc
Beispiel #33
0
 def test_append(self):
     """Check Tier.append: non-Item children and duplicate ids are refused, order is kept."""
     t = Tier()

     # Each of these types must be refused as a child of a tier.
     for bad_child in (Tier, Igt, XigtCorpus, Metadata, Meta):
         with pytest.raises(XigtStructureError):
             t.append(bad_child())
     assert len(t) == 0

     t.append(Item(id='t1'))
     assert len(t) == 1

     # A second item re-using the id 't1' must be refused.
     with pytest.raises(XigtError):
         t.append(Item(id='t1'))

     t.append(Item(id='t2'))
     assert len(t) == 2

     # Items are retrievable by position in insertion order.
     assert t[0].id == 't1'
     assert t[1].id == 't2'