Esempio n. 1
0
 def test_basic(self):
     """Check that the constructor arguments are exposed on the Item."""
     i = Item(id='i1', type='basic', attributes={'attr': 'val'},
              content='content')
     self.assertEqual(i.type, 'basic')
     self.assertEqual(i.id, 'i1')
     # no tier, igt, or corpus was supplied, so no parent is expected
     self.assertIsNone(i.tier)
     self.assertIsNone(i.igt)
     self.assertIsNone(i.corpus)
     self.assertEqual(i.attributes, {'attr': 'val'})
     self.assertEqual(i.content, 'content')
     # a sub-span of non-null content is the matching slice of it
     self.assertEqual(i.span(0, 1), 'c')
Esempio n. 2
0
 def test_empty(self):
     """An Item built without arguments exposes empty or None members."""
     blank = Item()
     # empty members
     for member in ('type', 'id', 'tier', 'igt', 'corpus'):
         self.assertEqual(getattr(blank, member), None)
     self.assertEqual(blank.attributes, {})
     self.assertEqual(blank.content, None)
     # sub-spans of null content is also null content
     self.assertEqual(blank.span(0, 1), None)
Esempio n. 3
0
def remove_citations(items):
    """Move citation text from data lines into separate 'M' items.

    Items whose primary tag is L, G, T, L-G, L-T, or L-G-T are searched
    with ``citation_re``. When the first match is judged removable, every
    ``citation_re`` match is deleted from the item's text (which is then
    right-stripped) and a new item carrying the first match's stripped
    text is inserted directly after it. The new item reuses the source
    item's id and is tagged ``M`` plus the first of ``AC``, ``LN``, or
    ``CN`` found on the source item; that tag is removed from the source.

    Args:
        items: list of items; matching items are modified in place.

    Returns:
        A new list with the original items and any new meta items.
    """
    def removable(m, t, i):
        # citation matches are removable if they don't look like
        # translation alternates or bracketed glosses
        if t in ('L', 'G'):
            start, end = m.span()
            # items[i - 1::-1] wraps around to the end of the list when
            # i == 0, so only walk upward if there is a preceding item
            above = items[i - 1::-1] if i > 0 else []
            below = items[i + 1:]
            if t == 'L':  # look down then up for nearest G
                others, t2 = below + above, 'G'
            else:  # look up then down for nearest L
                others, t2 = above + below, 'L'
            other = next((o for o in others if get_tags(o)[0] == t2), None)
            # keep the match if the paired line has text in the same columns
            if (other is not None
                    and (other.text or '')[start:end].strip() != ''):
                return False
        elif re.match(r'\s*[{}].*[{}]\s*$'.format(OPENQUOTES, CLOSEQUOTES),
                      m.group('inner1') or m.group('inner2'), re.U):
            return False
        return True

    new_items = []
    for i, item in enumerate(items):
        new_items.append(item)  # add now; text might be modified later
        tags = get_tags(item)
        if tags[0] not in ('L', 'G', 'T', 'L-G', 'L-T', 'L-G-T'):
            continue
        match = citation_re.search(item.text)
        if not (match and removable(match, tags[0], i)):
            continue
        # copy the attributes so that tagging the meta item below cannot
        # overwrite the source item's own 'tag' through a shared mapping
        meta_item = Item(id=item.id,
                         text=match.group(0).strip(),
                         attributes=dict(item.attributes))
        m_tags = ['M']
        item.text = citation_re.sub('', item.text).rstrip()
        # only the first of these secondary tags moves to the meta item
        # what about other tags? LN, CN, EX
        for secondary in ('AC', 'LN', 'CN'):
            if secondary in tags:
                tags.remove(secondary)
                m_tags.append(secondary)
                break
        item.attributes['tag'] = '+'.join(tags)
        meta_item.attributes['tag'] = '+'.join(m_tags)
        new_items.append(meta_item)
    return new_items
Esempio n. 4
0
def copy_items(items):
    """Return a list with a newly constructed Item for each of *items*.

    Every new Item receives the source item's id, type, attributes, and
    text.
    """
    # NOTE(review): attributes is handed over as-is; whether the new Item
    # ends up with its own mapping depends on Item's constructor
    duplicates = []
    for source in items:
        duplicate = Item(
            id=source.id,
            type=source.type,
            attributes=source.attributes,
            text=source.text,
        )
        duplicates.append(duplicate)
    return duplicates
Esempio n. 5
0
def copy_items(items):
    """Return a list with a newly constructed Item for each of *items*.

    Every new Item receives the source item's id, type, alignment,
    content, segmentation, attributes, and text.
    """
    def clone(original):
        # rebuild the item from the fields read off the original
        return Item(
            id=original.id,
            type=original.type,
            alignment=original.alignment,
            content=original.content,
            segmentation=original.segmentation,
            attributes=original.attributes,
            text=original.text,
        )

    return [clone(entry) for entry in items]
Esempio n. 6
0
def make_igt_raw_tier(block, options):
    """Build the raw 'odin' tier from the lines of *block*.

    Args:
        block: mapping whose 'lines' entry (empty when absent) is
            iterated as (attributes, text) pairs.
        options: accepted but not used by this function.

    Returns:
        A Tier with id 'r', type 'odin', and state 'raw' whose items are
        numbered r1, r2, ... in line order.
    """
    raw_items = []
    for number, (attrs, text) in enumerate(block.get('lines', []), start=1):
        raw_items.append(
            Item(id='r{}'.format(number), attributes=attrs, text=text)
        )
    return Tier(id='r', type='odin', attributes={'state': 'raw'},
                items=raw_items)
Esempio n. 7
0
def make_igt_raw_tier(block, options):
    """Build the raw 'odin' tier from the lines of *block*.

    Args:
        block: mapping whose 'lines' entry (empty when absent) is a
            sequence of dict-like line records. The 'content' entry of a
            record becomes the item text; all other entries become the
            item's attributes.
        options: mapping; 'replacement_char' is passed to
            replace_invalid_xml_chars() along with each line's content.

    Returns:
        A Tier with id 'r', type 'odin', and state 'raw' whose items are
        numbered r1, r2, ... in line order.
    """
    items = []
    for j, linedata in enumerate(block.get('lines', []), start=1):
        # work on a copy so the caller's line record is left untouched
        attrs = linedata.copy()
        # pop with a default: a record without 'content' yields empty text
        # instead of raising KeyError
        content = attrs.pop('content', '')
        text = replace_invalid_xml_chars(content,
                                         options['replacement_char'])
        items.append(Item(id='r{}'.format(j), attributes=attrs, text=text))
    tier = Tier(id='r', type='odin', attributes={'state': 'raw'}, items=items)
    return tier
Esempio n. 8
0
def default_decode_item(elem):
    """Decode an element whose local tag name is 'item' into an Item.

    The id and type are read with elem.get(), the remaining attributes
    with get_attributes(), and the text, namespace, and namespace map are
    taken from the element. The element is cleared before returning.
    """
    namespace, local_name = _qname_split(elem.tag)
    assert local_name == 'item'
    fields = dict(
        id=elem.get('id'),
        type=elem.get('type'),
        attributes=get_attributes(elem, ignore=('id', 'type')),
        text=elem.text,
        namespace=namespace,
        nsmap=elem.attrib.nsmap,
    )
    decoded = Item(**fields)
    # everything needed has been read, so the element can be emptied
    elem.clear()
    return decoded
Esempio n. 9
0
def make_phrase_tier(tier_id, aln_tokens):
    """Build a 'phrases' tier with one item holding all tokens joined.

    Args:
        tier_id: id of the tier; the single item gets this id followed
            by '1'.
        aln_tokens: iterable of sequences whose element at index 1 is an
            iterable of token strings.

    Returns:
        A Tier of type 'phrases' whose only item's text is every token,
        in order, joined by single spaces.
    """
    tokens = []
    for aln in aln_tokens:
        tokens.extend(aln[1])
    phrase = Item(id='{}1'.format(tier_id), text=' '.join(tokens))
    return Tier(id=tier_id, type='phrases', items=[phrase])
Esempio n. 10
0
    def setUp(self):
        """Build the items, tiers, igt, and corpus used as fixtures."""
        # empty
        self.i1 = Item()

        # basic info
        self.i2 = Item(id='i2', type='basic', attributes={'attr': 'val'},
                       text='text')

        # alignment and content refs
        self.i_ac = Item(id='i_ac', alignment='i2', content='i2[0:2]')

        # segmentation ref
        self.i_s = Item(id='i_s', segmentation='i2[2:4]')

        # override content ref with text
        self.i_t = Item(id='i_t', content='i2', text='something else')

        # contextual structure
        self.t_a = Tier(id='t_a', items=[self.i2])
        self.t_b = Tier(
            id='t_b',
            items=[self.i_ac, self.i_t],
            alignment='t_a',
            content='t_a',
        )
        self.t_c = Tier(
            id='t_c',
            items=[self.i_s],
            segmentation='t_a',
        )
        self.igt = Igt(tiers=[self.t_a, self.t_b, self.t_c])
        self.xc = XigtCorpus(igts=[self.igt])
Esempio n. 11
0
def make_tier(tier_type, tier_id, aligned_tokens, algn_tier):
    """Build a tier with one item per source token in *aligned_tokens*.

    Args:
        tier_type: type given to the new tier.
        tier_id: id of the new tier; item ids are this id followed by a
            running number starting at 1.
        aligned_tokens: list of (target token, source tokens) pairs; the
            value [(None, None)] produces a tier without items.
        algn_tier: tier whose items are paired with *aligned_tokens*, or
            None. When given, the new tier and each item get an
            'alignment' attribute referring to it and its items.

    Returns:
        The new Tier.
    """
    attrs = OrderedDict()
    items = []

    def add_item(text, **extra):
        # ids count up from 1 in the order in which items are created
        items.append(Item(id='{}{}'.format(tier_id, len(items) + 1),
                          text=text, **extra))

    if aligned_tokens != [(None, None)]:
        if algn_tier is None:
            for _, src_toks in aligned_tokens:
                for tok in src_toks:
                    add_item(tok)
        else:
            attrs['alignment'] = algn_tier.id
            pairs = zip_longest(algn_tier.items, aligned_tokens)
            for tgt_item, src_data in pairs:
                tgt_tok, src_toks = src_data
                assert tgt_tok == tgt_item.text  # FIXME is this necessary?
                for tok in src_toks:
                    add_item(tok, attributes={'alignment': tgt_item.id})
    return Tier(id=tier_id, type=tier_type, items=items, attributes=attrs)
Esempio n. 12
0
def make_tier(tier_type, tier_id, refattr, aln_tokens, algn_tier):
    """Build a tier with one item per source token in *aln_tokens*.

    Args:
        tier_type: type given to the new tier.
        tier_id: id of the new tier; item ids are this id followed by a
            running number starting at 1.
        refattr: name of the reference attribute to set, or None.
        aln_tokens: list of (target token, source tokens) pairs; the
            value [(None, None)] produces a tier without items.
        algn_tier: tier whose items are paired with *aln_tokens*, or
            None. When both it and *refattr* are given, the new tier and
            each item get a *refattr* attribute referring to it and its
            items.

    Returns:
        The new Tier.
    """
    attrs = OrderedDict()

    def token_specs():
        # yields (text, extra Item keyword arguments) in creation order
        if aln_tokens == [(None, None)]:
            return  # nothing to do
        if refattr is not None and algn_tier is not None:
            attrs[refattr] = algn_tier.id
            for tgt_item, src_data in zip_longest(algn_tier.items,
                                                  aln_tokens):
                _, src_toks = src_data
                for tok in src_toks:
                    yield tok, {'attributes': {refattr: tgt_item.id}}
        else:
            for _, src_toks in aln_tokens:
                for tok in src_toks:
                    yield tok, {}

    # start indices at 1
    items = [
        Item(id='{}{}'.format(tier_id, number), text=tok, **extra)
        for number, (tok, extra) in enumerate(token_specs(), start=1)
    ]
    return Tier(id=tier_id, type=tier_type, items=items, attributes=attrs)
Esempio n. 13
0
    def test_get_attribute(self):
        """Check get_attribute() defaults, local values, and inheritance."""
        i = Item(id='i1')
        # a missing attribute gives None, or the supplied default
        assert i.get_attribute('attr') is None
        assert i.get_attribute('attr', 1) == 1
        # a value that is set wins over the default
        i.attributes['attr'] = 'val'
        assert i.get_attribute('attr', 1) == 'val'
        # the item is not in a tier yet, so nothing can be inherited
        assert i.get_attribute('abc', inherit=True) is None
        # the tier is built only to give the item a parent carrying 'abc';
        # the name `t` is not used again
        t = Tier(id='t', items=[i], attributes={'abc': 'def'})
        assert i.get_attribute('abc', inherit=True) == 'def'

        assert self.i1.get_attribute('attr') is None
        assert self.i1.get_attribute('attr', 1) == 1

        assert self.i2.get_attribute('attr') == 'val'
        assert self.i2.get_attribute('attr', 1) == 'val'

        assert self.i_ac.get_attribute('alignment') == 'i2'
Esempio n. 14
0
    def test_get_attribute(self):
        """Exercise get_attribute() defaults, local values, inheritance."""
        item = Item(id='i1')

        # absent attribute: None unless a default is supplied
        self.assertEqual(item.get_attribute('attr'), None)
        self.assertEqual(item.get_attribute('attr', 1), 1)

        # present attribute: the stored value beats the default
        item.attributes['attr'] = 'val'
        self.assertEqual(item.get_attribute('attr', 1), 'val')

        # inheritance: nothing before the item sits in a tier, the tier's
        # value afterwards
        self.assertEqual(item.get_attribute('abc', inherit=True), None)
        tier = Tier(id='t', items=[item], attributes={'abc': 'def'})
        self.assertEqual(item.get_attribute('abc', inherit=True), 'def')

        # the same checks against the stored fixture items
        self.assertEqual(self.i1.get_attribute('attr'), None)
        self.assertEqual(self.i1.get_attribute('attr', 1), 1)
        self.assertEqual(self.i2.get_attribute('attr'), 'val')
        self.assertEqual(self.i2.get_attribute('attr', 1), 'val')
        self.assertEqual(self.i_ac.get_attribute('alignment'), 'i2')
Esempio n. 15
0
def remove_language_name(items, igt):
    """Split language codes/names off the edges of non-meta lines.

    The language code and name are looked up on *igt* with
    LANG_CODE_PATH and LANG_NAME_PATH. Candidate tokens are built from
    them: the parts of the code split on ':' (plus upper- and lower-case
    variants), the name, its upper-case form, an initials abbreviation
    for names containing a space or hyphen, and the first three
    characters when the name starts with three word characters.

    For every item whose primary tag is not 'M', a candidate token
    (optionally wrapped in parentheses or brackets) at the very start or
    very end of the text is replaced using ``whitespace``, and a new item
    tagged 'M+LN' holding the removed text is inserted right after the
    item. If the text changed, an 'LN' tag on the item is dropped.

    Args:
        items: list of items; matching items are modified in place.
        igt: object queried for the language code and name.

    Returns:
        A new list with the original items and any new meta items. When
        no usable language token is found, *items* itself is returned.
    """
    lgcode = xp.find(igt, LANG_CODE_PATH)
    lgname = xp.find(igt, LANG_NAME_PATH)
    lgtoks = []
    if lgcode and '?' not in lgcode and '*' not in lgcode:
        codes = set(lgcode.split(':'))  # split up complex codes
        codes.update(map(str.upper, list(codes)))
        codes.update(map(str.lower, list(codes)))
        lgtoks.extend(codes)
    if lgname and '?' not in lgname:
        lgtoks.append(lgname)
        lgtoks.append(lgname.upper())
        if re.search('[- ]', lgname, re.U):  # abbreviation for multiword names
            # flags must go by keyword: the third positional argument of
            # re.split() is maxsplit
            parts = re.split(r'[- ]+', lgname, flags=re.U)
            # a leading or trailing separator leaves an empty piece
            abbrev = ''.join(part[0] for part in parts if part)
            if abbrev:  # an empty alternative would match everywhere
                lgtoks.append(abbrev)
        if re.search(r'^\w{3}', lgname, re.U):
            lgtoks.append(lgname[:3])
    if not lgtoks:
        return items

    sig = '|'.join(re.escape(t) for t in lgtoks)
    start_lg_re = re.compile(r'^\s*[(\[]?({})[)\]]?'.format(sig), re.U)
    end_lg_re = re.compile(r'[(\[]?({})[)\]]?\s*$'.format(sig), re.U)

    def language_meta(item, m):
        # the meta item shares the id and a copy of the attributes
        meta_item = Item(id=item.id,
                         text=m.group(0).strip(),
                         attributes=dict(item.attributes))
        meta_item.attributes['tag'] = 'M+LN'
        return meta_item

    new_items = []
    for item in items:
        new_items.append(item)  # add now; might be modified later
        tags = get_tags(item)
        if tags[0] == 'M':
            continue
        orig = item.text
        m = start_lg_re.match(item.text)
        if m:
            new_items.append(language_meta(item, m))
            item.text = start_lg_re.sub(whitespace, item.text)
        m = end_lg_re.search(item.text)
        if m:
            # append to the output list, not to the list being iterated
            new_items.append(language_meta(item, m))
            item.text = end_lg_re.sub(whitespace, item.text).rstrip()
        if 'LN' in tags and item.text != orig:
            tags.remove('LN')
            item.attributes['tag'] = '+'.join(tags)
    return new_items
Esempio n. 16
0
    def test_resolve_ref(self):
        """Walk resolve_ref() through each failure mode up to success.

        The first part builds the structure one step at a time, so every
        assertion depends on the state left by the steps before it.
        """
        # item has no reference attribute
        b1 = Item(id='b1')
        self.assertRaises(KeyError, b1.resolve_ref, 'alignment')
        # has a reference attribute, but is not contained by a tier
        b1.alignment = 'a1'
        self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
        # item in tier, but tier has no reference attribute
        t_b = Tier(id='b', items=[b1])
        self.assertRaises(KeyError, b1.resolve_ref, 'alignment')
        # tier has reference attribute, but is not contained by an Igt
        t_b.alignment = 'a'
        self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
        # item in IGT, but referred tier doesn't exist
        igt = Igt(tiers=[t_b])
        self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
        # referred tier exists, but has no item referred by item's alignment
        t_a = Tier(id='a')
        igt.append(t_a)
        self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
        # referred item exists, but has no value (which resolves to '')
        a1 = Item(id='a1')
        t_a.append(a1)
        self.assertEqual(b1.resolve_ref('alignment'), '')
        # referred item has a value
        a1.text = 'text'
        self.assertEqual(b1.resolve_ref('alignment'), 'text')

        # stored item tests
        # NOTE(review): self.i1, self.i2, self.i_ac, self.i_s, and self.i_t
        # are presumably created in setUp, which is outside this excerpt
        self.assertRaises(KeyError, self.i1.resolve_ref, 'alignment')

        self.assertRaises(KeyError, self.i2.resolve_ref, 'alignment')

        self.assertEqual(self.i_ac.resolve_ref('alignment'), 'text')
        self.assertEqual(self.i_ac.resolve_ref('content'), 'te')

        self.assertEqual(self.i_s.resolve_ref('segmentation'), 'xt')

        self.assertEqual(self.i_t.resolve_ref('content'), 'text')
Esempio n. 17
0
class TestItem(unittest.TestCase):
    """Tests for Item: identity, parents, attributes, and references."""

    def setUp(self):
        """Build the items, tiers, igt, and corpus shared by the tests."""
        # empty
        self.i1 = Item()

        # basic info
        self.i2 = Item(
            id='i2',
            type='basic',
            attributes={'attr': 'val'},
            text='text'
        )

        # alignment and content refs
        self.i_ac = Item(
            id='i_ac',
            alignment='i2',
            content='i2[0:2]'
        )

        # segmentation ref
        self.i_s = Item(
            id='i_s',
            segmentation='i2[2:4]'
        )

        # override content ref with text
        self.i_t = Item(
            id='i_t',
            content='i2',
            text='something else'
        )

        # contextual structure
        self.t_a = Tier(id='t_a', items=[self.i2])
        self.t_b = Tier(id='t_b', items=[self.i_ac, self.i_t],
                        alignment='t_a', content='t_a')
        self.t_c = Tier(id='t_c', items=[self.i_s], segmentation='t_a')
        self.igt = Igt(tiers=[self.t_a, self.t_b, self.t_c])
        self.xc = XigtCorpus(igts=[self.igt])

    def test_init(self):
        """An id that is not valid is rejected by the constructor."""
        self.assertRaises(ValueError, Item, id='1')  # invalid id

    def test_id(self):
        """The id is None when omitted and the given value otherwise."""
        self.assertIs(self.i1.id, None)

        self.assertEqual(self.i2.id, 'i2')

        self.assertEqual(self.i_ac.id, 'i_ac')
        self.assertEqual(self.i_s.id, 'i_s')
        self.assertEqual(self.i_t.id, 'i_t')

    def test_type(self):
        """The type is None when omitted and the given value otherwise."""
        self.assertIs(self.i1.type, None)

        self.assertEqual(self.i2.type, 'basic')

        self.assertIs(self.i_ac.type, None)
        self.assertIs(self.i_s.type, None)
        self.assertIs(self.i_t.type, None)

    def test_parents(self):
        """tier/igt/corpus refer to the containers built in setUp."""
        self.assertIs(self.i1.tier, None)
        self.assertIs(self.i1.igt, None)
        self.assertIs(self.i1.corpus, None)

        # parent links are checked by identity for every fixture item
        self.assertIs(self.i2.tier, self.t_a)
        self.assertIs(self.i2.igt, self.igt)
        self.assertIs(self.i2.corpus, self.xc)

        self.assertIs(self.i_ac.tier, self.t_b)
        self.assertIs(self.i_ac.igt, self.igt)
        self.assertIs(self.i_ac.corpus, self.xc)

        self.assertIs(self.i_s.tier, self.t_c)
        self.assertIs(self.i_s.igt, self.igt)
        self.assertIs(self.i_s.corpus, self.xc)

        self.assertIs(self.i_t.tier, self.t_b)
        self.assertIs(self.i_t.igt, self.igt)
        self.assertIs(self.i_t.corpus, self.xc)

    def test_attributes(self):
        """Reference keywords given to Item show up in its attributes."""
        self.assertEqual(self.i1.attributes, dict())

        self.assertEqual(self.i2.attributes, {'attr': 'val'})

        self.assertEqual(self.i_ac.attributes,
                         {'alignment': 'i2', 'content': 'i2[0:2]'})
        self.assertEqual(self.i_s.attributes, {'segmentation': 'i2[2:4]'})
        self.assertEqual(self.i_t.attributes, {'content': 'i2'})

    def test_reference_attributes(self):
        """alignment/content/segmentation read back what was given."""
        # segmentation cannot co-occur with alignment or content
        self.assertRaises(XigtError, Item, alignment='a1', segmentation='b1')
        self.assertRaises(XigtError, Item, content='a1', segmentation='b1')

        self.assertIs(self.i1.alignment, None)
        self.assertIs(self.i1.content, None)
        self.assertIs(self.i1.segmentation, None)

        self.assertIs(self.i2.alignment, None)
        self.assertIs(self.i2.content, None)
        self.assertIs(self.i2.segmentation, None)

        self.assertEqual(self.i_ac.alignment, 'i2')
        self.assertEqual(self.i_ac.content, 'i2[0:2]')
        self.assertIs(self.i_ac.segmentation, None)

        self.assertIs(self.i_s.alignment, None)
        self.assertIs(self.i_s.content, None)
        self.assertEqual(self.i_s.segmentation, 'i2[2:4]')

        self.assertIs(self.i_t.alignment, None)
        self.assertEqual(self.i_t.content, 'i2')
        self.assertIs(self.i_t.segmentation, None)

    def test_text(self):
        """text is only what was given directly, never a resolved ref."""
        self.assertIs(self.i1.text, None)

        self.assertEqual(self.i2.text, 'text')

        self.assertIs(self.i_ac.text, None)
        self.assertIs(self.i_s.text, None)
        self.assertEqual(self.i_t.text, 'something else')

    def test_value(self):
        """value() gives the text if set, else the resolved reference."""
        self.assertIs(self.i1.value(), None)

        self.assertEqual(self.i2.value(), 'text')

        self.assertEqual(self.i_ac.value(), 'te')
        self.assertEqual(self.i_s.value(), 'xt')
        self.assertEqual(self.i_t.value(), 'something else')

    def test_resolve_ref(self):
        """Walk resolve_ref() through each failure mode up to success.

        The first part builds the structure one step at a time, so every
        assertion depends on the state left by the steps before it.
        """
        # item has no reference attribute
        b1 = Item(id='b1')
        self.assertRaises(KeyError, b1.resolve_ref, 'alignment')
        # has a reference attribute, but is not contained by a tier
        b1.alignment = 'a1'
        self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
        # item in tier, but tier has no reference attribute
        t_b = Tier(id='b', items=[b1])
        self.assertRaises(KeyError, b1.resolve_ref, 'alignment')
        # tier has reference attribute, but is not contained by an Igt
        t_b.alignment = 'a'
        self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
        # item in IGT, but referred tier doesn't exist
        igt = Igt(tiers=[t_b])
        self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
        # referred tier exists, but has no item referred by item's alignment
        t_a = Tier(id='a')
        igt.append(t_a)
        self.assertRaises(XigtStructureError, b1.resolve_ref, 'alignment')
        # referred item exists, but has no value (which resolves to '')
        a1 = Item(id='a1')
        t_a.append(a1)
        self.assertEqual(b1.resolve_ref('alignment'), '')
        # referred item has a value
        a1.text = 'text'
        self.assertEqual(b1.resolve_ref('alignment'), 'text')

        # stored item tests
        self.assertRaises(KeyError, self.i1.resolve_ref, 'alignment')

        self.assertRaises(KeyError, self.i2.resolve_ref, 'alignment')

        self.assertEqual(self.i_ac.resolve_ref('alignment'), 'text')
        self.assertEqual(self.i_ac.resolve_ref('content'), 'te')

        self.assertEqual(self.i_s.resolve_ref('segmentation'), 'xt')

        # resolve_ref follows the reference even though i_t has its own text
        self.assertEqual(self.i_t.resolve_ref('content'), 'text')

    def test_span(self):
        """span() slices the item's value."""
        # sub-spans of null content is also null content
        self.assertIs(self.i1.span(0, 1), None)

        self.assertEqual(self.i2.span(0, 1), 't')

        self.assertEqual(self.i_ac.span(1, 2), 'e')
        self.assertEqual(self.i_s.span(1, 2), 't')
        self.assertEqual(self.i_t.span(1, 2), 'o')

    def test_get_attribute(self):
        """Check get_attribute() defaults, local values, and inheritance."""
        i = Item(id='i1')
        self.assertIs(i.get_attribute('attr'), None)
        self.assertEqual(i.get_attribute('attr', 1), 1)
        i.attributes['attr'] = 'val'
        self.assertEqual(i.get_attribute('attr', 1), 'val')
        # the item is not in a tier yet, so nothing can be inherited
        self.assertIs(i.get_attribute('abc', inherit=True), None)
        # the tier is built only to give the item a parent carrying 'abc';
        # the name `t` is not used again
        t = Tier(id='t', items=[i], attributes={'abc': 'def'})
        self.assertEqual(i.get_attribute('abc', inherit=True), 'def')

        self.assertIs(self.i1.get_attribute('attr'), None)
        self.assertEqual(self.i1.get_attribute('attr', 1), 1)

        self.assertEqual(self.i2.get_attribute('attr'), 'val')
        self.assertEqual(self.i2.get_attribute('attr', 1), 'val')

        self.assertEqual(self.i_ac.get_attribute('alignment'), 'i2')
Esempio n. 18
0
def separate_secondary_translations(items):
    """Split 'T' lines holding several quoted translations into items.

    If any item tagged L, G, or L-G also carries a DB tag, *items* is
    returned untouched. Otherwise every item whose primary tag is T (and
    that has no CR tag) is scanned with ``basic_quoted_trans_re``. Each
    match with a non-blank 't' group becomes a new item:

    * its id is the source id plus '_<n>' (n counting matches from 1);
    * its text is the 't' group; the last match also takes the text
      after it when that text contains a word character;
    * 'judgment' is set from the 'judg' group when present;
    * 'note' is set from the text between the previous match and this
      one when it contains a word character;
    * an LT or AL tag is added depending on that preceding text.

    The source item is kept as-is when nothing matches, or when a
    preceding text is implausibly long for a note.

    Args:
        items: sequence of items; they are not modified.

    Returns:
        *items* itself in the DB case, otherwise a new list.
    """
    # sometimes translation lines with secondary translations are marked
    # as +DB even if they are for the same, single IGT
    for item in items:
        tags = get_tags(item)
        if tags[0] in ('L', 'G', 'L-G') and 'DB' in tags[1:]:
            # don't attempt
            return items
    # NOTE(review): the result is not used below; the call is kept in
    # case callers rely on min_indent() being run on these items
    indent = min_indent(items, tags=('L', 'G', 'L-G', 'L-G-T', 'G-T'))

    # normalizes the gap between a closing quote and the next opening
    # quote; compiled once since it does not depend on the item
    quote_gap_re = re.compile(
        r'([{cq}])\s*(\s|/)\s*([{oq}])'.format(oq=OPENQUOTES,
                                               cq=CLOSEQUOTES),
        flags=re.I | re.U)

    new_items = []
    for item in items:
        tags = get_tags(item)
        text = item.text
        if tags[0] != 'T' or 'CR' in tags[1:]:
            new_items.append(item)
            continue
        # flags are given at compile time: the fourth positional argument
        # of re.sub() is count, not flags
        text = quote_gap_re.sub(r'\1 \2 \3', text)
        matches = [
            m for m in basic_quoted_trans_re.finditer(text)
            if m.group('t').strip()
        ]
        if not matches:
            new_items.append(item)
            continue
        sub_items = []
        pos = 0
        bare_T_seen = False
        last_i = len(matches) - 1
        for i, match in enumerate(matches):
            start, end = match.start(), match.end()
            t = match.group('t')
            if i == last_i and re.search(r'\w|\d', text[end:], re.U):
                t += text[end:]
            pre = text[pos:start]
            # some instances have bad matches... try to avoid with
            # a hard limit of 30 chars for the note or note is 2x
            # size of t
            prelen = len(pre.strip())
            if prelen > 30 or prelen >= (2 * len(t.strip())):
                # give up on splitting and keep the original item
                sub_items = []
                new_items.append(item)
                break
            new_tags = list(tags)
            if re.search(r'lit(?:eral(?:ly)?)?', pre):
                if 'LT' not in new_tags:
                    new_tags.append('LT')
            elif (re.search(r'(or|also|ii+|\b[bcd]\.)[ :,]', pre)
                  or bare_T_seen):
                if 'AL' not in new_tags:
                    new_tags.append('AL')
            else:
                bare_T_seen = True
            attrs = dict(item.attributes)
            if match.group('judg'):
                attrs['judgment'] = match.group('judg')
            if re.search(r'\w|\d', pre, re.U):
                attrs['note'] = pre.strip()
            attrs['tag'] = '+'.join(new_tags)
            sub_items.append(
                Item(id=item.id + '_{}'.format(i + 1),
                     attributes=attrs,
                     text=t))
            pos = end
        new_items.extend(sub_items)
    return new_items