Esempio n. 1
0
def remove_language_name(items, igt):
    """Move leading/trailing language names or codes into M+LN meta items.

    The language code and name are looked up on *igt* (via ``xp.find``
    with ``LANG_CODE_PATH`` / ``LANG_NAME_PATH``). A set of candidate
    tokens is derived from them: the code parts in original, upper and
    lower case; the name; the upper-cased name; an initials abbreviation
    for multiword names; and the first three characters of the name.
    Every item whose primary tag is not ``M`` is checked for one of those
    tokens (optionally bracketed) at the start or end of its text.

    For each hit a new ``Item`` with the same id, the matched text, a copy
    of the item's attributes and the tag ``M+LN`` is inserted right after
    the item in the result, and the matched span is replaced in
    ``item.text`` (in place) using the module-level ``whitespace``
    replacement. If the item was tagged ``LN`` and its text changed, the
    ``LN`` tag is dropped from the item.

    Args:
        items: list of items to inspect; the items themselves may be
            modified, but the list is not.
        igt: the IGT object the language code/name are read from.

    Returns:
        A new list with the original items and any created meta items, or
        *items* itself when no usable language token is available.
    """
    lgcode = xp.find(igt, LANG_CODE_PATH)
    lgname = xp.find(igt, LANG_NAME_PATH)
    lgtoks = []
    if lgcode and '?' not in lgcode and '*' not in lgcode:
        codes = set(lgcode.split(':'))  # split up complex codes
        codes.update(map(str.upper, list(codes)))
        codes.update(map(str.lower, list(codes)))
        lgtoks.extend(codes)
    if lgname and '?' not in lgname:
        lgtoks.append(lgname)
        lgtoks.append(lgname.upper())
        if re.search('[- ]', lgname, re.U):  # abbreviation for multiword names
            # flags must be passed by keyword: the third positional
            # argument of re.split() is maxsplit. Empty parts (leading or
            # trailing separators) have no initial and are skipped.
            parts = re.split(r'[- ]+', lgname, flags=re.U)
            lgtoks.append(''.join(part[0] for part in parts if part))
        if re.search(r'^\w{3}', lgname, re.U):
            lgtoks.append(lgname[:3])

    # an empty alternative would match the empty string on every item
    lgtoks = [tok for tok in lgtoks if tok]
    if not lgtoks:
        return items

    sig = '|'.join(re.escape(t) for t in lgtoks)
    start_lg_re = re.compile(r'^\s*[(\[]?({})[)\]]?'.format(sig), re.U)
    end_lg_re = re.compile(r'[(\[]?({})[)\]]?\s*$'.format(sig), re.U)

    def make_meta_item(item, match):
        # copy the attributes so retagging the meta item leaves item alone
        meta_item = Item(id=item.id,
                         text=match.group(0).strip(),
                         attributes=dict(item.attributes))
        meta_item.attributes['tag'] = 'M+LN'
        return meta_item

    new_items = []
    for item in items:
        new_items.append(item)  # add now; might be modified later
        tags = get_tags(item)
        if tags[0] == 'M':
            continue
        orig = item.text
        m = start_lg_re.match(item.text)
        if m:
            new_items.append(make_meta_item(item, m))
            item.text = start_lg_re.sub(whitespace, item.text)
        m = end_lg_re.search(item.text)
        if m:
            # collect in new_items; appending to `items` would mutate the
            # caller's list while it is being iterated
            new_items.append(make_meta_item(item, m))
            item.text = end_lg_re.sub(whitespace, item.text).rstrip()
        if 'LN' in tags and item.text != orig:
            tags.remove('LN')
            item.attributes['tag'] = '+'.join(tags)
    return new_items
Esempio n. 2
0
def copy_items(items):
    """Build a new list holding a freshly constructed Item per input item.

    Each new Item receives the source item's id, type, alignment, content,
    segmentation, attributes and text. The attribute values are passed
    through as-is; whether ``Item`` copies them is not visible here.
    """
    copies = []
    for source in items:
        duplicate = Item(
            id=source.id,
            type=source.type,
            alignment=source.alignment,
            content=source.content,
            segmentation=source.segmentation,
            attributes=source.attributes,
            text=source.text,
        )
        copies.append(duplicate)
    return copies
Esempio n. 3
0
def copy_items(items):
    """Build a new list holding a freshly constructed Item per input item.

    Each new Item receives the source item's id, type, attributes and
    text. The attributes value is passed through as-is; whether ``Item``
    copies it is not visible here.
    """
    copies = []
    for source in items:
        duplicate = Item(
            id=source.id,
            type=source.type,
            attributes=source.attributes,
            text=source.text,
        )
        copies.append(duplicate)
    return copies
Esempio n. 4
0
def make_igt_raw_tier(block, options):
    """Create the raw 'odin' tier from the lines of *block*.

    ``block['lines']`` (empty when the key is missing) is read as a
    sequence of ``(attributes, text)`` pairs; each pair becomes an Item
    with ids ``r1``, ``r2``, ... in order. *options* is accepted but not
    used by this function.
    """
    raw_items = []
    for number, (line_attrs, line_text) in enumerate(block.get('lines', []),
                                                     start=1):
        raw_items.append(
            Item(id='r{}'.format(number), attributes=line_attrs,
                 text=line_text)
        )
    return Tier(id='r', type='odin', attributes={'state': 'raw'},
                items=raw_items)
Esempio n. 5
0
def make_igt_raw_tier(block, options):
    """Create the raw 'odin' tier from the lines of *block*.

    ``block['lines']`` (empty when the key is missing) is read as a
    sequence of mappings. For each one, the ``'content'`` value (``''``
    when absent) is passed through ``replace_invalid_xml_chars`` together
    with ``options['replacement_char']`` and becomes the item text; all
    remaining keys become the item attributes. Items get the ids ``r1``,
    ``r2``, ... in order.

    Args:
        block: mapping that may contain a ``'lines'`` sequence.
        options: mapping providing ``'replacement_char'``.

    Returns:
        A Tier with id ``'r'``, type ``'odin'`` and state ``'raw'``.
    """
    items = []
    for j, linedata in enumerate(block.get('lines', [])):
        text = replace_invalid_xml_chars(linedata.get('content', ''),
                                         options['replacement_char'])
        attrs = linedata.copy()
        # a line without 'content' is already tolerated above, so it must
        # not fail here either
        attrs.pop('content', None)
        items.append(Item(id='r{}'.format(j + 1), attributes=attrs, text=text))
    tier = Tier(id='r', type='odin', attributes={'state': 'raw'}, items=items)
    return tier
Esempio n. 6
0
def make_phrase_tier(tier_id, aln_tokens):
    """Create a 'phrases' tier containing one item for the whole phrase.

    The second element of every entry in *aln_tokens* is iterated, and all
    tokens found are joined with single spaces to form the text of the
    only item, whose id is *tier_id* followed by ``1``.
    """
    words = []
    for aln in aln_tokens:
        words.extend(aln[1])
    phrase = Item(id='{}1'.format(tier_id), text=' '.join(words))
    return Tier(id=tier_id, type='phrases', items=[phrase])
Esempio n. 7
0
def default_decode_item(elem):
    """Decode an ``item`` XML element into an Item and clear the element.

    The element's tag is split with ``_qname_split`` into namespace and
    local name; the local name must be ``'item'``. The id and type
    attributes, the remaining attributes (via ``get_attributes``), the
    element text, the namespace and ``elem.attrib.nsmap`` are handed to
    ``Item``. The element is cleared once the Item has been built.
    """
    namespace, local_name = _qname_split(elem.tag)
    assert local_name == 'item'
    # gather everything before elem.clear() discards the element's data
    fields = dict(
        id=elem.get('id'),
        type=elem.get('type'),
        attributes=get_attributes(elem, ignore=('id', 'type')),
        text=elem.text,
        namespace=namespace,
        nsmap=elem.attrib.nsmap,
    )
    decoded = Item(**fields)
    elem.clear()
    return decoded
Esempio n. 8
0
def make_tier(tier_type, tier_id, aligned_tokens, algn_tier):
    """Create a tier whose items are the source tokens of *aligned_tokens*.

    *aligned_tokens* is a sequence of ``(target_token, source_tokens)``
    pairs. Item ids are *tier_id* followed by a running number starting
    at 1.

    * If *aligned_tokens* equals ``[(None, None)]`` the tier has no items.
    * If *algn_tier* is given, the tier gets an ``alignment`` attribute
      with that tier's id, the pairs are walked in parallel with
      ``algn_tier.items`` and each created item gets an ``alignment``
      attribute with the id of the corresponding target item. The target
      token is asserted to equal the target item's text.
    * Otherwise items are created without attributes.
    """
    tier_attrs = OrderedDict()
    tier_items = []

    def next_id():
        # ids are numbered from 1 in creation order
        return '{}{}'.format(tier_id, len(tier_items) + 1)

    if aligned_tokens == [(None, None)]:
        # nothing to do
        return Tier(id=tier_id, type=tier_type, items=tier_items,
                    attributes=tier_attrs)

    if algn_tier is None:
        for _, sources in aligned_tokens:
            for source in sources:
                tier_items.append(Item(id=next_id(), text=source))
    else:
        tier_attrs['alignment'] = algn_tier.id
        for target_item, pair in zip_longest(algn_tier.items, aligned_tokens):
            target_token, sources = pair
            assert target_token == target_item.text  # FIXME is this necessary?
            for source in sources:
                tier_items.append(
                    Item(id=next_id(),
                         text=source,
                         attributes={'alignment': target_item.id}))

    return Tier(id=tier_id, type=tier_type, items=tier_items,
                attributes=tier_attrs)
Esempio n. 9
0
def make_tier(tier_type, tier_id, refattr, aln_tokens, algn_tier):
    """Create a tier whose items are the source tokens of *aln_tokens*.

    *aln_tokens* is a sequence of ``(target_token, source_tokens)`` pairs.
    Item ids are *tier_id* followed by a running number starting at 1.

    * If *aln_tokens* equals ``[(None, None)]`` the tier has no items.
    * If both *refattr* and *algn_tier* are given, the tier gets the
      attribute *refattr* set to that tier's id, the pairs are walked in
      parallel with ``algn_tier.items`` and each created item gets the
      attribute *refattr* set to the id of the corresponding target item.
    * Otherwise items are created without attributes.
    """
    tier_attrs = OrderedDict()
    tier_items = []

    def next_id():
        # ids are numbered from 1 in creation order
        return '{}{}'.format(tier_id, len(tier_items) + 1)

    if aln_tokens == [(None, None)]:
        # nothing to do
        return Tier(id=tier_id, type=tier_type, items=tier_items,
                    attributes=tier_attrs)

    has_reference = refattr is not None and algn_tier is not None
    if has_reference:
        tier_attrs[refattr] = algn_tier.id
        for target_item, pair in zip_longest(algn_tier.items, aln_tokens):
            _, sources = pair
            for source in sources:
                tier_items.append(
                    Item(id=next_id(),
                         text=source,
                         attributes={refattr: target_item.id})
                )
    else:
        for _, sources in aln_tokens:
            for source in sources:
                tier_items.append(Item(id=next_id(), text=source))

    return Tier(id=tier_id, type=tier_type, items=tier_items,
                attributes=tier_attrs)
Esempio n. 10
0
def remove_citations(items):
    """Move citations found in L/G/T lines into separate meta items.

    Every item whose primary tag is one of ``L``, ``G``, ``T``, ``L-G``,
    ``L-T`` or ``L-G-T`` is searched with ``citation_re``. When a
    removable match is found, a new ``M`` item (same id, the matched text,
    a copy of the item's attributes) is inserted right after the item in
    the result and the citation is removed from ``item.text`` in place.
    The first of the secondary tags ``AC``, ``LN``, ``CN`` present on the
    item is moved from the item to the meta item.

    Args:
        items: list of items; the items may be modified in place.

    Returns:
        A new list with the original items and any created meta items.
    """
    def removable(m, t, i):
        # citation matches are removable if they don't look like
        # translation alternates or bracketed glosses
        if t in ('L', 'G'):
            start, end = m.span()
            # nearest-first neighbours; slicing items[:i] (instead of
            # items[i - 1::-1]) avoids wrapping around when i == 0
            above = items[:i][::-1]
            below = items[i + 1:]
            if t == 'L':  # look down then up for nearest G
                others = below + above
                t2 = 'G'
            else:  # look up then down for nearest L
                others = above + below
                t2 = 'L'
            other = next((o for o in others if get_tags(o)[0] == t2), None)
            # text in the same columns of the paired line suggests the
            # match is aligned content, not a citation
            if other and (other.text or '')[start:end].strip() != '':
                return False
        elif re.match(r'\s*[{}].*[{}]\s*$'.format(OPENQUOTES, CLOSEQUOTES),
                      m.group('inner1') or m.group('inner2'), re.U):
            return False
        return True

    new_items = []
    for i, item in enumerate(items):
        new_items.append(item)  # add now; text might be modified later
        tags = get_tags(item)
        if tags[0] not in ('L', 'G', 'T', 'L-G', 'L-T', 'L-G-T'):
            continue
        match = citation_re.search(item.text)
        if match and removable(match, tags[0], i):
            # copy the attributes: both items are retagged below and must
            # not share one mapping
            meta_item = Item(id=item.id,
                             text=match.group(0).strip(),
                             attributes=dict(item.attributes))
            m_tags = ['M']
            item.text = citation_re.sub('', item.text).rstrip()
            if 'AC' in tags:
                tags.remove('AC')
                m_tags.append('AC')
            elif 'LN' in tags:
                tags.remove('LN')
                m_tags.append('LN')
            elif 'CN' in tags:
                tags.remove('CN')
                m_tags.append('CN')
            # what about other tags? LN, CN, EX
            item.attributes['tag'] = '+'.join(tags)
            meta_item.attributes['tag'] = '+'.join(m_tags)
            new_items.append(meta_item)
    return new_items
Esempio n. 11
0
def separate_secondary_translations(items):
    """Split T lines holding several quoted translations into one item each.

    If any ``L``, ``G`` or ``L-G`` item carries a ``DB`` secondary tag,
    nothing is attempted and *items* is returned unchanged.

    Otherwise, for every item whose primary tag is ``T`` and which is not
    tagged ``CR``, the text is searched with ``basic_quoted_trans_re``.
    Each match with a non-blank ``t`` group becomes a new item with id
    ``<item id>_<n>``, a copy of the item's attributes, and:

    * tag ``LT`` added when the text before the match mentions "lit...",
    * tag ``AL`` added when that text looks like an alternative marker
      ("or", "also", "ii", "b.", ...) or a plain translation was already
      seen on the line,
    * a ``judgment`` attribute when the match has a ``judg`` group,
    * a ``note`` attribute with the preceding text when it contains a
      word character.

    Text following the last match is appended to the last translation
    when it contains a word character. If the text preceding any match
    is longer than 30 characters or at least twice as long as the
    translation, the split is considered unreliable and the original
    item is kept unsplit.

    Args:
        items: list of items; neither the list nor the items are modified.

    Returns:
        *items* itself when splitting is not attempted, else a new list.
    """
    # sometimes translation lines with secondary translations are marked
    # as +DB even if they are for the same, single IGT
    for item in items:
        tags = get_tags(item)
        if tags[0] in ('L', 'G', 'L-G') and 'DB' in tags[1:]:
            # don't attempt
            return items

    # normalizes the separator between a closing and an opening quote;
    # compiled with the flags here because the 4th positional argument
    # of re.sub() is count, not flags
    quote_sep_re = re.compile(
        r'([{cq}])\s*(\s|/)\s*([{oq}])'.format(oq=OPENQUOTES,
                                               cq=CLOSEQUOTES),
        re.I | re.U)

    new_items = []
    for item in items:
        tags = get_tags(item)
        if tags[0] != 'T' or 'CR' in tags[1:]:
            new_items.append(item)
            continue

        text = quote_sep_re.sub(r'\1 \2 \3', item.text)
        matches = [
            m for m in basic_quoted_trans_re.finditer(text)
            if m.group('t').strip()
        ]
        if not matches:
            new_items.append(item)
            continue

        sub_items = []
        pos = 0
        bare_T_seen = False
        last_i = len(matches) - 1
        for i, match in enumerate(matches):
            start, end = match.span()
            t = match.group('t')
            if i == last_i and re.search(r'\w|\d', text[end:], re.U):
                t += text[end:]
            pre = text[pos:start]
            # some instances have bad matches... try to avoid with
            # a hard limit of 30 chars for the note or note is 2x
            # size of t
            prelen = len(pre.strip())
            if prelen > 30 or prelen >= (2 * len(t.strip())):
                sub_items = []
                new_items.append(item)
                break
            new_tags = list(tags)
            if re.search(r'lit(?:eral(?:ly)?)?', pre):
                if 'LT' not in new_tags:
                    new_tags.append('LT')
            elif (re.search(r'(or|also|ii+|\b[bcd]\.)[ :,]', pre)
                  or bare_T_seen):
                if 'AL' not in new_tags:
                    new_tags.append('AL')
            else:
                bare_T_seen = True
            attrs = dict(item.attributes)
            if match.group('judg'):
                attrs['judgment'] = match.group('judg')
            if re.search(r'\w|\d', pre, re.U):
                attrs['note'] = pre.strip()
            attrs['tag'] = '+'.join(new_tags)
            sub_items.append(
                Item(id=item.id + '_{}'.format(i + 1),
                     attributes=attrs,
                     text=t))
            pos = end
        new_items.extend(sub_items)
    return new_items