Beispiel #1
0
def merge_items(*items):
    """Merge *items* into the first one (modified in place).

    Text, line numbers, reference attributes (alignment/content/
    segmentation), and tags are concatenated onto items[0].

    Raises:
        ValueError: if both segmentation and another reference
            attribute are defined across the merged items.
    """
    joined = {
        attr: ','.join(getattr(i, attr) for i in items if getattr(i, attr))
        for attr in ('alignment', 'content', 'segmentation')
    }
    if joined['segmentation'] and (joined['alignment'] or joined['content']):
        raise ValueError(
            'Cannot merge items defining segmentation and another '
            'reference attribute.')

    base = items[0]
    base.text = ' '.join(i.text for i in items)
    base.attributes['line'] = ' '.join(i.attributes['line'] for i in items)
    for attr, value in joined.items():
        if value:
            setattr(base, attr, value)

    primary, secondary = set(), set()
    for i in items:
        itags = get_tags(i)
        if itags[0]:
            primary.add(itags[0])
        secondary.update(itags[1:])
    # normalize the merged primary tag to the conventional L-G order
    tag = '-'.join(sorted(primary)).replace('G-L', 'L-G')
    if secondary:
        tag = '{}+{}'.format(tag, '+'.join(sorted(secondary)))
    base.attributes['tag'] = tag
Beispiel #2
0
def remove_example_numbers(items):
    """Strip IGT-initial example numbers from the items' text.

    Handles numbers like '1.', '(1)', '5a.', '(ii)'.  A number on a
    single-tag line (L, G, or T) is only removed when every relevant
    line has either the same number or whitespace in the same span.
    """
    joint_tags = ('L-G', 'L-T', 'G-T', 'L-G-T')
    checked_tags = ('L', 'G', 'T') + joint_tags

    def removable(m):
        # removable only if the matched span on every L/G/T(-combo)
        # line holds the same number or nothing but whitespace
        start, end = m.span()
        end -= 1  # ignore the required final space
        mtext = m.group('exnum')
        for other in items:
            if get_tags(other)[0] not in checked_tags:
                continue
            span_text = (other.text or '')[start:end]
            if span_text != mtext and span_text.strip() != '':
                return False
        return True

    for item in items:
        primary = get_tags(item)[0]
        if primary in joint_tags:
            item.text = ex_num_re.sub(whitespace, item.text)
        elif primary in ('L', 'G', 'T'):
            m = ex_num_re.match(item.text)
            while m and removable(m):
                item.text = ex_num_re.sub(whitespace, item.text)
                m = ex_num_re.match(item.text)
    return items
Beispiel #3
0
def normalize_items(base_tier, norm_id):
    """Return normalized copies of base_tier's items, re-identified
    with *norm_id* plus a 1-based index."""
    # work on copies so the original tier stays untouched
    items = copy_items(base_tier.items)
    items = remove_blank_items(items)  # don't bother with blank lines
    items = rejoin_continuations(items)
    items = rejoin_translations(items)
    items = remove_citations(items)
    items = remove_language_name(items, base_tier.igt)
    items = remove_example_numbers(items)
    items = remove_blank_items(items)  # in case previous created blanks

    for item in items:
        # point each copy's alignment at its current ID (changed later)
        item.alignment = item.id
        rejoin_hyphenated_grams(item)
        extract_judgment(item)
        tags = get_tags(item)
        # remaining tag=B items are no longer blank, so retag them as M
        if tags[0] == 'B':
            item.attributes['tag'] = '+'.join(['M'] + tags[1:])

    items = separate_secondary_translations(items)
    items = dewrap_lines(items)
    items = unquote_translations(items)
    items = shift_left(items, tags=('L', 'G', 'L-G', 'L-T', 'G-T', 'L-G-T'))

    for number, item in enumerate(items, 1):
        item.id = '{}{}'.format(norm_id, number)

    return items
Beispiel #4
0
def extract_judgment(item):
    """Move a leading grammaticality judgment (e.g. '*', '??', '#')
    from item.text into item.attributes['judgment'] and strip it from
    the text (preserving leading whitespace).

    Meta (M) and corrupted (+CR) lines are left untouched.  The
    judgment attribute is only recorded when the line contains no '/'
    after the marker, but the marker is stripped either way.
    """
    tags = get_tags(item)
    if tags[0] == 'M' or 'CR' in tags:
        return
    match = re.match(r'^\s*([*?#]+)[^/]+$', item.text, re.U)
    if match:
        item.attributes['judgment'] = match.group(1)
    # BUG FIX: re.U was previously passed as re.sub()'s positional
    # *count* argument (count=32) instead of as flags
    item.text = re.sub(r'^(\s*)[*?#]+\s*', r'\1', item.text, flags=re.U)
Beispiel #5
0
def merge_lines(items):
    """
    Return the lines with corrupted and split lines merged.
    Merge corrupted lines if:
      * Both lines have the +CR tag
      * Both lines have one other tag in common
      * The lines are sequential
      * tokens in one line align to whitespace in the other
    TODO:
      * Can we allow some non-whitespace overlap, in which case the
        token would be inserted in the closest match
      * Can we recombine diacritics with the letter it came from?
        E.g. is this usually accents or umlauts on vowels?
      * Is there anything that can be done about intraline corruption?
        E.g. when spaces are inserted w i t h i n words
    """
    n = len(items)
    # nothing to do if there's just 1 line
    if n < 2:
        return items
    newitems = [items[0]]
    for i in range(1, n):
        # lines are pairs of attributes and content
        prev = newitems[-1]
        cur = items[i]
        p_tags = get_tags(prev)
        c_tags = get_tags(cur)
        # only attempt a merge when cur is corrupted and shares a
        # non-CR tag with the previous line
        if 'CR' not in c_tags or \
           len(set(p_tags).intersection(c_tags).difference(['CR'])) == 0:
            newitems.append(cur)
            continue
        merged = bit_merge(prev.text or '', cur.text or '')
        if merged is not None:
            # there's no OrderedSet, but OrderedDict will do
            tags = OrderedDict((t, 1) for t in p_tags + c_tags)
            del tags['CR']  # assume we fixed the problem?
            # NOTE(review): assumes both items carry a 'line' attribute;
            # .get() returning None would make join() raise — confirm
            line_nums = ' '.join(
                [prev.attributes.get('line'),
                 cur.attributes.get('line')])
            prev.attributes['tag'] = '+'.join(tags)
            prev.attributes['line'] = line_nums
            prev.text = merged
        else:
            # BUG FIX: cur was previously dropped from the output when
            # the bit-merge failed; keep it so no line is lost
            newitems.append(cur)
    return newitems
Beispiel #6
0
def merge_lines(items):
    """
    Return the lines with corrupted and split lines merged.
    Merge corrupted lines if:
      * Both lines have the +CR tag
      * Both lines have one other tag in common
      * The lines are sequential
      * tokens in one line align to whitespace in the other
    TODO:
      * Can we allow some non-whitespace overlap, in which case the
        token would be inserted in the closest match
      * Can we recombine diacritics with the letter it came from?
        E.g. is this usually accents or umlauts on vowels?
      * Is there anything that can be done about intraline corruption?
        E.g. when spaces are inserted w i t h i n words
    """
    n = len(items)
    # nothing to do if there's just 1 line
    if n < 2:
        return items
    newitems = [items[0]]
    for i in range(1, n):
        # lines are pairs of attributes and content
        prev = newitems[-1]
        cur = items[i]
        p_tags = get_tags(prev)
        c_tags = get_tags(cur)
        # if cur is not corrupted, or no non-CR tags are shared,
        # keep cur as a separate line
        if 'CR' not in c_tags or \
           len(set(p_tags).intersection(c_tags).difference(['CR'])) == 0:
            newitems.append(cur)
            continue
        merged = bit_merge(prev.text or '', cur.text or '')
        if merged is not None:
            # there's no OrderedSet, but OrderedDict will do
            tags = OrderedDict((t, 1) for t in p_tags + c_tags)
            del tags['CR']  # assume we fixed the problem?
            line_nums = ' '.join([prev.attributes.get('line'),
                                  cur.attributes.get('line')])
            prev.attributes['tag'] = '+'.join(tags)
            prev.attributes['line'] = line_nums
            prev.text = merged
        else:
            # BUG FIX: previously an unmergeable corrupted line was
            # silently discarded; append it instead
            newitems.append(cur)
    return newitems
Beispiel #7
0
def unquote_translations(items):
    """Strip a leading open-quote and trailing close-quote (with any
    surrounding whitespace) from translation (T) lines, in place.

    Returns the same list of items.
    """
    for item in items:
        tags = get_tags(item)
        if tags[0] == 'T':
            # BUG FIX: re.U was previously passed as re.sub()'s
            # positional *count* argument (count=32), not as flags
            item.text = re.sub(r'^\s*[{}]?'.format(OPENQUOTES), '', item.text,
                               flags=re.U)
            item.text = re.sub(r'[{}]\s*$'.format(CLOSEQUOTES), '', item.text,
                               flags=re.U)

    return items
Beispiel #8
0
def rejoin_continuations(items):
    """Fold continuation lines (tag starting with C) into the previous
    item and return the resulting list."""
    result = []
    for item in items:
        tags = get_tags(item)
        if tags[0] == 'C' and result:
            item.text = item.text.lstrip()
            # drop the leading C from the tag before merging
            item.attributes['tag'] = item.attributes['tag'][1:]
            merge_items(result[-1], item)
        else:
            result.append(item)
    return result
Beispiel #9
0
def dewrap_lines(items):
    """Rejoin lines that appear to have been wrapped to fit a width.

    Detects repeated tag patterns like ``L G L G`` (wrapped example
    lines) and ``T T`` (wrapped translations) from the sequence of
    item tags, merges the repeats via merge_items(), and returns a new
    list preserving the relative order of everything left unmerged.
    """
    # look for patterns like L G L G and join them to L G
    # then look for T T and join them to T if they don't look like alternates
    unwrapped = []
    used = set()  # id()s of items already placed in `unwrapped`
    sig = []
    for item in items:
        tags = get_tags(item)
        if tags[0] in ('L', 'G', 'T'):
            sig.append(item.attributes['tag'])
    sig = ' '.join(sig)
    if (any(x in sig for x in ('L G L G ', 'L G T L G T', 'G G ', 'L L '))
            and not any(x in sig for x in ('L+', 'L-', 'G+', 'G-'))):
        # likely patterns for wrapping without other noise
        ls = [item for item in items if item.attributes.get('tag') == 'L']
        gs = [item for item in items if item.attributes.get('tag') == 'G']
        # pad paired L/G lines to equal width so the columns still line
        # up after the texts are concatenated by merge_items()
        for l_, g_ in zip_longest(ls, gs):
            if l_ is not None and g_ is not None:
                maxlen = max([len(l_.text), len(g_.text)])
                l_.text = l_.text.ljust(maxlen)
                g_.text = g_.text.ljust(maxlen)
        if ls:
            merge_items(*ls)
            unwrapped.append(ls[0])
        if gs:
            merge_items(*gs)
            unwrapped.append(gs[0])
        used.update(id(x) for x in ls + gs)
    # add everything unused up to the first translation
    for item in items:
        if item.attributes.get('tag') in ('T', 'T+AC'):
            break
        elif id(item) not in used:
            unwrapped.append(item)
            used.add(id(item))
    # now do translations
    if (any(x in sig for x in ('L G T L G T', 'T T+AC', 'T T+LN', 'T T'))
            and not any(x in sig for x in ('+EX', '+LT', '+AL', 'T+CR'))):
        # translations that appear wrapped and not alternates
        ts = [
            item for item in items
            if item.attributes.get('tag') in ('T', 'T+AC', 'T+LN')
        ]
        if ts:
            merge_items(*ts)
            unwrapped.append(ts[0])
        used.update(id(x) for x in ts)
    # finally add anything unused
    for item in items:
        if id(item) not in used:
            unwrapped.append(item)
            used.add(id(item))
    return unwrapped
Beispiel #10
0
 def removable(m):
     """Return True if example-number match *m* may be stripped.

     NOTE: ``items`` comes from the enclosing scope.  The match is
     removable only if every L/G/T(-combination) line holds either the
     same number text or only whitespace in the matched span.
     """
     start, end = m.span()
     end -= 1  # ignore the required final space
     mtext = m.group('exnum')
     for item in items:
         tags = get_tags(item)
         if tags[0] not in ('L', 'G', 'T', 'L-G', 'G-T', 'L-T', 'L-G-T'):
             continue
         text = (item.text or '')[start:end]
         # a differing, non-blank span means the "number" is real text
         if text != mtext and text.strip() != '':
             return False
     return True
Beispiel #11
0
def remove_language_name(items, igt):
    """Extract language-name/code tokens from line edges into M+LN items.

    Builds candidate tokens from the IGT's language code and name
    (case variants, initialisms for multiword names, and a 3-letter
    abbreviation), then strips a matching token from the start or end
    of each non-meta line, inserting the stripped text as a new item
    tagged M+LN directly after the source line.  Returns the new list.
    """
    new_items = []
    lgcode = xp.find(igt, LANG_CODE_PATH)
    lgname = xp.find(igt, LANG_NAME_PATH)
    lgtoks = []
    if lgcode and '?' not in lgcode and '*' not in lgcode:
        codes = set(lgcode.split(':'))  # split up complex codes
        codes.update(map(str.upper, list(codes)))
        codes.update(map(str.lower, list(codes)))
        lgtoks.extend(codes)
    if lgname and '?' not in lgname:
        lgtoks.append(lgname)
        lgtoks.append(lgname.upper())
        if re.search('[- ]', lgname, re.U):  # abbreviation for multiword names
            # BUG FIX: re.U was previously passed as re.split()'s
            # positional *maxsplit* argument; pass it via flags
            lgtoks.append(''.join(
                ln[0] for ln in re.split(r'[- ]+', lgname, flags=re.U)))
        if re.search(r'^\w{3}', lgname, re.U):
            lgtoks.append(lgname[:3])
    if lgtoks:
        sig = '|'.join(re.escape(t) for t in lgtoks)
        start_lg_re = re.compile(r'^\s*[(\[]?({})[)\]]?'.format(sig), re.U)
        end_lg_re = re.compile(r'[(\[]?({})[)\]]?\s*$'.format(sig), re.U)
        for item in items:
            new_items.append(item)  # add now; might be modified later
            tags = get_tags(item)
            if tags[0] != 'M':
                orig = item.text
                m = start_lg_re.match(item.text)
                if m:
                    meta_item = Item(id=item.id,
                                     text=m.group(0).strip(),
                                     attributes=dict(item.attributes))
                    meta_item.attributes['tag'] = 'M+LN'
                    new_items.append(meta_item)
                    item.text = start_lg_re.sub(whitespace, item.text)
                m = end_lg_re.search(item.text)
                if m:
                    meta_item = Item(id=item.id,
                                     text=m.group(0).strip(),
                                     attributes=dict(item.attributes))
                    meta_item.attributes['tag'] = 'M+LN'
                    # BUG FIX: was items.append(meta_item), which mutated
                    # the list being iterated and omitted the meta item
                    # from the returned list
                    new_items.append(meta_item)
                    item.text = end_lg_re.sub(whitespace, item.text).rstrip()
                # drop a now-redundant LN tag if a token was stripped
                if 'LN' in tags and item.text != orig:
                    tags.remove('LN')
                    item.attributes['tag'] = '+'.join(tags)
    else:
        new_items = items
    return new_items
Beispiel #12
0
def remove_citations(items):
    """Split citation text out of L/G/T lines into M-tagged meta items.

    Each citation found by citation_re is removed from the line and
    emitted as a new meta item placed directly after the source line.
    Returns the new list of items.
    """
    def removable(m, t, i):
        # citation matches are removable if they don't look like
        # translation alternates or bracketed glosses
        if t in ('L', 'G'):
            start, end = m.span()
            if t == 'L':  # look down then up for nearest G
                others = items[i + 1:] + items[i - 1::-1]
                t2 = 'G'
            else:  # look up then down for nearest L
                # NOTE(review): items[i - 1:] starts at the previous item
                # and proceeds *downward*; confirm this matches the
                # intended up-then-down search order
                others = items[i - 1:] + items[i - 1::-1]
                t2 = 'L'
            # (renamed generator variable: previously shadowed outer i)
            other = next((o for o in others if get_tags(o)[0] == t2), None)
            if other and (other.text or '')[start:end].strip() != '':
                return False
        elif re.match(r'\s*[{}].*[{}]\s*$'.format(OPENQUOTES, CLOSEQUOTES),
                      m.group('inner1') or m.group('inner2'), re.U):
            return False
        return True

    new_items = []
    for i, item in enumerate(items):
        new_items.append(item)  # add now; text might be modified later
        tags = get_tags(item)
        if tags[0] not in ('L', 'G', 'T', 'L-G', 'L-T', 'L-G-T'):
            continue
        match = citation_re.search(item.text)
        if match and removable(match, tags[0], i):
            # BUG FIX: copy the attributes dict; previously item and
            # meta_item shared one dict, so writing meta_item's tag
            # below clobbered the tag just written for item
            meta_item = Item(id=item.id,
                             text=match.group(0).strip(),
                             attributes=dict(item.attributes))
            m_tags = ['M']
            item.text = citation_re.sub('', item.text).rstrip()
            if 'AC' in tags:
                tags.remove('AC')
                m_tags.append('AC')
            elif 'LN' in tags:
                tags.remove('LN')
                m_tags.append('LN')
            elif 'CN' in tags:
                tags.remove('CN')
                m_tags.append('CN')
            # what about other tags? LN, CN, EX
            item.attributes['tag'] = '+'.join(tags)
            meta_item.attributes['tag'] = '+'.join(m_tags)
            new_items.append(meta_item)
    return new_items
Beispiel #13
0
def rejoin_hyphenated_grams(item):
    """Rejoin morphemes separated by delimiters with intervening
    spaces by removing the spaces inside each delimited span
    (e.g. "dog-  NOM" => "dog-NOM"); item.text is modified in place.

    Applies only to L, L-G, and G lines; G lines additionally treat
    '.' as a delimiter.
    """
    tags = get_tags(item)
    delims = {'L': '-=', 'L-G': '-=', 'G': '-=.'}
    if tags[0] in delims:
        pattern = r'(\S*(?:\s*[{}]\s*\S*)+)'.format(delims[tags[0]])
        text = item.text
        toks = []
        pos = 0
        # iterate matches lazily; no need to materialize them in a list
        for match in re.finditer(pattern, text, re.U):
            start, end = match.span()
            toks.append(text[pos:start])
            # collapse the spaces within the matched delimited span
            toks.append(text[start:end].replace(' ', ''))
            pos = end
        toks.append(text[pos:])
        item.text = ''.join(toks).rstrip()
Beispiel #14
0
 def removable(m, t, i):
     """Return True if citation match *m* on the item at index *i*
     (primary tag *t*) may be stripped.

     NOTE: ``items`` comes from the enclosing scope.
     """
     # citation matches are removable if they don't look like
     # translation alternates or bracketed glosses
     if t in ('L', 'G'):
         start, end = m.span()
         other = None
         if t == 'L':  # look down then up for nearest G
             others = items[i + 1:] + items[i - 1::-1]
             t2 = 'G'
         else:  # look up then down for nearest L
             # NOTE(review): items[i - 1:] proceeds downward from the
             # previous item — confirm intended search order
             others = items[i - 1:] + items[i - 1::-1]
             t2 = 'L'
         # NOTE(review): the generator variable shadows parameter i
         other = next((i for i in others if get_tags(i)[0] == t2), None)
         if other and (other.text or '')[start:end].strip() != '':
             return False
     elif re.match(r'\s*[{}].*[{}]\s*$'.format(OPENQUOTES, CLOSEQUOTES),
                   m.group('inner1') or m.group('inner2'), re.U):
         return False
     return True
Beispiel #15
0
def rejoin_translations(items):
    """Merge consecutive translation (T) lines that look like one
    wrapped translation rather than separate quoted alternates."""
    # rejoin translation lines if they don't start with some kind of
    # speaker indicator, quote, or other
    merged = []
    last_was_t = False
    last_was_closed = False
    for item in items:
        tags = get_tags(item)
        is_t = (tags[0] == 'T'
                and 'DB' not in tags
                and 'CR' not in tags)
        # a leading "(X):"-style marker signals a new speaker/turn
        has_marker = re.match(r'^\s*[(\[]?\s*\S+\s*\.?\s*[)\]]?\s*:',
                              item.text, re.U) is not None
        if last_was_t and is_t and not has_marker and not last_was_closed:
            item.text = item.text.lstrip()
            merge_items(merged[-1], item)
        else:
            merged.append(item)
            last_was_t = is_t
        last_was_closed = re.search(
            r'[{}] *\)* *$'.format(CLOSEQUOTES), item.text) is not None
    return merged
Beispiel #16
0
def separate_secondary_translations(items):
    """Split translation lines that hold several quoted translations
    (literal or alternate translations) into separate T items.

    Returns a new list; non-translation items are passed through.  If
    any L/G/L-G line carries +DB, the IGT likely contains multiple
    distinct examples, so the items are returned unchanged.
    """
    # sometimes translation lines with secondary translations are marked
    # as +DB even if they are for the same, single IGT
    for item in items:
        tags = get_tags(item)
        if tags[0] in ('L', 'G', 'L-G') and 'DB' in tags[1:]:
            # don't attempt
            return items
    # NOTE(review): indent is currently unused — confirm intent
    indent = min_indent(items, tags=('L', 'G', 'L-G', 'L-G-T', 'G-T'))

    new_items = []
    for item in items:
        tags = get_tags(item)
        text = item.text
        if (tags[0] == 'T' and 'CR' not in tags[1:]):
            # normalize spacing between adjacent close/open-quote pairs
            # BUG FIX: re.I | re.U was previously passed as re.sub()'s
            # positional *count* argument (count=34); pass via flags
            text = re.sub(
                r'([{cq}])\s*(\s|/)\s*([{oq}])'.format(oq=OPENQUOTES,
                                                       cq=CLOSEQUOTES),
                r'\1 \2 \3', text, flags=re.I | re.U)
            matches = [
                m for m in basic_quoted_trans_re.finditer(text)
                if m.group('t').strip()
            ]
            sub_items = []
            if matches:
                pos = 0
                bare_T_seen = False
                last_i = len(matches) - 1
                for i, match in enumerate(matches):
                    start, end = match.start(), match.end()
                    t = match.group('t')
                    # attach trailing non-blank text to the final match
                    if i == last_i and re.search(r'\w|\d', text[end:], re.U):
                        t += text[match.end():]
                    pre = text[pos:match.start()]
                    # some instances have bad matches... try to avoid with
                    # a hard limit of 30 chars for the note or note is 2x
                    # size of t
                    prelen = len(pre.strip())
                    if prelen > 30 or prelen >= (2 * len(t.strip())):
                        sub_items = []
                        new_items.append(item)
                        break
                    new_tags = list(tags)
                    if re.search(r'lit(?:eral(?:ly)?)?', pre):
                        if 'LT' not in new_tags: new_tags.append('LT')
                    elif (re.search(r'(or|also|ii+|\b[bcd]\.)[ :,]', pre)
                          or bare_T_seen):
                        if 'AL' not in new_tags: new_tags.append('AL')
                    else:
                        bare_T_seen = True
                    attrs = dict(item.attributes)
                    if match.group('judg'):
                        attrs['judgment'] = match.group('judg')
                    if re.search(r'\w|\d', pre, re.U):
                        attrs['note'] = pre.strip()
                    attrs['tag'] = '+'.join(new_tags)
                    sub_items.append(
                        Item(id=item.id + '_{}'.format(i + 1),
                             attributes=attrs,
                             text=t))
                    pos = end
                new_items.extend(sub_items)
            else:
                new_items.append(item)
        else:
            new_items.append(item)
    return new_items