def remove_language_name(items, igt):
    """Move language names/codes at the edges of lines into metadata items.

    The language code and name are read from *igt* with ``xp.find`` using
    ``LANG_CODE_PATH`` and ``LANG_NAME_PATH``.  From these a list of
    candidate tokens is built:

    * each ``:``-separated part of the code, plus its upper- and
      lower-cased forms (skipped if the code contains ``?`` or ``*``);
    * the name, its upper-cased form, an initialism for names containing
      a space or hyphen, and the first three characters when the name
      starts with three word characters (skipped if the name contains
      ``?``).

    Every item whose primary tag is not ``M`` is then checked for one of
    these tokens (optionally wrapped in ``()`` or ``[]``) at the start
    and at the end of its text.  Each hit is copied into a new ``Item``
    tagged ``M+LN`` that is placed right after the item it came from, and
    the matched span of the item's text is replaced via ``whitespace``.
    If the text changed, an ``LN`` tag on the item is dropped.

    Args:
        items: list of items to scan; items are modified in place.
        igt: object the language code/name are looked up on.

    Returns:
        A new list with the original items and any ``M+LN`` items, or
        *items* itself when no usable language token is available.
    """
    lgcode = xp.find(igt, LANG_CODE_PATH)
    lgname = xp.find(igt, LANG_NAME_PATH)
    lgtoks = []
    if lgcode and '?' not in lgcode and '*' not in lgcode:
        codes = set(lgcode.split(':'))  # split up complex codes
        codes.update(map(str.upper, list(codes)))
        codes.update(map(str.lower, list(codes)))
        lgtoks.extend(codes)
    if lgname and '?' not in lgname:
        lgtoks.append(lgname)
        lgtoks.append(lgname.upper())
        if re.search('[- ]', lgname, re.U):
            # abbreviation for multiword names; flags must be passed by
            # keyword (the third positional argument is maxsplit), and
            # empty pieces from leading/trailing separators are skipped
            parts = re.split(r'[- ]+', lgname, flags=re.U)
            lgtoks.append(''.join(part[0] for part in parts if part))
        if re.search(r'^\w{3}', lgname, re.U):
            lgtoks.append(lgname[:3])

    # an empty token would become an empty regex alternative, which
    # matches (and "removes" nothing from) every single line
    lgtoks = [tok for tok in lgtoks if tok]
    if not lgtoks:
        return items

    sig = '|'.join(re.escape(t) for t in lgtoks)
    start_lg_re = re.compile(r'^\s*[(\[]?({})[)\]]?'.format(sig), re.U)
    end_lg_re = re.compile(r'[(\[]?({})[)\]]?\s*$'.format(sig), re.U)

    new_items = []
    for item in items:
        new_items.append(item)  # add now; might be modified later
        tags = get_tags(item)
        if tags[0] == 'M':
            continue
        orig = item.text

        m = start_lg_re.match(item.text)
        if m:
            meta_item = Item(id=item.id,
                             text=m.group(0).strip(),
                             attributes=dict(item.attributes))
            meta_item.attributes['tag'] = 'M+LN'
            new_items.append(meta_item)
            item.text = start_lg_re.sub(whitespace, item.text)

        m = end_lg_re.search(item.text)
        if m:
            meta_item = Item(id=item.id,
                             text=m.group(0).strip(),
                             attributes=dict(item.attributes))
            meta_item.attributes['tag'] = 'M+LN'
            # append to the output list, never to the list being
            # iterated (that would mutate the caller's list and push the
            # metadata item to the very end)
            new_items.append(meta_item)
            item.text = end_lg_re.sub(whitespace, item.text).rstrip()

        if 'LN' in tags and item.text != orig:
            tags.remove('LN')
            item.attributes['tag'] = '+'.join(tags)
    return new_items
def copy_items(items):
    """Return a new list with one freshly constructed ``Item`` per input.

    Each new ``Item`` receives the source item's ``id``, ``type``,
    ``alignment``, ``content``, ``segmentation``, ``attributes`` and
    ``text`` values.  The ``attributes`` object is handed to the ``Item``
    constructor as-is; this function does not copy it itself.
    """
    duplicates = []
    for source in items:
        duplicate = Item(
            id=source.id,
            type=source.type,
            alignment=source.alignment,
            content=source.content,
            segmentation=source.segmentation,
            attributes=source.attributes,
            text=source.text,
        )
        duplicates.append(duplicate)
    return duplicates
def copy_items(items):
    """Return a new list with one freshly constructed ``Item`` per input.

    Each new ``Item`` receives the source item's ``id``, ``type``,
    ``attributes`` and ``text`` values.  The ``attributes`` object is
    handed to the ``Item`` constructor as-is; this function does not copy
    it itself.
    """
    duplicates = []
    for source in items:
        duplicate = Item(
            id=source.id,
            type=source.type,
            attributes=source.attributes,
            text=source.text,
        )
        duplicates.append(duplicate)
    return duplicates
def make_igt_raw_tier(block, options):
    """Build the raw ``odin`` tier for one block.

    ``block['lines']`` (default: no lines) is read as a sequence of
    ``(attributes, text)`` pairs.  Each pair becomes an ``Item`` with id
    ``r1``, ``r2``, ... in order.  The items are wrapped in a ``Tier``
    with id ``r``, type ``odin`` and the attribute ``state="raw"``.

    *options* is accepted but not used by this function.
    """
    raw_items = []
    for number, (line_attrs, line_text) in enumerate(
            block.get('lines', []), start=1):
        raw_items.append(
            Item(id='r{}'.format(number),
                 attributes=line_attrs,
                 text=line_text)
        )
    return Tier(
        id='r',
        type='odin',
        attributes={'state': 'raw'},
        items=raw_items,
    )
def make_igt_raw_tier(block, options):
    """Build the raw ``odin`` tier for one block.

    ``block['lines']`` (default: no lines) is read as a sequence of
    mappings.  For each one, its ``content`` value (default ``''``) is
    passed through ``replace_invalid_xml_chars`` together with
    ``options['replacement_char']`` and becomes the item text; all other
    keys become the item attributes.  Items get the ids ``r1``, ``r2``,
    ... in order and are wrapped in a ``Tier`` with id ``r``, type
    ``odin`` and the attribute ``state="raw"``.

    Args:
        block: mapping that may hold a ``lines`` sequence.
        options: mapping providing ``replacement_char``.

    Returns:
        The newly created ``Tier``.
    """
    items = []
    for number, linedata in enumerate(block.get('lines', []), start=1):
        attrs = linedata.copy()
        # pop() with a default tolerates lines that have no 'content'
        # key, matching the defaulted lookup; a bare `del` would raise
        # KeyError for them
        content = attrs.pop('content', '')
        text = replace_invalid_xml_chars(content,
                                         options['replacement_char'])
        items.append(
            Item(id='r{}'.format(number), attributes=attrs, text=text)
        )
    tier = Tier(id='r', type='odin', attributes={'state': 'raw'},
                items=items)
    return tier
def make_phrase_tier(tier_id, aln_tokens):
    """Build a ``phrases`` tier holding a single item.

    The item's id is *tier_id* followed by ``1``; its text is every token
    found in the second element of each entry of *aln_tokens*, joined by
    single spaces in order.
    """
    phrase_id = '{}1'.format(tier_id)
    words = []
    for aln in aln_tokens:
        words.extend(aln[1])
    phrase = Item(id=phrase_id, text=' '.join(words))
    return Tier(id=tier_id, type='phrases', items=[phrase])
def default_decode_item(elem):
    """Decode an ``item`` element into an ``Item`` and clear the element.

    The element's tag is split with ``_qname_split`` into a namespace and
    a local name, and the local name is asserted to be ``item``.  The
    ``id`` and ``type`` attributes, the remaining attributes (via
    ``get_attributes``), the element text, the namespace and
    ``elem.attrib.nsmap`` are passed to ``Item``.  ``elem.clear()`` is
    called before the item is returned.
    """
    namespace, local_name = _qname_split(elem.tag)
    assert local_name == 'item'
    item_id = elem.get('id')
    item_type = elem.get('type')
    other_attrs = get_attributes(elem, ignore=('id', 'type'))
    decoded = Item(
        id=item_id,
        type=item_type,
        attributes=other_attrs,
        text=elem.text,
        namespace=namespace,
        nsmap=elem.attrib.nsmap,
    )
    # the element is emptied only after everything needed was read
    elem.clear()
    return decoded
def make_tier(tier_type, tier_id, aligned_tokens, algn_tier):
    """Build a tier of items from ``(target, sources)`` token pairs.

    * If *aligned_tokens* equals ``[(None, None)]`` the tier is empty.
    * If *algn_tier* is given, the tier gets an ``alignment`` attribute
      set to ``algn_tier.id``; pairs are walked in step with
      ``algn_tier.items`` and every source token becomes an item whose
      ``alignment`` attribute is the id of the matching target item.
      Each pair's target token is asserted to equal that item's text.
    * Otherwise every source token becomes an item with no attributes.

    Item ids are *tier_id* followed by a running number starting at 1.
    """
    tier_attrs = OrderedDict()
    tier_items = []

    def next_item_id():
        # ids count up from 1 in creation order
        return '{}{}'.format(tier_id, len(tier_items) + 1)

    if aligned_tokens == [(None, None)]:
        # nothing to do
        return Tier(id=tier_id, type=tier_type, items=tier_items,
                    attributes=tier_attrs)

    if algn_tier is not None:
        tier_attrs['alignment'] = algn_tier.id
        for target_item, pair in zip_longest(algn_tier.items,
                                             aligned_tokens):
            target_token, source_tokens = pair
            assert target_token == target_item.text  # FIXME is this necessary?
            for source_token in source_tokens:
                tier_items.append(
                    Item(id=next_item_id(),
                         text=source_token,
                         attributes={'alignment': target_item.id})
                )
    else:
        for _target, source_tokens in aligned_tokens:
            for source_token in source_tokens:
                tier_items.append(
                    Item(id=next_item_id(), text=source_token)
                )

    return Tier(id=tier_id, type=tier_type, items=tier_items,
                attributes=tier_attrs)
def make_tier(tier_type, tier_id, refattr, aln_tokens, algn_tier):
    """Build a tier of items from ``(target, sources)`` token pairs.

    * If *aln_tokens* equals ``[(None, None)]`` the tier is empty.
    * If both *refattr* and *algn_tier* are given, the tier gets the
      attribute *refattr* set to ``algn_tier.id``; pairs are walked in
      step with ``algn_tier.items`` and every source token becomes an
      item whose *refattr* attribute is the id of the matching target
      item.
    * Otherwise every source token becomes an item with no attributes.

    Item ids are *tier_id* followed by a running number starting at 1.
    """
    tier_attrs = OrderedDict()
    tier_items = []

    def next_item_id():
        # ids count up from 1 in creation order
        return '{}{}'.format(tier_id, len(tier_items) + 1)

    if aln_tokens == [(None, None)]:
        # nothing to do
        return Tier(id=tier_id, type=tier_type, items=tier_items,
                    attributes=tier_attrs)

    if refattr is not None and algn_tier is not None:
        tier_attrs[refattr] = algn_tier.id
        for target_item, pair in zip_longest(algn_tier.items, aln_tokens):
            _target_token, source_tokens = pair
            for source_token in source_tokens:
                tier_items.append(
                    Item(id=next_item_id(),
                         text=source_token,
                         attributes={refattr: target_item.id})
                )
    else:
        for _target, source_tokens in aln_tokens:
            for source_token in source_tokens:
                tier_items.append(
                    Item(id=next_item_id(), text=source_token)
                )

    return Tier(id=tier_id, type=tier_type, items=tier_items,
                attributes=tier_attrs)
def remove_citations(items):
    """Move citations found in L/G/T-type lines into metadata items.

    Items whose primary tag is one of ``L``, ``G``, ``T``, ``L-G``,
    ``L-T`` or ``L-G-T`` are searched with ``citation_re``.  When a match
    is judged removable, it is stripped from the item's text and a new
    ``Item`` holding the matched text is inserted right after the item.
    The new item is tagged ``M``, plus the first of ``AC``, ``LN`` or
    ``CN`` present on the source item (that tag is then removed from the
    source item).

    Args:
        items: list of items to scan; items are modified in place.

    Returns:
        A new list with the original items and any metadata items.
    """
    def removable(m, t, i):
        # citation matches are removable if they don't look like
        # translation alternates or bracketed glosses
        if t in ('L', 'G'):
            start, end = m.span()
            below = items[i + 1:]
            # nearest-first; slicing [:i] avoids the wrap-around that
            # items[i - 1::-1] does when i == 0
            above = items[:i][::-1]
            if t == 'L':
                # look down then up for nearest G
                others = below + above
                t2 = 'G'
            else:
                # look up then down for nearest L
                others = above + below
                t2 = 'L'
            other = next(
                (cand for cand in others if get_tags(cand)[0] == t2),
                None
            )
            # if the partner line has text in the same columns, the
            # parenthetical is probably aligned content, not a citation
            if other and (other.text or '')[start:end].strip() != '':
                return False
        elif re.match(r'\s*[{}].*[{}]\s*$'.format(OPENQUOTES, CLOSEQUOTES),
                      m.group('inner1') or m.group('inner2'),
                      re.U):
            # quoted material inside the brackets: not a citation
            return False
        return True

    new_items = []
    for i, item in enumerate(items):
        new_items.append(item)  # add now; text might be modified later
        tags = get_tags(item)
        if tags[0] not in ('L', 'G', 'T', 'L-G', 'L-T', 'L-G-T'):
            continue
        match = citation_re.search(item.text)
        if match and removable(match, tags[0], i):
            # copy the attributes so the two 'tag' values set below are
            # stored independently of each other
            meta_item = Item(id=item.id,
                             text=match.group(0).strip(),
                             attributes=dict(item.attributes))
            m_tags = ['M']
            item.text = citation_re.sub('', item.text).rstrip()
            if 'AC' in tags:
                tags.remove('AC')
                m_tags.append('AC')
            elif 'LN' in tags:
                tags.remove('LN')
                m_tags.append('LN')
            elif 'CN' in tags:
                tags.remove('CN')
                m_tags.append('CN')
            # what about other tags? LN, CN, EX
            item.attributes['tag'] = '+'.join(tags)
            meta_item.attributes['tag'] = '+'.join(m_tags)
            new_items.append(meta_item)
    return new_items
def separate_secondary_translations(items):
    """Split translation items that hold several quoted translations.

    If any item whose primary tag is ``L``, ``G`` or ``L-G`` also carries
    a ``DB`` tag, *items* is returned untouched.  Otherwise each item
    whose primary tag is ``T`` (and that has no ``CR`` tag) is scanned
    with ``basic_quoted_trans_re``.  When usable matches are found the
    item is replaced by one new ``Item`` per match, with id
    ``<item.id>_<n>``, the match's ``t`` group as text, and a copy of the
    original attributes extended with:

    * ``tag`` -- the original tags, plus ``LT`` when the text before the
      match contains "lit", or ``AL`` when that text looks like an
      alternative marker or an unmarked translation was already seen;
    * ``judgment`` -- the match's ``judg`` group, when non-empty;
    * ``note`` -- the stripped text before the match, when it contains a
      word character.

    The item is kept whole when nothing matches, or when the text before
    some match is too long to plausibly be a note.

    Args:
        items: list of items to scan.

    Returns:
        *items* itself (early exit) or a new list of items.
    """
    # sometimes translation lines with secondary translations are marked
    # as +DB even if they are for the same, single IGT
    for item in items:
        tags = get_tags(item)
        if tags[0] in ('L', 'G', 'L-G') and 'DB' in tags[1:]:
            # don't attempt
            return items
    # NOTE(review): the result is not used below; the call is kept in
    # case min_indent validates its input -- confirm and remove.
    indent = min_indent(items, tags=('L', 'G', 'L-G', 'L-G-T', 'G-T'))
    new_items = []
    for item in items:
        tags = get_tags(item)
        text = item.text
        if tags[0] == 'T' and 'CR' not in tags[1:]:
            # normalize the gap between a closing and an opening quote;
            # flags must be passed by keyword because the fourth
            # positional argument of re.sub is `count`
            text = re.sub(
                r'([{cq}])\s*(\s|/)\s*([{oq}])'.format(oq=OPENQUOTES,
                                                       cq=CLOSEQUOTES),
                r'\1 \2 \3',
                text,
                flags=re.I | re.U
            )
            matches = [
                m for m in basic_quoted_trans_re.finditer(text)
                if m.group('t').strip()
            ]
            sub_items = []
            if matches:
                pos = 0
                bare_T_seen = False
                last_i = len(matches) - 1
                for i, match in enumerate(matches):
                    start, end = match.start(), match.end()
                    t = match.group('t')
                    # keep meaningful trailing text with the last match
                    if i == last_i and re.search(r'\w|\d', text[end:],
                                                 re.U):
                        t += text[end:]
                    pre = text[pos:start]
                    # some instances have bad matches... try to avoid with
                    # a hard limit of 30 chars for the note or note is 2x
                    # size of t
                    prelen = len(pre.strip())
                    if prelen > 30 or prelen >= (2 * len(t.strip())):
                        sub_items = []
                        new_items.append(item)
                        break
                    new_tags = list(tags)
                    if re.search(r'lit(?:eral(?:ly)?)?', pre):
                        if 'LT' not in new_tags:
                            new_tags.append('LT')
                    elif (re.search(r'(or|also|ii+|\b[bcd]\.)[ :,]', pre)
                          or bare_T_seen):
                        if 'AL' not in new_tags:
                            new_tags.append('AL')
                    else:
                        bare_T_seen = True
                    attrs = dict(item.attributes)
                    if match.group('judg'):
                        attrs['judgment'] = match.group('judg')
                    if re.search(r'\w|\d', pre, re.U):
                        attrs['note'] = pre.strip()
                    attrs['tag'] = '+'.join(new_tags)
                    sub_items.append(
                        Item(id=item.id + '_{}'.format(i + 1),
                             attributes=attrs,
                             text=t)
                    )
                    pos = end
                # empty if the loop bailed out on a bad match
                new_items.extend(sub_items)
            else:
                new_items.append(item)
        else:
            new_items.append(item)
    return new_items