def get_merged_begin_inside_tags_of_same_tag_value(
        tags: Optional[List[Optional[str]]]) -> List[Optional[str]]:
    """Merge a run of same-valued tags into one begin tag followed by inside tags.

    Returns an empty list for falsy input; returns the input unchanged when the
    first tag carries no prefix. Otherwise the first tag is kept as-is and every
    following position becomes the inside-prefixed form of the same tag value.
    """
    if not tags:
        return []
    first_prefix, first_value = split_tag_prefix(tags[0])
    if not first_prefix:
        return tags
    inside_tag = add_tag_prefix(first_value, prefix=I_TAG_PREFIX)
    return [tags[0]] + [inside_tag] * (len(tags) - 1)
def to_begin_inside_tags(tag: Optional[str], length: int) -> List[Optional[str]]:
    """Expand a single tag into a begin/inside sequence of the given length.

    A zero (or falsy) length yields an empty list; an unprefixed tag is simply
    repeated. A prefixed tag produces one begin-prefixed tag followed by
    ``length - 1`` inside-prefixed tags of the same value.
    """
    if not length:
        return []
    prefix, tag_value = split_tag_prefix(tag)
    if not prefix:
        return [tag] * length
    begin_tag = add_tag_prefix(tag_value, prefix=B_TAG_PREFIX)
    inside_tag = add_tag_prefix(tag_value, prefix=I_TAG_PREFIX)
    return [begin_tag] + [inside_tag] * (length - 1)
def get_suffix_extended_token_tags(
        token_tags: List[str],
        token_texts: List[str],
        enabled_tags: Set[str],
        token_whitespaces: Optional[List[str]] = None) -> List[Optional[str]]:
    """Extend a preceding tagged region forward over a following untagged group
    that starts with a dot (e.g. a trailing "." suffix after a tagged span).

    Tokens are grouped into runs of equal prefix-stripped tag value. For an
    untagged group whose previous group's tag value is in ``enabled_tags`` and
    whose joined text begins with a literal ``.``, the matching leading tokens
    receive the inside form of the previous group's tag; the remainder of the
    group stays untagged (``None``).

    :param token_tags: one (possibly prefixed) tag per token
    :param token_texts: token text, parallel to ``token_tags``
    :param enabled_tags: tag values for which suffix extension is enabled
    :param token_whitespaces: whitespace following each token; defaults to a
        single space per token
    :return: the adjusted tag list (same length as ``token_tags``)
    """
    result: List[Optional[str]] = []
    if token_whitespaces is None:
        # default: assume single-space separation between tokens
        token_whitespaces = [' '] * len(token_texts)
    # group consecutive (tag, text, whitespace) triples by prefix-stripped tag value
    grouped_token_tags: List[List[Tuple[str, str, Optional[str]]]] = [
        list(group)
        for _, group in groupby(
            zip(token_tags, token_texts, token_whitespaces),
            key=lambda pair: strip_tag_prefix(pair[0])
        )
    ]
    LOGGER.debug('suffix grouped_token_tags=%s', grouped_token_tags)
    for index, group in enumerate(grouped_token_tags):
        LOGGER.debug('suffix group: unpacked=%s', group)
        group_tags: List[str]
        group_texts: List[str]
        group_whitespaces: Optional[List[str]]
        group_tags, group_texts, group_whitespaces = zip(*group)  # type: ignore
        LOGGER.debug(
            'suffix group: tags=%s, texts=%s, whitespace=%s',
            group_tags, group_texts, group_whitespaces
        )
        first_group_tag = group_tags[0]
        prev_group = grouped_token_tags[index - 1] if index > 0 else None
        # first tag of the previous group (None at the start of the sequence)
        first_prev_tag: Optional[str] = get_safe(get_safe(prev_group, 0), 0)
        _, first_prev_tag_value = split_tag_prefix(first_prev_tag)
        # only untagged groups following an enabled tagged group are candidates
        if first_group_tag or first_prev_tag_value not in enabled_tags:
            result.extend(group_tags)
            continue
        joined_text = JoinedText(group_texts, sep=' ', whitespace_list=group_whitespaces)
        # suffix extension only applies when the untagged text starts with "."
        m = re.search(r'^\.', str(joined_text))
        LOGGER.debug('suffix match: %s', m)
        if not m:
            result.extend(group_tags)
            continue
        LOGGER.debug('suffix match end: %s (%r)', m.end(), str(joined_text)[:m.end()])
        # tokens covered by the matched leading span
        matching_tokens = list(joined_text.iter_items_and_index_range_between(
            (0, m.end())
        ))
        LOGGER.debug('suffix matching_tokens: %s', matching_tokens)
        if not matching_tokens:
            result.extend(group_tags)
            continue
        unmatched_token_count = len(group_tags) - len(matching_tokens)
        # matched tokens inherit the previous tag (as inside tags);
        # the rest of the group remains untagged
        result.extend([to_inside_tag(first_prev_tag)] * len(matching_tokens))
        result.extend([None] * unmatched_token_count)
    LOGGER.debug('suffix result: %s', result)
    return result
def _preserve_current_tags(self):
    """Copy each token's current tag (main and sub level) into the preserved
    tag attribute, mapping tag values back via the reversed tag-to-TEI-path
    mapping while keeping any B-/I- prefix.
    """
    reverse_mapping = {
        tei_path: tag
        for tag, tei_path in self._tag_to_tei_path_mapping.items()
    }
    LOGGER.debug(
        'preserving tei tags using rev_tag_to_tei_path_mapping: %s',
        reverse_mapping)
    for line in self._lines:
        for token in line.tokens:
            for level in (None, SUB_LEVEL):
                full_existing_tag = self.get_tag(token, level=level)
                prefix, existing_tag = split_tag_prefix(full_existing_tag)
                # fall back to the existing tag value when it has no mapping
                mapped_tag = add_tag_prefix(
                    reverse_mapping.get(existing_tag, existing_tag),
                    prefix=prefix
                )
                self._set_preserved_tag(token, mapped_tag, level=level)
def _iter_group_tokens_by_tag_entity(
        structured_document: AbstractStructuredDocument,
        tokens: Iterable[T]) -> Iterable[Tuple[Optional[str], List[T]]]:
    """Group consecutive tokens belonging to the same tag entity.

    A new entity starts whenever the prefix-stripped tag value changes or a
    begin-prefixed tag is encountered. Yields ``(tag_value, tokens)`` pairs in
    input order.
    """
    current_value: Optional[str] = None
    current_group: Optional[List[T]] = None
    for token in tokens:
        prefix, value = split_tag_prefix(structured_document.get_tag(token))
        starts_new_entity = value != current_value or prefix == B_TAG_PREFIX
        if current_group and starts_new_entity:
            # flush the finished entity before starting the next one
            yield current_value, current_group
            current_group = None
        if current_group:
            current_group.append(token)
        else:
            current_value = value
            current_group = [token]
    if current_group:
        # flush the trailing entity
        yield current_value, current_group
def _map_tag(tag: str, tag_map: Dict[str, str]) -> str:
    """Translate the value part of a (possibly prefixed) tag via ``tag_map``,
    keeping the prefix and falling back to the original value when unmapped.
    """
    prefix, tag_value = split_tag_prefix(tag)
    if tag_value:
        mapped_value = tag_map.get(tag_value, tag_value)
    else:
        # guard: don't look up an empty/None tag value in the map
        mapped_value = tag_value
    return add_tag_prefix(tag=mapped_value, prefix=prefix)
def get_prefix_extended_token_tags(
        token_tags: List[str],
        token_texts: List[str],
        prefix_regex_by_tag_map: Dict[str, str],
        token_whitespaces: Optional[List[str]] = None,
        enabled_tags: Optional[Set[str]] = None) -> List[Optional[str]]:
    """Extend a following tagged region backwards over a preceding untagged
    group whose trailing text matches the tag's configured prefix regex.

    Tokens are grouped into runs of equal prefix-stripped tag value. For an
    untagged group whose next group's tag value has a prefix regex configured
    (and is enabled), the tokens from the regex match start onwards receive the
    next group's tag (first token) and its inside form (remaining tokens); if
    the next group began with a B- prefix, its first tag is demoted to the
    inside form so the entity is not split.

    :param token_tags: one (possibly prefixed) tag per token
    :param token_texts: token text, parallel to ``token_tags``
    :param prefix_regex_by_tag_map: tag value -> regex matching its prefix text
    :param token_whitespaces: whitespace following each token; defaults to a
        single space per token
    :param enabled_tags: tag values to consider; defaults to all keys of
        ``prefix_regex_by_tag_map``
    :return: the adjusted tag list (same length as ``token_tags``)
    """
    result: List[Optional[str]] = []
    if token_whitespaces is None:
        # default: assume single-space separation between tokens
        token_whitespaces = [' '] * len(token_texts)
    _enabled_tags = (
        enabled_tags
        if enabled_tags is not None
        else prefix_regex_by_tag_map.keys()
    )
    # group consecutive (tag, text, whitespace) triples by prefix-stripped tag value
    grouped_token_tags: List[List[Tuple[Optional[str], str, Optional[str]]]] = [
        list(group)
        for _, group in groupby(
            zip(token_tags, token_texts, token_whitespaces),
            key=lambda pair: strip_tag_prefix(pair[0])
        )
    ]
    LOGGER.debug('grouped_token_tags=%s', grouped_token_tags)
    for index, group in enumerate(grouped_token_tags):
        LOGGER.debug('group: unpacked=%s', group)
        group_tags: List[str]
        group_texts: List[str]
        group_whitespaces: Optional[List[str]]
        group_tags, group_texts, group_whitespaces = zip(*group)  # type: ignore
        LOGGER.debug(
            'group: tags=%s, texts=%s, whitespace=%s',
            group_tags, group_texts, group_whitespaces
        )
        first_group_tag = group_tags[0]
        # peek at the following group (None at the end of the sequence)
        next_group = grouped_token_tags[index + 1] if index + 1 < len(grouped_token_tags) else None
        first_next_tag = get_safe(get_safe(next_group, 0), 0)
        first_next_prefix, first_next_tag_value = split_tag_prefix(first_next_tag)
        # only untagged groups preceding an enabled tagged group are candidates
        if first_group_tag or first_next_tag_value not in _enabled_tags:
            result.extend(group_tags)
            continue
        assert first_next_tag_value is not None
        joined_text = JoinedText(group_texts, sep=' ', whitespace_list=group_whitespaces)
        prefix_regex = prefix_regex_by_tag_map[first_next_tag_value]
        m = re.search(prefix_regex, str(joined_text))
        LOGGER.debug('m: %s', m)
        if not m:
            result.extend(group_tags)
            continue
        LOGGER.debug('start: %s (%r)', m.start(), str(joined_text)[m.start():])
        # tokens covered from the match start to the end of the group's text
        matching_tokens = list(joined_text.iter_items_and_index_range_between(
            (m.start(), len(str(joined_text)))
        ))
        LOGGER.debug('matching_tokens: %s', matching_tokens)
        if not matching_tokens:
            result.extend(group_tags)
            continue
        unmatched_token_count = len(group_tags) - len(matching_tokens)
        # leading tokens stay untagged; matched tokens take on the next tag,
        # with all but the first as inside tags
        result.extend([None] * unmatched_token_count)
        result.extend([first_next_tag])
        result.extend([to_inside_tag(first_next_tag)] * (len(matching_tokens) - 1))
        if first_next_prefix == B_TAG_PREFIX:
            # the extended prefix now begins the entity: demote the next
            # group's leading B- tag to an inside tag
            assert next_group is not None
            next_group[0] = (
                to_inside_tag(first_next_tag),
                *next_group[0][1:]
            )
    LOGGER.debug('result: %s', result)
    return result
def get_extended_line_token_tags(
        line_token_tags: Sequence[Optional[str]],
        extend_to_line_enabled_map: Optional[Dict[str, bool]] = None,
        merge_enabled_map: Optional[Dict[str, bool]] = None,
        default_extend_to_line_enabled: bool = DEFAULT_EXTEND_TO_LINE_ENABLED,
        default_merge_enabled: bool = DEFAULT_MERGE_ENABLED
) -> List[Optional[str]]:
    """Fill untagged gaps within a line by merging or extending neighbouring tags.

    Untagged groups between two groups of the same tag value are merged (when
    merging is enabled for that tag); otherwise an untagged group is absorbed
    into the larger neighbouring group (when extend-to-line is enabled for the
    neighbour's tag), using inside tags from the previous group or
    begin/inside tags from the next group.

    :param line_token_tags: one (possibly prefixed) tag per token of the line
    :param extend_to_line_enabled_map: tag value -> whether extension is enabled
    :param merge_enabled_map: tag value -> whether merging is enabled
    :param default_extend_to_line_enabled: fallback when a tag has no entry
    :param default_merge_enabled: fallback when a tag has no entry
    :return: the adjusted tag list (same length as ``line_token_tags``)
    """
    if extend_to_line_enabled_map is None:
        extend_to_line_enabled_map = {}
    if merge_enabled_map is None:
        merge_enabled_map = {}
    LOGGER.debug(
        'line_token_tags: %s (extend_to_line_enabled_map: %s, merge_enabled_map: %s)',
        line_token_tags, extend_to_line_enabled_map, merge_enabled_map)
    # group consecutive tags by prefix-stripped tag value
    grouped_token_tags: List[List[Optional[str]]] = [
        list(group)
        for _, group in groupby(line_token_tags, key=strip_tag_prefix)
    ]
    # normalise each group's B-/I- prefixes up front, where merging is enabled
    grouped_token_tags = [
        cast(List[Optional[str]], (
            get_merged_begin_inside_tags_of_same_tag_value(group)
            if merge_enabled_map.get(strip_tag_prefix(group[0]), default_merge_enabled)
            else group))
        for group in grouped_token_tags
    ]
    LOGGER.debug('grouped_token_tags: %s', grouped_token_tags)
    result: List[Optional[str]] = []
    for index, group in enumerate(grouped_token_tags):
        prev_group = grouped_token_tags[index - 1] if index > 0 else None
        next_group = grouped_token_tags[
            index + 1] if index + 1 < len(grouped_token_tags) else None
        _, last_prev_tag_value = split_tag_prefix(get_safe(prev_group, -1))
        first_next_prefix, first_next_tag_value = split_tag_prefix(
            get_safe(next_group, 0))
        LOGGER.debug('group: %s', group)
        if group[0]:
            # already tagged: keep as-is
            result.extend(group)
        elif prev_group and next_group:
            # untagged gap between two tagged groups
            if (last_prev_tag_value == first_next_tag_value
                    and get_dict_safe(
                        merge_enabled_map, last_prev_tag_value, default_merge_enabled)):
                # same tag on both sides: bridge the gap with inside tags
                result.extend([to_inside_tag(prev_group[-1])] * len(group))
                if first_next_prefix == B_TAG_PREFIX:
                    # the entity now continues across the gap: demote the
                    # next group's leading B- tag
                    next_group[0] = to_inside_tag(next_group[0])
            else:
                result.extend(group)
        elif (prev_group and not get_dict_safe(
                extend_to_line_enabled_map, last_prev_tag_value,
                default_extend_to_line_enabled)):
            # extension disabled for the preceding tag: keep untagged
            result.extend(group)
        elif (next_group and not get_dict_safe(
                extend_to_line_enabled_map, first_next_tag_value,
                default_extend_to_line_enabled)):
            # extension disabled for the following tag: keep untagged
            result.extend(group)
        elif prev_group and len(prev_group) > len(group):
            # absorb into the larger preceding group
            result.extend([to_inside_tag(prev_group[-1])] * len(group))
        elif next_group and len(next_group) > len(group):
            # absorb into the larger following group
            result.extend(to_begin_inside_tags(next_group[0], len(group)))
            if first_next_prefix == B_TAG_PREFIX:
                # this group now begins the entity: demote the next group's
                # leading B- tag
                next_group[0] = to_inside_tag(next_group[0])
        else:
            result.extend(group)
    LOGGER.debug('result: %s', result)
    return result
def to_inside_tag(tag: Optional[str]) -> Optional[str]:
    """Return the inside-prefixed form of a begin-prefixed tag; any other tag
    (including ``None``) is returned unchanged.
    """
    prefix, tag_value = split_tag_prefix(tag)
    if prefix != B_TAG_PREFIX:
        return tag
    return add_tag_prefix(tag_value, prefix=I_TAG_PREFIX)
def to_begin_tag(tag: str) -> str:
    """Return the begin-prefixed form of an inside-prefixed tag; any other tag
    is returned unchanged.
    """
    prefix, tag_value = split_tag_prefix(tag)
    if prefix != I_TAG_PREFIX:
        return tag
    return add_tag_prefix(tag_value, prefix=B_TAG_PREFIX)
def _lines_to_tei(
        parent: etree.Element,
        lines: List[TeiLine],
        tag_to_tei_path_mapping: Optional[Dict[str, str]] = None):
    """Serialise tagged token lines into TEI XML under ``parent``.

    Each token is written at the element path derived from its (main or sub)
    tag; B- prefixed tags reset the current path so a new element is opened.
    Whitespace-only tokens are buffered and emitted at the path of the next
    non-space token (or at the root after the last line).

    :param parent: the element to append the generated TEI content to
    :param lines: the tagged lines to serialise
    :param tag_to_tei_path_mapping: tag value -> TEI element path; defaults to
        an empty mapping
    :return: ``parent`` (mutated in place)
    """
    if tag_to_tei_path_mapping is None:
        tag_to_tei_path_mapping = {}
    writer = XmlTreeWriter(parent)
    # whitespace-only tokens waiting for the next non-space token's path
    pending_space_tokens = []
    # tag values whose B- prefix appeared on a buffered space token
    pending_reset_tag_values = set()
    for line_index, line in enumerate(lines):
        if line_index:
            # line break element between (not before) lines
            writer.append(E(TeiTagNames.LB))
        for token in line.tokens:
            # fall back to the preserved tag attributes when no current tag is set
            main_full_tag = token.attrib.get(TAG_ATTRIB_NAME)
            if not main_full_tag:
                main_full_tag = token.attrib.get(PRESERVED_TAG_ATTRIB_NAME)
            sub_full_tag = token.attrib.get(SUB_TAG_ATTRIB_NAME)
            if not sub_full_tag:
                sub_full_tag = token.attrib.get(PRESERVED_SUB_TAG_ATTRIB_NAME)
            main_prefix, main_tag = split_tag_prefix(main_full_tag)
            sub_prefix, sub_tag = split_tag_prefix(sub_full_tag)
            if not token.stripped_text:
                # whitespace-only token: remember a B- prefix for later reset
                # and defer output until the next non-space token
                if main_prefix == B_TAG_PREFIX:
                    LOGGER.debug('adding to pending reset tags, for %s', token)
                    pending_reset_tag_values.add(main_tag)
                pending_space_tokens.append(token)
                continue
            main_required_path = _get_tag_required_path(
                main_tag, tag_to_tei_path_mapping)
            sub_required_path = (_get_tag_required_path(
                sub_tag, tag_to_tei_path_mapping) if sub_full_tag else [])
            if sub_full_tag and not _path_starts_with(main_required_path, sub_required_path):
                # a sub tag must nest inside the main tag's path; drop it otherwise
                LOGGER.debug('ignoring sub tag outside main path: %s (%s)',
                             sub_tag, sub_required_path)
                sub_tag = None
                sub_full_tag = None
                sub_required_path = []
            LOGGER.debug(
                'output token: %s (main_required_path: %s, sub_required_path: %s)',
                token, main_required_path, sub_required_path)
            if main_prefix == B_TAG_PREFIX:
                # begin tag: reset to the parent path so a new element is opened
                main_parent_path = main_required_path[:-1]
                LOGGER.debug(
                    'found begin prefix, resetting path to parent %s, for %s',
                    main_parent_path, main_full_tag)
                writer.require_path(main_parent_path, token=token)
            elif main_tag in pending_reset_tag_values:
                # a buffered space token carried the B- prefix for this tag
                LOGGER.debug(
                    'found begin prefix via preceding space, resetting path: %s',
                    main_full_tag)
                writer.require_path([], token=token)
            elif sub_prefix == B_TAG_PREFIX:
                sub_parent_path = sub_required_path[:-1]
                LOGGER.debug(
                    'found begin sub prefix, resetting sub path to parent %s, for %s',
                    sub_parent_path, sub_full_tag)
                writer.require_path_or_below(sub_parent_path, token=token)
            pending_reset_tag_values.clear()
            required_path: List[str] = (sub_required_path if sub_full_tag
                                        else main_required_path)
            if pending_space_tokens:
                # flush buffered whitespace at (or above) the current path
                for pending_space_token in pending_space_tokens:
                    writer.require_path_or_below(required_path, token=pending_space_token)
                    writer.append_text(pending_space_token.text)
                pending_space_tokens = []
            writer.require_path(required_path, token=token)
            writer.append_text(token.text)
    # trailing whitespace tokens are emitted at the root
    for pending_space_token in pending_space_tokens:
        writer.require_path_or_below([], token=pending_space_token)
        writer.append_text(pending_space_token.text)
    pending_space_tokens = []
    return parent