# Imports assumed by this excerpt; helpers such as split_tag_prefix,
# add_tag_prefix, strip_tag_prefix, get_safe, get_dict_safe, JoinedText,
# XmlTreeWriter, TeiTagNames, E and the *_TAG_PREFIX / *_ATTRIB_NAME
# constants are defined elsewhere in the same module and not repeated here.
import logging
import re
from itertools import groupby
from typing import (
    Dict, Iterable, List, Optional, Sequence, Set, Tuple, TypeVar, cast
)

from lxml import etree

LOGGER = logging.getLogger(__name__)

T = TypeVar('T')


def get_merged_begin_inside_tags_of_same_tag_value(
        tags: Optional[List[Optional[str]]]) -> List[Optional[str]]:
    if not tags:
        return []
    prefix, tag_value = split_tag_prefix(tags[0])
    if not prefix:
        return tags
    return (tags[:1] + [add_tag_prefix(tag_value, prefix=I_TAG_PREFIX)] *
            (len(tags) - 1))
def to_begin_inside_tags(tag: Optional[str],
                         length: int) -> List[Optional[str]]:
    if not length:
        return []
    prefix, tag_value = split_tag_prefix(tag)
    if not prefix:
        return [tag] * length
    return ([add_tag_prefix(tag_value, prefix=B_TAG_PREFIX)] +
            [add_tag_prefix(tag_value, prefix=I_TAG_PREFIX)] * (length - 1))
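

# Usage sketch (not part of the module): assuming the conventional
# B-/I- prefixes, i.e. B_TAG_PREFIX = 'B-' and I_TAG_PREFIX = 'I-',
# and the split_tag_prefix/add_tag_prefix helpers defined alongside.
assert to_begin_inside_tags('B-title', 3) == ['B-title', 'I-title', 'I-title']
# An unprefixed tag is simply repeated:
assert to_begin_inside_tags('title', 2) == ['title', 'title']
# Merging keeps the first begin tag and demotes the rest to inside tags:
assert get_merged_begin_inside_tags_of_same_tag_value(
    ['B-title', 'B-title', 'I-title']
) == ['B-title', 'I-title', 'I-title']
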
def get_suffix_extended_token_tags(
        token_tags: List[str],
        token_texts: List[str],
        enabled_tags: Set[str],
        token_whitespaces: Optional[List[str]] = None) -> List[Optional[str]]:
    result: List[Optional[str]] = []
    if token_whitespaces is None:
        token_whitespaces = [' '] * len(token_texts)
    grouped_token_tags: List[List[Tuple[str, str, Optional[str]]]] = [
        list(group)
        for _, group in groupby(
            zip(token_tags, token_texts, token_whitespaces),
            key=lambda pair: strip_tag_prefix(pair[0])
        )
    ]
    LOGGER.debug('suffix grouped_token_tags=%s', grouped_token_tags)
    for index, group in enumerate(grouped_token_tags):
        LOGGER.debug('suffix group: unpacked=%s', group)
        group_tags: List[str]
        group_texts: List[str]
        group_whitespaces: Optional[List[str]]
        group_tags, group_texts, group_whitespaces = zip(*group)  # type: ignore
        LOGGER.debug(
            'suffix group: tags=%s, texts=%s, whitespace=%s',
            group_tags, group_texts, group_whitespaces
        )
        first_group_tag = group_tags[0]

        prev_group = grouped_token_tags[index - 1] if index > 0 else None
        first_prev_tag: Optional[str] = get_safe(get_safe(prev_group, 0), 0)
        _, first_prev_tag_value = split_tag_prefix(first_prev_tag)

        if first_group_tag or first_prev_tag_value not in enabled_tags:
            result.extend(group_tags)
            continue
        joined_text = JoinedText(group_texts, sep=' ', whitespace_list=group_whitespaces)
        m = re.search(r'^\.', str(joined_text))
        LOGGER.debug('suffix match: %s', m)
        if not m:
            result.extend(group_tags)
            continue
        LOGGER.debug('suffix match end: %s (%r)', m.end(), str(joined_text)[:m.end()])
        matching_tokens = list(joined_text.iter_items_and_index_range_between(
            (0, m.end())
        ))
        LOGGER.debug('suffix matching_tokens: %s', matching_tokens)
        if not matching_tokens:
            result.extend(group_tags)
            continue
        unmatched_token_count = len(group_tags) - len(matching_tokens)
        result.extend([to_inside_tag(first_prev_tag)] * len(matching_tokens))
        result.extend([None] * unmatched_token_count)
    LOGGER.debug('suffix result: %s', result)
    return result
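
# Usage sketch (not part of the module): a trailing period left untagged
# by the tagger is pulled into the preceding enabled tag as an inside tag.
# The token texts and the B-/I- prefix convention are assumptions.
token_tags = ['B-abstract', 'I-abstract', None, None]
token_texts = ['An', 'abstract', '.', 'Next']
extended = get_suffix_extended_token_tags(
    token_tags, token_texts, enabled_tags={'abstract'}
)
# Expected: ['B-abstract', 'I-abstract', 'I-abstract', None]
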
Example #4
def _preserve_current_tags(self):
    rev_tag_to_tei_path_mapping = {
        v: k
        for k, v in self._tag_to_tei_path_mapping.items()
    }
    LOGGER.debug(
        'preserving tei tags using rev_tag_to_tei_path_mapping: %s',
        rev_tag_to_tei_path_mapping)
    for line in self._lines:
        for token in line.tokens:
            for level in (None, SUB_LEVEL):
                full_existing_tag = self.get_tag(token, level=level)
                prefix, existing_tag = split_tag_prefix(full_existing_tag)
                mapped_tag = add_tag_prefix(
                    rev_tag_to_tei_path_mapping.get(
                        existing_tag, existing_tag),
                    prefix=prefix)
                self._set_preserved_tag(token, mapped_tag, level=level)
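
# Illustration (hypothetical mapping): the reverse lookup built above
# translates existing TEI paths back to the semantic tag names.
tag_to_tei_path_mapping = {'title': 'front/docTitle/titlePart'}
rev_tag_to_tei_path_mapping = {
    v: k for k, v in tag_to_tei_path_mapping.items()
}
assert rev_tag_to_tei_path_mapping == {'front/docTitle/titlePart': 'title'}
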
def _iter_group_tokens_by_tag_entity(
        structured_document: AbstractStructuredDocument,
        tokens: Iterable[T]) -> Iterable[Tuple[Optional[str], List[T]]]:
    pending_tag_value = None
    pending_tokens = None
    for token in tokens:
        current_full_tag = structured_document.get_tag(token)
        current_tag_prefix, current_tag_value = split_tag_prefix(current_full_tag)
        if (
            pending_tokens
            and (
                pending_tag_value != current_tag_value
                or current_tag_prefix == B_TAG_PREFIX
            )
        ):
            yield pending_tag_value, pending_tokens
            pending_tokens = None
        if not pending_tokens:
            pending_tag_value = current_tag_value
            pending_tokens = [token]
            continue
        pending_tokens.append(token)
    if pending_tokens:
        yield pending_tag_value, pending_tokens
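
# Usage sketch (not part of the module): a minimal stub stands in for
# AbstractStructuredDocument, since only get_tag is exercised here.
# split_tag_prefix is assumed to return (None, None) for a missing tag.
class _StubDocument:
    def __init__(self, tag_by_token):
        self._tag_by_token = tag_by_token

    def get_tag(self, token):
        return self._tag_by_token[token]

doc = _StubDocument({
    't1': 'B-author', 't2': 'I-author', 't3': 'B-author', 't4': None
})
groups = list(_iter_group_tokens_by_tag_entity(doc, ['t1', 't2', 't3', 't4']))
# A begin prefix starts a new entity even within the same tag value:
# [('author', ['t1', 't2']), ('author', ['t3']), (None, ['t4'])]
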
def _map_tag(tag: str, tag_map: Dict[str, str]) -> str:
    prefix, tag_value = split_tag_prefix(tag)
    return add_tag_prefix(
        tag=tag_map.get(tag_value, tag_value) if tag_value else tag_value,
        prefix=prefix
    )
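
# Usage sketch (hypothetical tag map), assuming split_tag_prefix and
# add_tag_prefix round-trip the B-/I- prefixes:
assert _map_tag('B-author', {'author': 'contrib'}) == 'B-contrib'
assert _map_tag('figure', {'author': 'contrib'}) == 'figure'
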
def get_prefix_extended_token_tags(
        token_tags: List[str],
        token_texts: List[str],
        prefix_regex_by_tag_map: Dict[str, str],
        token_whitespaces: Optional[List[str]] = None,
        enabled_tags: Optional[Set[str]] = None) -> List[Optional[str]]:
    result: List[Optional[str]] = []
    if token_whitespaces is None:
        token_whitespaces = [' '] * len(token_texts)
    _enabled_tags = (
        enabled_tags
        if enabled_tags is not None
        else prefix_regex_by_tag_map.keys()
    )
    grouped_token_tags: List[List[Tuple[Optional[str], str, Optional[str]]]] = [
        list(group)
        for _, group in groupby(
            zip(token_tags, token_texts, token_whitespaces),
            key=lambda pair: strip_tag_prefix(pair[0])
        )
    ]
    LOGGER.debug('grouped_token_tags=%s', grouped_token_tags)
    for index, group in enumerate(grouped_token_tags):
        LOGGER.debug('group: unpacked=%s', group)
        group_tags: List[str]
        group_texts: List[str]
        group_whitespaces: Optional[List[str]]
        group_tags, group_texts, group_whitespaces = zip(*group)  # type: ignore
        LOGGER.debug(
            'group: tags=%s, texts=%s, whitespace=%s',
            group_tags, group_texts, group_whitespaces
        )
        first_group_tag = group_tags[0]
        next_group = grouped_token_tags[index + 1] if index + 1 < len(grouped_token_tags) else None
        first_next_tag = get_safe(get_safe(next_group, 0), 0)
        first_next_prefix, first_next_tag_value = split_tag_prefix(first_next_tag)
        if first_group_tag or first_next_tag_value not in _enabled_tags:
            result.extend(group_tags)
            continue
        assert first_next_tag_value is not None
        joined_text = JoinedText(group_texts, sep=' ', whitespace_list=group_whitespaces)
        prefix_regex = prefix_regex_by_tag_map[first_next_tag_value]
        m = re.search(prefix_regex, str(joined_text))
        LOGGER.debug('m: %s', m)
        if not m:
            result.extend(group_tags)
            continue
        LOGGER.debug('start: %s (%r)', m.start(), str(joined_text)[m.start():])
        matching_tokens = list(joined_text.iter_items_and_index_range_between(
            (m.start(), len(str(joined_text)))
        ))
        LOGGER.debug('matching_tokens: %s', matching_tokens)
        if not matching_tokens:
            result.extend(group_tags)
            continue
        unmatched_token_count = len(group_tags) - len(matching_tokens)
        result.extend([None] * unmatched_token_count)
        result.extend([first_next_tag])
        result.extend([to_inside_tag(first_next_tag)] * (len(matching_tokens) - 1))
        if first_next_prefix == B_TAG_PREFIX:
            assert next_group is not None
            next_group[0] = (
                to_inside_tag(first_next_tag),
                *next_group[0][1:]
            )
    LOGGER.debug('result: %s', result)
    return result
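
# Usage sketch (not part of the module): an untagged label such as '1.'
# immediately before a tagged reference is matched by the hypothetical
# prefix regex and becomes the new beginning of that reference.
token_tags = [None, 'B-reference', 'I-reference']
token_texts = ['1.', 'Smith', '2001']
extended = get_prefix_extended_token_tags(
    token_tags, token_texts,
    prefix_regex_by_tag_map={'reference': r'\d+\.$'}
)
# Expected: ['B-reference', 'I-reference', 'I-reference']
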
def get_extended_line_token_tags(
    line_token_tags: Sequence[Optional[str]],
    extend_to_line_enabled_map: Optional[Dict[str, bool]] = None,
    merge_enabled_map: Optional[Dict[str, bool]] = None,
    default_extend_to_line_enabled: bool = DEFAULT_EXTEND_TO_LINE_ENABLED,
    default_merge_enabled: bool = DEFAULT_MERGE_ENABLED
) -> List[Optional[str]]:
    if extend_to_line_enabled_map is None:
        extend_to_line_enabled_map = {}
    if merge_enabled_map is None:
        merge_enabled_map = {}
    LOGGER.debug(
        'line_token_tags: %s (extend_to_line_enabled_map: %s, merge_enabled_map: %s)',
        line_token_tags, extend_to_line_enabled_map, merge_enabled_map)
    grouped_token_tags: List[List[Optional[str]]] = [
        list(group)
        for _, group in groupby(line_token_tags, key=strip_tag_prefix)
    ]
    grouped_token_tags = [
        cast(List[Optional[str]],
             (get_merged_begin_inside_tags_of_same_tag_value(group)
              if merge_enabled_map.get(strip_tag_prefix(group[0]),
                                       default_merge_enabled) else group))
        for group in grouped_token_tags
    ]
    LOGGER.debug('grouped_token_tags: %s', grouped_token_tags)
    result: List[Optional[str]] = []
    for index, group in enumerate(grouped_token_tags):
        prev_group = grouped_token_tags[index - 1] if index > 0 else None
        next_group = grouped_token_tags[
            index + 1] if index + 1 < len(grouped_token_tags) else None
        _, last_prev_tag_value = split_tag_prefix(get_safe(prev_group, -1))
        first_next_prefix, first_next_tag_value = split_tag_prefix(
            get_safe(next_group, 0))
        LOGGER.debug('group: %s', group)
        if group[0]:
            result.extend(group)
        elif prev_group and next_group:
            if (last_prev_tag_value == first_next_tag_value
                    and get_dict_safe(merge_enabled_map, last_prev_tag_value,
                                      default_merge_enabled)):
                result.extend([to_inside_tag(prev_group[-1])] * len(group))
                if first_next_prefix == B_TAG_PREFIX:
                    next_group[0] = to_inside_tag(next_group[0])
            else:
                result.extend(group)
        elif (prev_group and not get_dict_safe(
                extend_to_line_enabled_map, last_prev_tag_value,
                default_extend_to_line_enabled)):
            result.extend(group)
        elif (next_group and not get_dict_safe(
                extend_to_line_enabled_map, first_next_tag_value,
                default_extend_to_line_enabled)):
            result.extend(group)
        elif prev_group and len(prev_group) > len(group):
            result.extend([to_inside_tag(prev_group[-1])] * len(group))
        elif next_group and len(next_group) > len(group):
            result.extend(to_begin_inside_tags(next_group[0], len(group)))
            if first_next_prefix == B_TAG_PREFIX:
                next_group[0] = to_inside_tag(next_group[0])
        else:
            result.extend(group)
    LOGGER.debug('result: %s', result)
    return result
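
# Usage sketch (not part of the module), assuming the module defaults
# enable both merging and extend-to-line behaviour:
assert get_extended_line_token_tags(
    ['B-author', None, 'B-author', 'I-author']
) == ['B-author', 'I-author', 'I-author', 'I-author']
# A shorter untagged group at the end of the line is absorbed by the
# longer preceding tagged group:
assert get_extended_line_token_tags(
    ['B-title', 'I-title', None]
) == ['B-title', 'I-title', 'I-title']
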
def to_inside_tag(tag: Optional[str]) -> Optional[str]:
    prefix, tag_value = split_tag_prefix(tag)
    return (add_tag_prefix(tag_value, prefix=I_TAG_PREFIX)
            if prefix == B_TAG_PREFIX else tag)
def to_begin_tag(tag: str) -> str:
    prefix, tag_value = split_tag_prefix(tag)
    return (add_tag_prefix(tag_value, prefix=B_TAG_PREFIX)
            if prefix == I_TAG_PREFIX else tag)
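
# Usage sketch of the prefix conversions (B-/I- convention assumed):
assert to_inside_tag('B-author') == 'I-author'
assert to_inside_tag('I-author') == 'I-author'
assert to_begin_tag('I-author') == 'B-author'
assert to_inside_tag(None) is None
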
Example #11
def _lines_to_tei(parent: etree.Element,
                  lines: List[TeiLine],
                  tag_to_tei_path_mapping: Optional[Dict[str, str]] = None):
    if tag_to_tei_path_mapping is None:
        tag_to_tei_path_mapping = {}
    writer = XmlTreeWriter(parent)
    pending_space_tokens = []
    pending_reset_tag_values = set()
    for line_index, line in enumerate(lines):
        if line_index:
            writer.append(E(TeiTagNames.LB))
        for token in line.tokens:
            main_full_tag = token.attrib.get(TAG_ATTRIB_NAME)
            if not main_full_tag:
                main_full_tag = token.attrib.get(PRESERVED_TAG_ATTRIB_NAME)
            sub_full_tag = token.attrib.get(SUB_TAG_ATTRIB_NAME)
            if not sub_full_tag:
                sub_full_tag = token.attrib.get(PRESERVED_SUB_TAG_ATTRIB_NAME)
            main_prefix, main_tag = split_tag_prefix(main_full_tag)
            sub_prefix, sub_tag = split_tag_prefix(sub_full_tag)
            if not token.stripped_text:
                if main_prefix == B_TAG_PREFIX:
                    LOGGER.debug('adding to pending reset tags, for %s', token)
                    pending_reset_tag_values.add(main_tag)
                pending_space_tokens.append(token)
                continue
            main_required_path = _get_tag_required_path(
                main_tag, tag_to_tei_path_mapping)
            sub_required_path = (_get_tag_required_path(
                sub_tag, tag_to_tei_path_mapping) if sub_full_tag else [])
            if sub_full_tag and not _path_starts_with(main_required_path,
                                                      sub_required_path):
                LOGGER.debug('ignoring sub tag outside main path: %s (%s)',
                             sub_tag, sub_required_path)
                sub_tag = None
                sub_full_tag = None
                sub_required_path = []
            LOGGER.debug(
                'output token: %s (main_required_path: %s, sub_required_path: %s)',
                token, main_required_path, sub_required_path)

            if main_prefix == B_TAG_PREFIX:
                main_parent_path = main_required_path[:-1]
                LOGGER.debug(
                    'found begin prefix, resetting path to parent %s, for %s',
                    main_parent_path, main_full_tag)
                writer.require_path(main_parent_path, token=token)
            elif main_tag in pending_reset_tag_values:
                LOGGER.debug(
                    'found begin prefix via preceding space, resetting path: %s',
                    main_full_tag)
                writer.require_path([], token=token)
            elif sub_prefix == B_TAG_PREFIX:
                sub_parent_path = sub_required_path[:-1]
                LOGGER.debug(
                    'found begin sub prefix, resetting sub path to parent %s, for %s',
                    sub_parent_path, sub_full_tag)
                writer.require_path_or_below(sub_parent_path, token=token)

            pending_reset_tag_values.clear()

            required_path: List[str] = (sub_required_path if sub_full_tag else
                                        main_required_path)

            if pending_space_tokens:
                for pending_space_token in pending_space_tokens:
                    writer.require_path_or_below(required_path,
                                                 token=pending_space_token)
                    writer.append_text(pending_space_token.text)
                pending_space_tokens = []

            writer.require_path(required_path, token=token)
            writer.append_text(token.text)

    for pending_space_token in pending_space_tokens:
        writer.require_path_or_below([], token=pending_space_token)
        writer.append_text(pending_space_token.text)

    return parent