# Ejemplo n.º 1
def test_strip_text():
    """Leading and trailing whitespace is stripped when no entities are given."""
    assert helpers.strip_text(" text ", []) == "text"
# Ejemplo n.º 2
def moothrix_parse(html):
    """Parse *html* and return ``(stripped_text, entities)``.

    Empty/None input is returned unchanged together with an empty entity
    list. Otherwise the text is fed (surrogate-encoded) through a
    ``MethrixParser`` and the resulting text is stripped via
    ``helpers.strip_text`` before the surrogates are removed again.
    """
    if not html:
        return html, []

    html_parser = MethrixParser()
    html_parser.feed(_add_surrogate(html))
    stripped = helpers.strip_text(html_parser.text, html_parser.entities)
    return _del_surrogate(stripped), html_parser.entities
# Ejemplo n.º 3
def text_and_format_entities_split(plain_text: str,
                                   format_entities: Sequence[TypeMessageEntity],
                                   length_limit_head: int = 4096,
                                   head_count: int = -1,
                                   length_limit_tail: int = 4096) \
        -> list[tuple[str, list[TypeMessageEntity]]]:
    """
    Split ``plain_text`` and its formatting entities into message-sized chunks.

    The first ``head_count`` chunks are limited to ``length_limit_head``
    characters, later chunks to ``length_limit_tail``; with the default
    ``head_count == -1`` every chunk uses ``length_limit_head``.  Split
    points prefer natural separators (newline, sentence/clause punctuation,
    whitespace) found in the second half of the allowed window, and each
    chunk's entity list is kept within Telegram's per-message entity limits.

    Args:
        plain_text: The full message text to split.
        format_entities: Formatting entities for ``plain_text`` (offsets in
            surrogate units, Telegram-style).
        length_limit_head: Max length of the first ``head_count`` chunks.
        head_count: How many chunks use ``length_limit_head``; ``-1`` means
            all of them.
        length_limit_tail: Max length of the remaining chunks.

    Returns:
        A list of ``(text, entities)`` tuples; each entity list is rebased so
        offsets are relative to its own chunk, and each text is passed
        through ``strip_text``.
    """
    format_entities = merge_contiguous_entities(copy_entities(format_entities))  # sort and merge

    chunks = []

    pending_text = plain_text
    pending_entities = format_entities[:]
    # Surrogate length of the text already emitted as chunks.  Offsets in
    # ``pending_entities`` stay absolute, so this is subtracted whenever a
    # chunk's entities are finalized.
    surrogate_len_sum = 0
    while pending_text:
        # Head limit while we are still within the first ``head_count``
        # chunks (or head_count is -1); tail limit afterwards.
        curr_length_limit = length_limit_head if head_count <= -1 or len(chunks) < head_count else length_limit_tail
        curr_length_limit = min(curr_length_limit, len(pending_text))
        # note: Telegram only allows up to 10000-Byte formatting entities per message
        # here the limit is set to 9500 Bytes to avoid possible problems
        if (len(pending_text) == curr_length_limit
                and not (len(pending_entities) > 100 or len(b''.join(x._bytes() for x in pending_entities)) >= 9500)):
            # Everything left fits in one final chunk and its entities are
            # within limits: rebase the offsets and finish.
            if surrogate_len_sum > 0:
                for entity in pending_entities:
                    entity.offset -= surrogate_len_sum
            chunks.append((pending_text, pending_entities))
            break
        # Otherwise look for a split point.  If the candidate chunk would
        # exceed the entity limits, shrink the window by 100 characters and
        # retry (the OverflowError below drives this loop).
        for curr_length_limit in range(curr_length_limit, 0, -100):
            try:
                # Separators in priority order; the trailing '' always
                # matches, so a split point inside the window is guaranteed.
                for sep in ('\n', '。', '. ', ';', '; ', ',', ', ', '?', '? ', '!', '! ', ':', ': ', '\t',
                            ' ', '\xa0', ''):
                    # Only search the second half of the window so chunks do
                    # not become pathologically short.
                    sep_pos = pending_text.rfind(sep, int(curr_length_limit * 0.5), curr_length_limit)
                    if sep_pos != -1:
                        curr_text = pending_text[:sep_pos + len(sep)]
                        surrogate_end_pos = surrogate_len_sum + surrogate_len(curr_text)
                        _curr_entities = filter_entities_by_range(surrogate_len_sum, surrogate_end_pos,
                                                                  pending_entities)
                        if len(_curr_entities) > 100 or len(b''.join(x._bytes() for x in _curr_entities)) >= 9500:
                            raise OverflowError('Too many entities')
                        # Cut the entity list at the chunk boundary, then
                        # rebase the current chunk's offsets to start at 0.
                        curr_entities, pending_entities = split_entities(surrogate_end_pos, pending_entities)
                        if surrogate_len_sum > 0:
                            for entity in curr_entities:
                                entity.offset -= surrogate_len_sum
                        surrogate_len_sum = surrogate_end_pos
                        chunks.append((curr_text, curr_entities))
                        pending_text = pending_text[sep_pos + len(sep):]
                        break
                break
            except OverflowError:
                pass

    # Strip surrounding whitespace from each chunk; strip_text also adjusts
    # the chunk's entities in place.
    stripped_chunks = []
    for text, entity in chunks:
        text = strip_text(text, entity)
        stripped_chunks.append((text, entity))

    return stripped_chunks
# Ejemplo n.º 4
def test_strip_text():
    """Leading and trailing whitespace is stripped when no entities are given."""
    assert helpers.strip_text(" text ", []) == "text"
# Ejemplo n.º 5
def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped representation
    plus a list of the MessageEntity's that were found.
    :param message: the message with markdown-like syntax to be parsed.
    :param delimiters: the delimiters to be used, {delimiter: type}.
    :param url_re: the URL bytes regex to be used. Must have two groups.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not message:
        return message, []

    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif isinstance(url_re, str):
        url_re = re.compile(url_re)

    if not delimiters:
        # An explicitly-passed empty delimiter dict disables parsing entirely;
        # only ``None`` falls back to the defaults.
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    # Build a regex to efficiently test all delimiters at once.
    # Note that the largest delimiter should go first, we don't
    # want ``` to be interpreted as a single back-tick in a code block.
    delim_re = re.compile('|'.join(
        '({})'.format(re.escape(k))
        for k in sorted(delimiters, key=len, reverse=True)))

    # Cannot use a for loop because we need to skip some indices
    i = 0
    result = []

    # Work on byte level with the utf-16le encoding to get the offsets right.
    # The offset will just be half the index we're at.
    message = add_surrogate(message)
    while i < len(message):
        m = delim_re.match(message, pos=i)

        # Did we find some delimiter here at `i`?
        if m:
            # Exactly one capture group matched; it is the delimiter found.
            delim = next(filter(None, m.groups()))

            # +1 to avoid matching right after (e.g. "****")
            end = message.find(delim, i + len(delim) + 1)

            # Did we find the earliest closing tag?
            if end != -1:

                # Remove the delimiter from the string
                message = ''.join((message[:i], message[i + len(delim):end],
                                   message[end + len(delim):]))

                # Check other affected entities
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > i:
                        # If the old start is also before ours, it is fully enclosed
                        if ent.offset <= i:
                            ent.length -= len(delim) * 2
                        else:
                            ent.length -= len(delim)

                # Append the found entity
                ent = delimiters[delim]
                if ent == MessageEntityPre:
                    # MessageEntityPre takes an extra language argument.
                    result.append(ent(i, end - i - len(delim),
                                      ''))  # has 'lang'
                else:
                    result.append(ent(i, end - i - len(delim)))

                # No nested entities inside code blocks
                if ent in (MessageEntityCode, MessageEntityPre):
                    # Jump past the code span so its contents are not parsed.
                    i = end - len(delim)

                continue

        elif url_re:
            # No delimiter at `i`: try to match an inline URL, e.g. [text](url).
            m = url_re.match(message, pos=i)
            if m:
                # Replace the whole match with only the inline URL text.
                message = ''.join(
                    (message[:m.start()], m.group(1), message[m.end():]))

                # How many characters the replacement removed overall.
                delim_size = m.end() - m.start() - len(m.group())
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > m.start():
                        ent.length -= delim_size

                result.append(
                    MessageEntityTextUrl(offset=m.start(),
                                         length=len(m.group(1)),
                                         url=del_surrogate(m.group(2))))
                # Skip over the kept URL text so it is not re-parsed.
                i += len(m.group(1))
                continue

        i += 1

    # Trim surrounding whitespace; strip_text adjusts the entities to match.
    message = strip_text(message, result)
    return del_surrogate(message), result