Example #1
0
def parse_subreddit(m):
    """Build the replacement text and link entity for a subreddit match.

    :param m: a regex match object exposing at least three groups. Group 1
        is emitted unchanged in front of the link text, the start of group 2
        becomes the entity offset and group 3 is appended after a ``/`` to
        form the visible link text.
        NOTE(review): presumably produced by a pattern matching ``/r/name``
        style mentions -- confirm against the caller.
    :return: a tuple ``(replacement text, MessageEntityTextUrl)``.
    """
    path = '/' + m.group(3)
    link = MessageEntityTextUrl(
        offset=m.start(2),
        length=len(path),
        url=f'reddit.com{path}',
    )
    replacement = m.group(1) + path
    return replacement, link
Example #2
0
def parse_url_match(m):
    """Turn an inline-URL regex match into its visible text and link entity.

    :param m: a regex match object with two groups: group 1 is the text
        that stays visible, group 2 is the URL (passed through
        ``del_surrogate`` before being stored on the entity).
    :return: a tuple ``(visible text, MessageEntityTextUrl)`` where the
        entity starts at the beginning of the match and spans the visible
        text.
    """
    visible = m.group(1)
    link = MessageEntityTextUrl(
        offset=m.start(),
        length=len(visible),
        url=del_surrogate(m.group(2)),
    )
    return visible, link
Example #3
0
def parse_subreddit(m):
    """Build the replacement text and HTTPS link entity for a subreddit match.

    :param m: a regex match object exposing at least three groups. Group 1
        is kept verbatim before the link text, the start of group 2 is used
        as the entity offset and group 3 follows a ``/`` in the link text.
        NOTE(review): presumably comes from a ``/r/name`` style pattern --
        confirm against the caller.
    :return: a tuple ``(replacement text, MessageEntityTextUrl)``.
    """
    prefix = m.group
    path = "/" + prefix(3)
    link_offset = m.start(2)
    link = MessageEntityTextUrl(
        offset=link_offset,
        length=len(path),
        url=f"https://reddit.com{path}",
    )
    return prefix(1) + path, link
Example #4
0
def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    Delimiters and inline URL markup are removed from the text as they are
    found, and the lengths of already recorded entities that overlap the
    removed markup are shrunk accordingly.

    :param message: the message with markdown-like syntax to be parsed.
        A falsy message (``None`` or ``''``) is returned untouched.
    :param delimiters: the delimiters to be used, {delimiter: type}.
        ``None`` selects ``DEFAULT_DELIMITERS``; any other falsy value
        (e.g. ``{}``) disables parsing and returns the message untouched.
    :param url_re: the URL regex to be used, either compiled or as a
        pattern string; ``None`` selects ``DEFAULT_URL_RE``. Must have two
        groups: group 1 is the text kept in the message, group 2 the URL.
        A falsy value that is neither ``None`` nor a string (e.g.
        ``False``) disables inline URL parsing.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not message:
        return message, []

    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif isinstance(url_re, str):
        url_re = re.compile(url_re)

    if not delimiters:
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    # Build a regex to efficiently test all delimiters at once.
    # Note that the largest delimiter should go first, we don't
    # want ``` to be interpreted as a single back-tick in a code block.
    delim_re = re.compile('|'.join(
        '({})'.format(re.escape(k))
        for k in sorted(delimiters, key=len, reverse=True)))

    # Cannot use a for loop because we need to skip some indices
    i = 0
    result = []

    # NOTE(review): add_surrogate presumably rewrites the text so that
    # string indices line up with the offsets the entities need; it is
    # undone by del_surrogate before returning -- confirm in the helpers.
    message = add_surrogate(message)
    while i < len(message):
        m = delim_re.match(message, pos=i)

        # Did we find some delimiter here at `i`?
        if m:
            # Exactly one alternative matched; pick the non-empty group.
            delim = next(filter(None, m.groups()))

            # +1 to avoid matching right after (e.g. "****")
            end = message.find(delim, i + len(delim) + 1)

            # Did we find the earliest closing tag?
            if end != -1:

                # Remove the delimiter from the string
                message = ''.join((message[:i], message[i + len(delim):end],
                                   message[end + len(delim):]))

                # Check other affected entities
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > i:
                        # If the old start is also before ours, it is fully enclosed
                        ent.length -= len(
                            delim) * 2 if ent.offset <= i else len(delim)
                # Append the found entity
                ent = delimiters[delim]
                if ent == MessageEntityPre:
                    result.append(ent(i, end - i - len(delim),
                                      ''))  # has 'lang'
                else:
                    result.append(ent(i, end - i - len(delim)))

                # No nested entities inside code blocks
                if ent in (MessageEntityCode, MessageEntityPre):
                    i = end - len(delim)

                # `i` is deliberately not advanced otherwise, so the text
                # inside the new entity is scanned for nested markup.
                continue

        elif url_re:
            m = url_re.match(message, pos=i)
            if m:
                text = m.group(1)
                start, stop = m.span()
                text_start = m.start(1)

                # Replace the whole match with only the inline URL text.
                message = ''.join((message[:start], text, message[stop:]))

                # Characters that vanished from the message. This must be
                # measured against the retained text (group 1): measuring
                # against the whole match always yields zero and leaves
                # enclosing entities longer than the text they cover.
                removed = (stop - start) - len(text)
                for ent in result:
                    ent_end = ent.offset + ent.length
                    if ent_end <= start:
                        # Ends before the URL markup, so it is unaffected.
                        continue

                    if ent_end >= stop:
                        # The entity encloses the whole match.
                        ent_end -= removed
                    else:
                        # The entity ends inside the match: map its end
                        # onto the retained text so that the length can
                        # never become negative.
                        kept = min(max(ent_end - text_start, 0), len(text))
                        ent_end = start + kept

                    ent.length = ent_end - ent.offset

                result.append(
                    MessageEntityTextUrl(offset=start,
                                         length=len(text),
                                         url=del_surrogate(m.group(2))))
                i += len(text)
                continue

        i += 1

    message = strip_text(message, result)
    return del_surrogate(message), result
Example #5
0
def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    Entities are never nested: while a delimiter is open, neither inline
    URLs nor other delimiters are looked for until its closing pair is seen.
    A delimiter that is opened but never closed is put back into the text
    and produces no entity.

    :param message: the message with markdown-like syntax to be parsed.
        A falsy message (``None`` or ``''``) is returned untouched.
    :param delimiters: the delimiters to be used, {delimiter: type}.
        ``None`` selects ``DEFAULT_DELIMITERS``; any other falsy value
        (e.g. ``{}``) disables parsing and returns the message untouched.
    :param url_re: the URL regex to be used, either compiled or as a
        pattern string; ``None`` selects ``DEFAULT_URL_RE``. Must have two
        groups: group 1 is the text kept in the message, group 2 the URL.
        A falsy value that is neither ``None`` nor a string (e.g.
        ``False``) disables inline URL parsing.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    # Nothing to parse; also keeps None away from the surrogate helper.
    if not message:
        return message, []

    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif isinstance(url_re, str):
        url_re = re.compile(url_re)

    if not delimiters:
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    # Cannot use a for loop because we need to skip some indices
    i = 0
    result = []
    current = None  # Entity whose closing delimiter is still pending
    end_delimiter = None  # Delimiter string that will close `current`

    # NOTE(review): _add_surrogate presumably rewrites the text so that
    # string indices line up with the offsets the entities need; it is
    # undone by _del_surrogate before returning -- confirm in the helpers.
    message = _add_surrogate(message)
    while i < len(message):
        if url_re and current is None:
            # If we're not inside a previous match since Telegram doesn't allow
            # nested message entities, try matching the URL from the i'th pos.
            url_match = url_re.match(message, pos=i)
            if url_match:
                # Replace the whole match with only the inline URL text.
                message = ''.join(
                    (message[:url_match.start()], url_match.group(1),
                     message[url_match.end():]))

                result.append(
                    MessageEntityTextUrl(offset=url_match.start(),
                                         length=len(url_match.group(1)),
                                         url=_del_surrogate(
                                             url_match.group(2))))
                i += len(url_match.group(1))
                # Next loop iteration, don't check delimiters, since
                # a new inline URL might be right after this one.
                continue

        if end_delimiter is None:
            # We're not expecting any delimiter, so check them all
            for d, m in delimiters.items():
                # Slice the string at the current i'th position to see if
                # it matches the current delimiter d, otherwise skip it.
                if message[i:i + len(d)] != d:
                    continue

                if message[i + len(d):i + 2 * len(d)] == d:
                    # The same delimiter can't be right afterwards, if
                    # this were the case we would match empty strings
                    # like `` which we don't want to.
                    continue

                # Get rid of the delimiter by slicing it away
                message = message[:i] + message[i + len(d):]
                if m == MessageEntityPre:
                    # Special case, also has 'lang'
                    current = m(i, None, '')
                else:
                    # The length is filled in once the closing pair is found
                    current = m(i, None)

                end_delimiter = d  # We expect the same delimiter.
                break

        elif message[i:i + len(end_delimiter)] == end_delimiter:
            message = message[:i] + message[i + len(end_delimiter):]
            current.length = i - current.offset
            result.append(current)
            current, end_delimiter = None, None
            # Don't increment i here as we matched a delimiter,
            # and there may be a new one right after. This is
            # different than when encountering the first delimiter,
            # as we already know there won't be the same right after.
            continue

        # Next iteration
        i += 1

    # We may have found a delimiter but not its ending pair.
    # If this is the case, we want to insert the delimiter back where it
    # was; the pending entity is dropped since it was never appended.
    if current is not None:
        message = (message[:current.offset] + end_delimiter +
                   message[current.offset:])

    return _del_surrogate(message), result
Example #6
0
def parse_message_entities(msg):
    """Parses a message and returns the parsed message and the entities.

    Two passes are made over the text. First every ``[text](url)`` link is
    replaced by its text and recorded as a ``MessageEntityTextUrl``. Then
    pairs of ``*``, ``_`` and the back-tick are recorded as bold, italic
    and code entities respectively, and removed from the text.

    Note that although markdown-like syntax is used, this does not reflect
    the complete specification!

    :param msg: the message text using the markdown-like syntax.
    :return: a tuple ``(clean message, entities)`` with the entities sorted
        by their offset.
    """

    # Store the entities here
    entities = []

    # Convert the message to a mutable list
    msg = list(msg)

    # First, let's handle all the text links in the message, so afterwards it's clean
    # for us to get our hands dirty with the other indicators (bold, italic and fixed)
    url_indices = [None] * 4  # start/end text index, start/end url index
    valid_url_indices = []  # all the valid url_indices found
    for i, c in enumerate(msg):
        if c == '[':
            url_indices[0] = i

        # From now on, also ensure that the last item was set. The checks
        # compare against None because index 0 is a perfectly valid
        # position: a link may start at the very first character.
        elif c == ']' and url_indices[0] is not None:
            url_indices[1] = i

        elif c == '(' and url_indices[1] is not None:
            # If the previous index (']') is not exactly before the current index ('('),
            # then it's not a valid text link, so clear the previous state
            if url_indices[1] != i - 1:
                url_indices[:2] = [None] * 2
            else:
                url_indices[2] = i

        elif c == ')' and url_indices[2] is not None:
            # We have succeeded to find a markdown-like text link!
            url_indices[3] = i
            valid_url_indices.append(url_indices[:])  # Append a copy
            url_indices = [None] * 4

    # Iterate in reverse order to clean the text from the urls
    # (not to affect previous indices) and append MessageEntityTextUrl's
    for i in reversed(range(len(valid_url_indices))):
        vui = valid_url_indices[i]

        # Add 1 when slicing the message not to include the [] nor ()
        # There is no need to subtract 1 on the later part because that index is already excluded
        link_text = ''.join(msg[vui[0] + 1:vui[1]])
        link_url = ''.join(msg[vui[2] + 1:vui[3]])

        # After we have retrieved both the link text and url, replace them in the message
        # Now we do have to add 1 to include the [] and () when deleting and replacing!
        del msg[vui[2]:vui[3] + 1]
        msg[vui[0]:vui[1] + 1] = link_text

        # Finally, update the current valid index url to reflect that all the previous VUI's will be removed
        # This is because, after the previous VUI's get done, their part of the message is removed too,
        # hence we need to update the current VUI subtracting that removed part length
        for prev_vui in valid_url_indices[:i]:
            prev_vui_length = prev_vui[3] - prev_vui[2] - 1
            displacement = prev_vui_length + len('[]()')
            vui[0] -= displacement
            vui[1] -= displacement
            # No need to subtract the displacement from the URL part (indices 2 and 3)

        # When calculating the length, subtract 1 again not to include the previously called ']'
        entities.append(
            MessageEntityTextUrl(offset=vui[0],
                                 length=vui[1] - vui[0] - 1,
                                 url=link_url))

    # After the message is clean from links, handle all the indicator flags.
    # Each value is the index of the pending opening indicator, or None.
    indicator_flags = {'*': None, '_': None, '`': None}

    # Iterate over the list to find the indicators of entities
    for i, c in enumerate(msg):
        # Only perform further check if the current character is an indicator
        if c in indicator_flags:
            # If it is the first time we find this indicator, update its index
            if indicator_flags[c] is None:
                indicator_flags[c] = i

            # Otherwise, it means that we found it before. Hence, the message entity *is* complete
            else:
                # Then we have found a new whole valid entity. The offset
                # still points at the opening indicator at this stage.
                offset = indicator_flags[c]
                length = i - offset - 1  # Subtract 1 not to include the indicator itself

                # Add the corresponding entity
                if c == '*':
                    entities.append(
                        MessageEntityBold(offset=offset, length=length))

                elif c == '_':
                    entities.append(
                        MessageEntityItalic(offset=offset, length=length))

                elif c == '`':
                    entities.append(
                        MessageEntityCode(offset=offset, length=length))

                # Clear the flag to start over with this indicator
                indicator_flags[c] = None

    # Sort the entities by their offset first
    entities.sort(key=lambda e: e.offset)

    # Now that all the entities have been found and sorted, remove
    # their indicators from the message and update the offsets
    for entity in entities:
        # Links had their markup stripped already in the first pass
        if not isinstance(entity, MessageEntityTextUrl):
            # Clean the message from the current entity's indicators
            # (closing one first so the opening index stays valid)
            del msg[entity.offset + entity.length + 1]
            del msg[entity.offset]

            # Iterate over all the entities but the current
            for subentity in [e for e in entities if e is not entity]:
                # First case, one in one out: so*me_th_in*g.
                # In this case, the current entity length is decreased by two,
                # and all the subentities offset decreases 1
                if (subentity.offset > entity.offset
                        and subentity.offset + subentity.length <
                        entity.offset + entity.length):
                    entity.length -= 2
                    subentity.offset -= 1

                # Second case, both inside: so*me_th*in_g.
                # In this case, the current entity length is decreased by one,
                # and all the subentities offset and length decrease 1
                elif (entity.offset < subentity.offset <
                      entity.offset + entity.length
                      and subentity.offset + subentity.length >
                      entity.offset + entity.length):
                    entity.length -= 1
                    subentity.offset -= 1
                    subentity.length -= 1

                # Third case, both outside: so*me*th_in_g.
                # In this case, the current entity is left untouched,
                # and all the subentities offset decreases 2
                elif subentity.offset > entity.offset + entity.length:
                    subentity.offset -= 2

    # Finally, we can join our poor mutilated message back and return
    msg = ''.join(msg)
    return msg, entities