def parse_subreddit(m):
    """
    Builds the replacement text and the link entity for a subreddit match.

    :param m: a regex match object. Group 1 is kept in front of the
              reference, the start of group 2 is used as the entity
              offset, and group 3 is appended after a leading slash.
    :return: a tuple consisting of (replacement text, text URL entity).
    """
    text = '/' + m.group(3)
    entity = MessageEntityTextUrl(
        offset=m.start(2),
        length=len(text),
        # Include the scheme so the entity carries an absolute URL
        # instead of a bare host name.
        url=f'https://reddit.com{text}'
    )
    return m.group(1) + text, entity
def parse_url_match(m):
    """
    Turns an inline URL match into its visible text and link entity.

    :param m: a regex match object whose group 1 is the text to keep
              and whose group 2 is the URL the text points to.
    :return: a tuple consisting of (visible text, text URL entity).
    """
    visible = m.group(1)
    # The URL goes through del_surrogate before being stored on the entity.
    target = del_surrogate(m.group(2))
    link = MessageEntityTextUrl(
        offset=m.start(),
        length=len(visible),
        url=target
    )
    return visible, link
def parse_subreddit(m):
    """
    Builds the replacement text and the link entity for a subreddit match.

    :param m: a regex match object. Group 1 is kept in front of the
              reference, the start of group 2 is used as the entity
              offset, and group 3 is appended after a leading slash.
    :return: a tuple consisting of (replacement text, text URL entity).
    """
    path = "/" + m.group(3)
    link = MessageEntityTextUrl(
        offset=m.start(2),
        length=len(path),
        url=f"https://reddit.com{path}",
    )
    # The text in front of the reference is kept untouched.
    replacement = m.group(1) + path
    return replacement, link
def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped
    representation plus a list of the message entities that were found.

    :param message: the message with markdown-like syntax to be parsed.
                    Falsy messages are returned as-is with no entities.
    :param delimiters: the delimiters to be used, {delimiter: type}.
                       ``None`` selects ``DEFAULT_DELIMITERS``; an empty
                       mapping disables parsing altogether.
    :param url_re: the URL regex to be used, compiled or as a string.
                   ``None`` selects ``DEFAULT_URL_RE``. Must have two
                   groups: the text to keep and the URL.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not message:
        return message, []

    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif isinstance(url_re, str):
        url_re = re.compile(url_re)

    if not delimiters:
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    # Build a regex to efficiently test all delimiters at once.
    # Note that the largest delimiter should go first, we don't
    # want ``` to be interpreted as a single back-tick in a code block.
    delim_re = re.compile('|'.join(
        '({})'.format(re.escape(k))
        for k in sorted(delimiters, key=len, reverse=True)))

    # Cannot use a for loop because we need to skip some indices
    i = 0
    result = []

    # Offsets are computed on the string returned by add_surrogate
    # (presumably with surrogate pairs, so that indices line up with
    # UTF-16 code units -- confirm against that helper).
    message = add_surrogate(message)
    while i < len(message):
        m = delim_re.match(message, pos=i)

        # Did we find some delimiter here at `i`?
        if m:
            delim = next(filter(None, m.groups()))

            # +1 to avoid matching right after (e.g. "****")
            end = message.find(delim, i + len(delim) + 1)

            # Did we find the earliest closing tag?
            if end != -1:
                # Remove the delimiter from the string
                message = ''.join((
                    message[:i],
                    message[i + len(delim):end],
                    message[end + len(delim):]
                ))

                # Check other affected entities
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > i:
                        # If the old start is also before ours,
                        # it is fully enclosed
                        if ent.offset <= i:
                            ent.length -= len(delim) * 2
                        else:
                            ent.length -= len(delim)

                # Append the found entity
                entity_type = delimiters[delim]
                length = end - i - len(delim)
                if entity_type == MessageEntityPre:
                    result.append(entity_type(i, length, ''))  # has 'lang'
                else:
                    result.append(entity_type(i, length))

                # No nested entities inside code blocks
                if entity_type in (MessageEntityCode, MessageEntityPre):
                    i = end - len(delim)

                # Stay on the same index otherwise, since another
                # delimiter or URL may begin right here.
                continue

        elif url_re:
            m = url_re.match(message, pos=i)
            if m:
                # Replace the whole match with only the inline URL text.
                message = ''.join((
                    message[:m.start()],
                    m.group(1),
                    message[m.end():]
                ))

                # Everything in the match except group 1 was removed:
                # the part before the text and the part after it.
                text_start, text_end = m.span(1)
                removed_spans = (
                    (m.start(), text_start),
                    (text_end, m.end()),
                )
                for ent in result:
                    ent_end = ent.offset + ent.length
                    # If the end is after our start, it is affected
                    if ent_end > m.start():
                        # Shrink by the number of removed characters that
                        # were inside the entity. `m.group()` is the whole
                        # match, so subtracting its length from the match
                        # width (as done before) always yielded zero.
                        for lo, hi in removed_spans:
                            ent.length -= max(
                                0, min(ent_end, hi) - max(ent.offset, lo))

                result.append(MessageEntityTextUrl(
                    offset=m.start(),
                    length=len(m.group(1)),
                    url=del_surrogate(m.group(2))
                ))
                i += len(m.group(1))
                continue

        i += 1

    message = strip_text(message, result)
    return del_surrogate(message), result
def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped
    representation plus a list of the message entities that were found.

    :param message: the message with markdown-like syntax to be parsed.
                    Falsy messages are returned as-is with no entities.
    :param delimiters: the delimiters to be used, {delimiter: type}.
                       ``None`` selects ``DEFAULT_DELIMITERS``; an empty
                       mapping disables parsing altogether.
    :param url_re: the URL regex to be used, compiled or as a string.
                   ``None`` selects ``DEFAULT_URL_RE``. Must have two
                   groups: the text to keep and the URL.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    # Nothing to parse; also avoids handing None to the helpers below.
    if not message:
        return message, []

    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif isinstance(url_re, str):
        url_re = re.compile(url_re)

    if not delimiters:
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    # Cannot use a for loop because we need to skip some indices
    i = 0
    result = []
    current = None  # Entity that was opened but not yet closed
    end_delimiter = None  # Delimiter that will close `current`

    # Offsets are computed on the string returned by _add_surrogate
    # (presumably with surrogate pairs, so that indices line up with
    # UTF-16 code units -- confirm against that helper).
    message = _add_surrogate(message)
    while i < len(message):
        if url_re and current is None:
            # If we're not inside a previous match since Telegram doesn't
            # allow nested message entities, try matching the URL from
            # the i'th position.
            url_match = url_re.match(message, pos=i)
            if url_match:
                # Replace the whole match with only the inline URL text.
                message = ''.join((
                    message[:url_match.start()],
                    url_match.group(1),
                    message[url_match.end():]
                ))

                result.append(MessageEntityTextUrl(
                    offset=url_match.start(),
                    length=len(url_match.group(1)),
                    url=_del_surrogate(url_match.group(2))
                ))
                i += len(url_match.group(1))
                # Next loop iteration, don't check delimiters, since
                # a new inline URL might be right after this one.
                continue

        if end_delimiter is None:
            # We're not expecting any delimiter, so check them all
            for d, entity_type in delimiters.items():
                # Slice the string at the current i'th position to see if
                # it matches the current delimiter d, otherwise skip it.
                if message[i:i + len(d)] != d:
                    continue

                if message[i + len(d):i + 2 * len(d)] == d:
                    # The same delimiter can't be right afterwards, if
                    # this were the case we would match empty strings
                    # like `` which we don't want to.
                    continue

                # Get rid of the delimiter by slicing it away
                message = message[:i] + message[i + len(d):]
                if entity_type == MessageEntityPre:
                    # Special case, also has 'lang'
                    current = entity_type(i, None, '')
                else:
                    current = entity_type(i, None)

                end_delimiter = d  # We expect the same delimiter.
                break

        elif message[i:i + len(end_delimiter)] == end_delimiter:
            message = message[:i] + message[i + len(end_delimiter):]
            current.length = i - current.offset
            result.append(current)
            current, end_delimiter = None, None
            # Don't increment i here as we matched a delimiter,
            # and there may be a new one right after. This is
            # different than when encountering the first delimiter,
            # as we already know there won't be the same right after.
            continue

        # Next iteration
        i += 1

    # We may have found a delimiter but not its ending pair.
    # If this is the case, we want to insert the delimiter back.
    if current is not None:
        message = (
            message[:current.offset]
            + end_delimiter
            + message[current.offset:]
        )

    return _del_surrogate(message), result
def parse_message_entities(msg):
    """
    Parses a message and returns the parsed message and the entities
    (bold, italic...).

    Note that although markdown-like syntax is used, this does not
    reflect the complete specification!

    :param msg: the message text, where ``[text](url)`` marks a text link
                and ``*``, ``_`` and a single back-tick wrap bold, italic
                and code spans respectively.
    :return: a tuple consisting of (clean message, [message entities]),
             with the entities sorted by offset.
    """
    # Store the entities here
    entities = []

    # Convert the message to a mutable list
    msg = list(msg)

    # First, let's handle all the text links in the message, so afterwards
    # it's clean for us to get our hands dirty with the other indicators
    # (bold, italic and fixed)
    url_indices = [None] * 4  # start/end text index, start/end url index
    valid_url_indices = []  # all the valid url_indices found
    for i, c in enumerate(msg):
        if c == '[':
            # A new opening bracket starts over, so that no index from a
            # previous unfinished link can leak into this one.
            url_indices = [i, None, None, None]

        # From now on, also ensure that the last item was set. The check
        # must be against None, since index 0 is a valid position.
        elif c == ']' and url_indices[0] is not None:
            url_indices[1] = i

        elif c == '(' and url_indices[1] is not None:
            # If the previous index (']') is not exactly before the current
            # index ('('), then it's not a valid text link, so clear the
            # whole previous state
            if url_indices[1] != i - 1:
                url_indices = [None] * 4
            else:
                url_indices[2] = i

        elif c == ')' and url_indices[2] is not None:
            # We have succeeded to find a markdown-like text link!
            url_indices[3] = i
            valid_url_indices.append(url_indices[:])  # Append a copy
            url_indices = [None] * 4

    # Iterate in reverse order to clean the text from the urls
    # (not to affect previous indices) and append MessageEntityTextUrl's
    for i in range(len(valid_url_indices) - 1, -1, -1):
        vui = valid_url_indices[i]

        # Add 1 when slicing the message not to include the [] nor ()
        # There is no need to subtract 1 on the later part because that
        # index is already excluded
        link_text = ''.join(msg[vui[0] + 1:vui[1]])
        link_url = ''.join(msg[vui[2] + 1:vui[3]])

        # After we have retrieved both the link text and url, replace them
        # in the message. Now we do have to add 1 to include the [] and ()
        # when deleting and replacing!
        del msg[vui[2]:vui[3] + 1]
        msg[vui[0]:vui[1] + 1] = link_text

        # Finally, update the current valid index url to reflect that all
        # the previous VUI's will be removed. This is because, after the
        # previous VUI's get done, their part of the message is removed
        # too, hence we need to update the current VUI subtracting that
        # removed part length
        for prev_vui in valid_url_indices[:i]:
            prev_vui_length = prev_vui[3] - prev_vui[2] - 1
            displacement = prev_vui_length + len('[]()')
            vui[0] -= displacement
            vui[1] -= displacement
            # No need to subtract the displacement from the URL part
            # (indices 2 and 3)

        # When calculating the length, subtract 1 again not to include
        # the previously called ']'
        entities.append(MessageEntityTextUrl(
            offset=vui[0], length=vui[1] - vui[0] - 1, url=link_url))

    # After the message is clean from links, handle all the indicator flags
    indicator_flags = {'*': None, '_': None, '`': None}

    # Iterate over the list to find the indicators of entities
    for i, c in enumerate(msg):
        # Only perform further check if the current character is an indicator
        if c in indicator_flags:
            # If it is the first time we find this indicator, update its index
            if indicator_flags[c] is None:
                indicator_flags[c] = i

            # Otherwise, it means that we found it before.
            # Hence, the message entity *is* complete
            else:
                # Then we have found a new whole valid entity
                offset = indicator_flags[c]
                # Subtract 1 not to include the indicator itself
                length = i - offset - 1

                # Add the corresponding entity
                if c == '*':
                    entities.append(
                        MessageEntityBold(offset=offset, length=length))
                elif c == '_':
                    entities.append(
                        MessageEntityItalic(offset=offset, length=length))
                elif c == '`':
                    entities.append(
                        MessageEntityCode(offset=offset, length=length))

                # Clear the flag to start over with this indicator
                indicator_flags[c] = None

    # Sort the entities by their offset first
    entities = sorted(entities, key=lambda e: e.offset)

    # Now that all the entities have been found and sorted, remove
    # their indicators from the message and update the offsets
    for entity in entities:
        if not isinstance(entity, MessageEntityTextUrl):
            # Clean the message from the current entity's indicators
            del msg[entity.offset + entity.length + 1]
            del msg[entity.offset]

            # Iterate over all the entities but the current
            for subentity in [e for e in entities if e is not entity]:
                # First case, one in one out: so*me_th_in*g.
                # In this case, the current entity length is decreased by
                # two, and all the subentities offset decreases 1
                if (subentity.offset > entity.offset and
                        subentity.offset + subentity.length <
                        entity.offset + entity.length):
                    entity.length -= 2
                    subentity.offset -= 1

                # Second case, both inside: so*me_th*in_g.
                # In this case, the current entity length is decreased by
                # one, and all the subentities offset and length decrease 1
                elif (entity.offset < subentity.offset <
                        entity.offset + entity.length and
                        subentity.offset + subentity.length >
                        entity.offset + entity.length):
                    entity.length -= 1
                    subentity.offset -= 1
                    subentity.length -= 1

                # Third case, both outside: so*me*th_in_g.
                # In this case, the current entity is left untouched,
                # and all the subentities offset decreases 2
                elif subentity.offset > entity.offset + entity.length:
                    subentity.offset -= 2

    # Finally, we can join our poor mutilated message back and return
    msg = ''.join(msg)
    return msg, entities