async def telegram_to_matrix(
    evt: Message,
    source: "AbstractUser",
    main_intent: Optional[IntentAPI] = None,
    prefix_text: Optional[str] = None,
    prefix_html: Optional[str] = None,
    override_text: Optional[str] = None,
    override_entities: Optional[List[TypeMessageEntity]] = None,
    no_reply_fallback: bool = False,
) -> TextMessageEventContent:
    """Build Matrix text message content from a Telegram message.

    :param evt: the Telegram message to convert.
    :param source: passed through to the forward/reply header helpers.
    :param main_intent: passed through to the reply header helper.
    :param prefix_text: text prepended to the plain-text body.
    :param prefix_html: HTML prepended to the formatted body; when there is
        no formatted body yet, one is created from the escaped plain body.
    :param override_text: used instead of ``evt.message`` when truthy.
    :param override_entities: used instead of ``evt.entities`` when truthy.
    :param no_reply_fallback: when true, no reply header is added even if
        the message is a reply.
    :return: the content, with surrogates removed from both bodies.
    """
    # The body is kept in surrogate-pair form while it is being assembled and
    # is converted back with del_surrogate() right before returning.
    content = TextMessageEventContent(
        msgtype=MessageType.TEXT,
        body=add_surrogate(override_text or evt.message),
    )
    entities = override_entities or evt.entities
    if entities:
        content.format = Format.HTML
        content.formatted_body = _telegram_entities_to_matrix_catch(
            content.body, entities)

    if prefix_html:
        if not content.formatted_body:
            content.format = Format.HTML
            content.formatted_body = escape(content.body)
        content.formatted_body = prefix_html + content.formatted_body
    if prefix_text:
        content.body = prefix_text + content.body

    if evt.fwd_from:
        await _add_forward_header(source, content, evt.fwd_from)

    if evt.reply_to_msg_id and not no_reply_fallback:
        await _add_reply_header(source, content, evt, main_intent)

    if isinstance(evt, Message) and evt.post and evt.post_author:
        if not content.formatted_body:
            # A formatted body needs its format declared as well, exactly as
            # the prefix_html branch above does; the original skipped this.
            content.format = Format.HTML
            content.formatted_body = escape(content.body)
        content.body += f"\n- {evt.post_author}"
        # The signature is plain text supplied by Telegram, so it has to be
        # escaped before being embedded in the HTML body.
        content.formatted_body += (
            f"<br/><i>- <u>{escape(evt.post_author)}</u></i>")

    content.body = del_surrogate(content.body)
    if content.formatted_body:
        content.formatted_body = del_surrogate(
            content.formatted_body.replace("\n", "<br/>"))
    return content
def unparse(text, entities, delimiters=None, url_fmt=None):
    """
    Performs the reverse operation to .parse(), effectively returning
    markdown-like syntax given a normal text and its MessageEntity's.

    :param text: the text to be reconverted into markdown.
    :param entities: the MessageEntity's applied to the text. A single
        entity object is accepted as well.
    :param delimiters: the delimiters to be used, {delimiter: type}. The
        default delimiters are used when omitted; an explicitly empty
        mapping returns the text untouched.
    :param url_fmt: deprecated; passing it only emits a warning.
    :return: a markdown-like text representing the combination of both inputs.
    """
    if not text or not entities:
        return text

    if not delimiters:
        if delimiters is not None:
            return text
        delimiters = DEFAULT_DELIMITERS

    if url_fmt is not None:
        warnings.warn(
            'url_fmt is deprecated')  # since it complicates everything *a lot*

    if isinstance(entities, TLObject):
        entities = (entities, )

    text = add_surrogate(text)
    # Invert the mapping: entity type -> delimiter string.
    delimiters = {v: k for k, v in delimiters.items()}
    insert_at = []
    for entity in entities:
        s = entity.offset
        e = entity.offset + entity.length
        delimiter = delimiters.get(type(entity), None)
        if delimiter:
            insert_at.append((s, delimiter))
            insert_at.append((e, delimiter))
        else:
            url = None
            if isinstance(entity, MessageEntityTextUrl):
                url = entity.url
            elif isinstance(entity, MessageEntityMentionName):
                url = 'tg://user?id={}'.format(entity.user_id)
            if url:
                insert_at.append((s, '['))
                insert_at.append((e, ']({})'.format(url)))

    # Insert from the end of the text backwards so that earlier positions
    # are not shifted by what has already been inserted.
    insert_at.sort(key=lambda t: t[0])
    while insert_at:
        at, what = insert_at.pop()

        # If we are in the middle of a surrogate pair nudge the position by
        # +1. Otherwise we would end up with malformed text and fail to
        # encode. For example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
        #
        # Only a position *between* a high and a low surrogate splits a
        # pair. A position that merely points at the start of a pair is a
        # valid boundary and must be left alone, otherwise the delimiter
        # would be pushed past every consecutive astral character.
        if (0 < at < len(text)
                and '\ud800' <= text[at - 1] <= '\udbff'
                and '\udc00' <= text[at] <= '\udfff'):
            at += 1

        text = text[:at] + what + text[at:]

    return del_surrogate(text)
def remove_code_and_mentions(message):
    """Return the message text with code and mention entities cut out.

    The text is handled in surrogate-pair form so that entity offsets and
    lengths index it directly; the result is converted back before being
    returned.
    """
    chars = list(add_surrogate(message.message))
    unwanted = (
        types.MessageEntityCode,
        types.MessageEntityMention,
        types.MessageEntityMentionName,
    )
    spans = [
        slice(entity.offset, entity.offset + entity.length)
        for entity, _inner in message.get_entities_text()
        if isinstance(entity, unwanted)
    ]
    # Delete back to front so the spans still to be removed keep their
    # positions.
    for span in spans[::-1]:
        del chars[span]
    return del_surrogate(''.join(chars))
async def _matrix_html_to_telegram(
    client: TelegramClient, html: str
) -> tuple[str, list[TypeMessageEntity]]:
    """Convert Matrix HTML into Telegram text and message entities.

    The HTML is pre-processed (command markers, tabs expanded to four
    spaces), parsed with ``MatrixParser`` in surrogate-pair form, stripped,
    and finally passed through ``_cut_long_message``.

    :raises FormatError: wrapping any exception raised along the way; the
        message includes the HTML in whatever state it had reached.
    """
    try:
        html = command_regex.sub(r"<command>\1</command>", html)
        html = html.replace("\t", " " * 4)
        html = not_command_regex.sub(r"\1", html)

        parser = MatrixParser(client)
        parsed = await parser.parse(add_surrogate(html))
        stripped = del_surrogate(parsed.text.strip())
        message, message_entities = _cut_long_message(
            stripped, parsed.telegram_entities)
        return message, message_entities
    except Exception as err:
        raise FormatError(f"Failed to convert Matrix format: {html}") from err
def matrix_to_telegram(html: str) -> ParsedMessage:
    """Convert Matrix HTML into Telegram text and message entities.

    The HTML is pre-processed (command markers, tabs expanded to four
    spaces, and plain-text mentions when
    ``should_bridge_plaintext_highlights`` is set), parsed with
    ``parse_html`` in surrogate-pair form, stripped, and finally passed
    through ``cut_long_message``.

    :raises FormatError: wrapping any exception raised along the way; the
        message includes the HTML in whatever state it had reached.
    """
    try:
        html = command_regex.sub(r"<command>\1</command>", html)
        html = html.replace("\t", " " * 4)
        html = not_command_regex.sub(r"\1", html)
        if should_bridge_plaintext_highlights:
            html = plain_mention_regex.sub(plain_mention_to_html, html)

        raw_text, raw_entities = parse_html(add_surrogate(html))
        trimmed = del_surrogate(raw_text.strip())
        message, message_entities = cut_long_message(trimmed, raw_entities)
        return message, message_entities
    except Exception as err:
        raise FormatError(f"Failed to convert Matrix format: {html}") from err
async def telegram_to_matrix(
    evt: Message | SponsoredMessage,
    source: au.AbstractUser,
    main_intent: IntentAPI | None = None,
    prefix_text: str | None = None,
    prefix_html: str | None = None,
    override_text: str | None = None,
    override_entities: list[TypeMessageEntity] | None = None,
    no_reply_fallback: bool = False,
    require_html: bool = False,
) -> TextMessageEventContent:
    """Build Matrix text message content from a Telegram message.

    :param evt: the Telegram (or sponsored) message to convert.
    :param source: passed through to the forward/reply header helpers.
    :param main_intent: passed through to the reply header helper.
    :param prefix_text: text prepended to the plain-text body.
    :param prefix_html: HTML prepended to the formatted body.
    :param override_text: used instead of ``evt.message`` when truthy.
    :param override_entities: used instead of ``evt.entities`` when truthy.
    :param no_reply_fallback: when true, no reply header is added even if
        the message is a reply.
    :param require_html: when true, ``content.ensure_has_html()`` is called
        even if the message has no entities.
    :return: the assembled content.
    """
    from html import escape

    # The body stays ordinary text. Surrogate pairs are only needed while
    # entity offsets are being applied, so the conversion is confined to that
    # call. Storing the surrogate form in the body (as the original did, with
    # nothing converting it back) leaves lone surrogates in the returned
    # content.
    content = TextMessageEventContent(
        msgtype=MessageType.TEXT,
        body=override_text or evt.message,
    )
    entities = override_entities or evt.entities
    if entities:
        content.format = Format.HTML
        formatted = await _telegram_entities_to_matrix_catch(
            add_surrogate(content.body), entities)
        content.formatted_body = del_surrogate(formatted)

    if require_html:
        content.ensure_has_html()

    if prefix_html:
        content.ensure_has_html()
        content.formatted_body = prefix_html + content.formatted_body
    if prefix_text:
        content.body = prefix_text + content.body

    # SponsoredMessage may lack these attributes, hence getattr().
    if getattr(evt, "fwd_from", None):
        await _add_forward_header(source, content, evt.fwd_from)

    if getattr(evt, "reply_to", None) and not no_reply_fallback:
        await _add_reply_header(source, content, evt, main_intent)

    if isinstance(evt, Message) and evt.post and evt.post_author:
        content.ensure_has_html()
        content.body += f"\n- {evt.post_author}"
        # The signature is plain text supplied by Telegram, so it has to be
        # escaped before being embedded in the HTML body.
        content.formatted_body += (
            f"<br/><i>- <u>{escape(evt.post_author)}</u></i>")

    return content
def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    :param message: the message with markdown-like syntax to be parsed.
    :param delimiters: the delimiters to be used, {delimiter: type}. The
        default delimiters are used when omitted; an explicitly empty
        mapping returns the message untouched with no entities.
    :param url_re: the URL regex to be used, either compiled or as a string
        pattern. Must have two groups (link text, URL).
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not message:
        return message, []

    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif isinstance(url_re, str):
        url_re = re.compile(url_re)

    if not delimiters:
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    # Build a regex to efficiently test all delimiters at once.
    # Note that the largest delimiter should go first, we don't
    # want ``` to be interpreted as a single back-tick in a code block.
    delim_re = re.compile('|'.join(
        '({})'.format(re.escape(k))
        for k in sorted(delimiters, key=len, reverse=True)))

    # Cannot use a for loop because we need to skip some indices
    i = 0
    result = []

    # Work on the surrogate-pair form of the text so that string indices
    # can be used directly as entity offsets and lengths.
    message = add_surrogate(message)
    while i < len(message):
        m = delim_re.match(message, pos=i)

        # Did we find some delimiter here at `i`?
        if m:
            delim = next(filter(None, m.groups()))

            # +1 to avoid matching right after (e.g. "****")
            end = message.find(delim, i + len(delim) + 1)

            # Did we find the earliest closing tag?
            if end != -1:
                # Remove the delimiter from the string
                message = ''.join((message[:i],
                                   message[i + len(delim):end],
                                   message[end + len(delim):]))

                # Check other affected entities
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > i:
                        # If the old start is also before ours, it is fully
                        # enclosed
                        if ent.offset <= i:
                            ent.length -= len(delim) * 2
                        else:
                            ent.length -= len(delim)

                # Append the found entity
                ent = delimiters[delim]
                if ent == MessageEntityPre:
                    result.append(
                        ent(i, end - i - len(delim), ''))  # has 'lang'
                else:
                    result.append(ent(i, end - i - len(delim)))

                # No nested entities inside code blocks
                if ent in (MessageEntityCode, MessageEntityPre):
                    i = end - len(delim)

                # Re-examine the same position: another delimiter or a URL
                # may start right where the removed delimiter was.
                continue

        elif url_re:
            m = url_re.match(message, pos=i)
            if m:
                # Replace the whole match with only the inline URL text.
                message = ''.join(
                    (message[:m.start()], m.group(1), message[m.end():]))

                # Number of characters removed by the replacement above: the
                # full match minus the link text that was kept. Subtracting
                # len(m.group()) here (the full match again) would always
                # give 0 and leave enclosing entities too long.
                delim_size = m.end() - m.start() - len(m.group(1))
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > m.start():
                        ent.length -= delim_size

                result.append(
                    MessageEntityTextUrl(offset=m.start(),
                                         length=len(m.group(1)),
                                         url=del_surrogate(m.group(2))))
                i += len(m.group(1))
                continue

        i += 1

    message = strip_text(message, result)
    return del_surrogate(message), result
def _process_text(self, params):
    """Fill ``params`` with the text (and geo point) of the source message.

    May set ``params['message']``, ``params['lat']`` and ``params['long']``,
    and removes the ``Fwd`` / ``Geo`` attachments it consumes from
    ``self.attachments``.

    :param params: dict to be filled in place.
    :return: ``False`` when the source has no text or when a ``Page``
        attachment exists (only its title is used as the message);
        ``True`` otherwise.
    """
    if not self.source.text:
        return False

    append_from = self.fwd == FWD_APPEND
    fwd = None
    for att in reversed(self.attachments):
        if append_from and isinstance(att, Fwd):
            fwd = _type_in_list(reversed(self.attachments), Fwd)
            # Fwd.url is already resolved here
            #
            # Entity offsets are consumed in surrogate-pair form elsewhere in
            # this method (see the add_surrogate note below), so the offset
            # of the appended URL is measured on that form as well. Using the
            # plain len() would be short by one for every astral character
            # in the text.
            # NOTE(review): the "- 2" on the length is kept from the
            # original; its intent is not visible here - confirm.
            self.source.entities.append(
                types.MessageEntityUrl(
                    len(add_surrogate(self.source.raw_text)) + 2,
                    len(fwd.url) - 2))
            self.source.text += '\n\n' + fwd.url
            append_from = False
            continue
        if not isinstance(att, Url):
            continue
        # The whole text is just this link: use its title, if any.
        if self.source.text == str(att.url):
            if att.title:
                params['message'] = att.title
            return True

    if fwd:
        self.attachments.remove(fwd)

    text_urls = []
    for e, inner_text in self.source.get_entities_text():
        # NOTE no MessageEntityMentionName usage examples/documentation
        # available so assume it is same as MessageEntityMention
        if isinstance(
                e,
                (types.MessageEntityMention, types.MessageEntityMentionName)):
            text_urls.append(
                types.MessageEntityTextUrl(
                    e.offset, e.length, 'https://t.me/' + inner_text[1:]))
            continue
        if isinstance(e, types.MessageEntityTextUrl):
            text_urls.append(e)

    geo = _type_in_list(self.attachments, Geo)
    if geo:
        self.attachments.remove(geo)
        params['lat'] = geo.lat
        params['long'] = geo.long

    # if this is a rich text
    rich_page = _type_in_list(self.attachments, Page)
    if rich_page:
        params['message'] = rich_page.title
        return False

    if text_urls:
        # add_surrogate/del_surrogate are used by Telethon internally in
        # get_entities_text -> get_inner_text to get correct offsets in
        # unicode
        raw_text = add_surrogate(self.source.raw_text)
        msg = []
        prev = 0
        for tu in text_urls:
            title = del_surrogate(raw_text[prev:(tu.offset + tu.length)])
            # link titles to telegraph photos look like \u200b\u200b
            if _ZERO_CHARS.match(title):
                continue
            msg.append(title)
            msg.append(' (' + tu.url + ') ')
            prev = tu.offset + tu.length
        msg.append(del_surrogate(raw_text[prev:]))
        del raw_text
        params['message'] = ''.join(msg)
    else:
        params['message'] = self.source.raw_text
    return True
def _process_rich_text(self):
    """Turn a long, formatted source message into a ``Page`` attachment.

    A message qualifies when it has entities, at least one of them is bold,
    italic, pre or code, and its raw text is at least
    ``rich_text_min_length`` characters long (``xpost`` config section,
    default 256). The page title is the start of the text cut at the first
    newline, ``'. '`` or ``', '`` found in the title window, and the page
    body is the text with each entity rendered through a markup template.

    :return: ``True`` when a ``Page`` was appended to ``self.attachments``,
        ``False`` when the message does not qualify.
    """
    min_length = config.getint('xpost', 'rich_text_min_length', fallback=256)

    if not self.source.entities:
        return False
    styled_kinds = (
        types.MessageEntityBold,
        types.MessageEntityItalic,
        types.MessageEntityPre,
        types.MessageEntityCode,
    )
    if not _type_in_list(self.source.entities, styled_kinds):
        return False
    if len(self.source.raw_text) < min_length:
        return False

    min_title_length = config.getint('xpost', 'min_title_length', fallback=8)
    max_title_length = min_length // 4

    # Earliest separator inside the title window; the fixed fallback keeps
    # room for the trailing '...'.
    cut_candidates = [
        self.source.raw_text.find(separator, min_title_length, max_title_length)
        for separator in ('\n', '. ', ', ')
    ]
    cut_candidates.append(max_title_length - 3)
    pos = min(candidate for candidate in cut_candidates if candidate != -1)
    title = self.source.raw_text[:pos].strip() + '...'

    # {0} is the entity's inner text, {1} its link target (when it has one).
    templates = {
        types.MessageEntityBold: '<b>{0}</b>',
        types.MessageEntityItalic: '<i>{0}</i>',
        types.MessageEntityPre: '<pre>{0}</pre>',
        types.MessageEntityCode: '<code>{0}</code>',
        types.MessageEntityMention: '[https://t.me/{1}|{0}]',
        types.MessageEntityMentionName: '[https://t.me/{1}|{0}]',
        types.MessageEntityUrl: '[{0}]',
        types.MessageEntityTextUrl: '[{1}|{0}]',
    }
    mention_kinds = (types.MessageEntityMention, types.MessageEntityMentionName)

    # add_surrogate/del_surrogate are used by Telethon internally in
    # get_entities_text -> get_inner_text to get correct offsets in unicode
    surrogate_text = add_surrogate(self.source.raw_text)
    pieces = []
    cursor = 0
    for entity, inner in self.source.get_entities_text():
        # Plain text between the previous entity and this one.
        pieces.append(del_surrogate(surrogate_text[cursor:entity.offset]))

        # NOTE no MessageEntityMentionName usage examples/documentation
        # available so assume it is same as MessageEntityMention
        if isinstance(entity, mention_kinds):
            target = inner[1:]
        elif isinstance(entity, types.MessageEntityTextUrl):
            target = entity.url
        else:
            target = None

        template = templates.get(type(entity), '{0}')
        pieces.append(del_surrogate(template.format(inner, target)))
        cursor = entity.offset + entity.length

    # Whatever follows the last entity.
    pieces.append(del_surrogate(surrogate_text[cursor:]))
    del surrogate_text

    self.attachments.append(
        Page(
            self.session,
            self.default_params,
            self.group_id,
            title,
            ''.join(pieces),
            self.attachments,
        ))
    return True