def get_parser():
    """Assemble the MarkdownParser used for inline message formatting.

    The token list contains, in matching order: the simple Markdown tags
    (bold/italic combinations, backtick spans that are skipped,
    strikethrough, underline), a ``[label](url)`` link token, and a
    line-break token.

    Returns:
        A ``MarkdownParser`` built from that token list.
    """
    protocol_re = re.compile(r'(?i)^[a-z][\w-]+:/{1,3}')

    def url_complete(url):
        """Return *url* unchanged if it has a protocol, else prefix http://"""
        if protocol_re.search(url):
            return url
        return 'http://' + url

    # (tag pattern, segment parameters); longer tags come before their
    # shorter prefixes, exactly as in the token list handed to the parser.
    tag_specs = (
        (r'\*\*\*', {'is_bold': True, 'is_italic': True}),
        (r'___', {'is_bold': True, 'is_italic': True}),
        (r'\*\*', {'is_bold': True}),
        (r'__', {'is_bold': True}),
        (r'\*', {'is_italic': True}),
        (r'_', {'is_italic': True}),
        (r'```', {'skip': True}),
        (r'``', {'skip': True}),
        (r'`', {'skip': True}),
        (r'~~', {'is_strikethrough': True}),
        (r'==', {'is_underline': True}),
    )
    tokens = [MarkdownTag(pattern, **params) for pattern, params in tag_specs]

    # Markdown link: the label becomes the text, the URL the link target.
    tokens.append(
        Token(
            'link',
            r'(?<!\\)\[(?P<link>.+?)\]\((?P<url>.+?)\)',
            text=MatchGroup('link'),
            link_target=MatchGroup('url', func=url_complete),
        )
    )
    # Either newline style is emitted as a single LINE_BREAK segment.
    tokens.append(
        Token('br', r'\n|\r\n', text='\n', segment_type="LINE_BREAK")
    )
    return MarkdownParser(tokens)
def get_parser():
    """Assemble a Parser for inline Markdown using boundary-aware patterns.

    Each simple tag gets a start pattern and an end pattern. A start tag
    must follow a boundary character (or the start of the text), must not
    be escaped with a backslash, and must not be followed by whitespace or
    by the same tag again; the end pattern mirrors those rules.

    Returns:
        A ``Parser`` built from the tag tokens, a ``[label](url)`` link
        token and a line-break token, in that order.
    """
    # The doubled braces survive the later str.format() call as single ones.
    boundary_chars = r'\s`!()\[\]{{}};:\'".,<>?«»“”‘’*_~='
    char_class = r'[' + boundary_chars + r']'
    left_boundary = r'(?:(?<=' + char_class + r')|(?<=^))'   # Lookbehind
    right_boundary = r'(?:(?=' + char_class + r')|(?=$))'    # Lookahead
    start_template = left_boundary + r'(?<!\\){tag}(?!\s)(?!{tag})'
    end_template = r'(?<!{tag})(?<!\s)(?<!\\){tag}' + right_boundary
    protocol_re = re.compile(r'(?i)^[a-z][\w-]+:/{1,3}')

    def markdown(tag):
        """Return the (start, end) regex patterns for a Markdown tag"""
        return tuple(
            template.format(tag=tag)
            for template in (start_template, end_template)
        )

    def url_complete(url):
        """Return *url* unchanged if it has a protocol, else prefix http://"""
        if protocol_re.search(url):
            return url
        return 'http://' + url

    # (token name, tag pattern, segment parameters), in matching order.
    tag_specs = (
        ('bi1', r'\*\*\*', {'is_bold': True, 'is_italic': True}),
        ('bi2', r'___', {'is_bold': True, 'is_italic': True}),
        ('b1', r'\*\*', {'is_bold': True}),
        ('b2', r'__', {'is_bold': True}),
        ('i1', r'\*', {'is_italic': True}),
        ('i2', r'_', {'is_italic': True}),
        ('s', r'~~', {'is_strikethrough': True}),
        ('u', r'==', {'is_underline': True}),
    )
    tokens = [
        Token(name, *markdown(tag), **params)
        for name, tag, params in tag_specs
    ]

    # Markdown link: the label becomes the text, the URL the link target.
    tokens.append(
        Token(
            'link',
            r'(?<!\\)\[(?P<link>.+?)\]\((?P<url>.+?)\)',
            text=MatchGroup('link'),
            link_target=MatchGroup('url', func=url_complete),
        )
    )
    # Either newline style is emitted as a single LINE_BREAK segment.
    tokens.append(
        Token('br', r'\n|\r\n', text='\n', segment_type="LINE_BREAK")
    )
    return Parser(tokens)
class Tokens:
    """Named token groups that ChatMessageParser can be built from.

    ``basic`` holds auto-links and line breaks, ``markdown`` the Markdown
    tags and links, ``html`` the HTML tags, links, images and ``<br>``.
    """

    basic = [
        Token(
            'link',
            auto_link,
            link_target=MatchGroup('start', func=url_complete),
        ),
        Token(
            'br',
            newline,
            text='\n',
            segment_type=hangouts_pb2.SEGMENT_TYPE_LINE_BREAK,
        ),
    ]

    # Inside the comprehension ``markdown`` still names the module-level
    # helper; the class attribute is only bound once the list is complete.
    markdown = [
        Token(name, *markdown(tag), **params)
        for name, tag, params in (
            ('md_bi1', r'\*\*\*', {'is_bold': True, 'is_italic': True}),
            ('md_bi2', r'___', {'is_bold': True, 'is_italic': True}),
            ('md_b1', r'\*\*', {'is_bold': True}),
            ('md_b2', r'__', {'is_bold': True}),
            ('md_i1', r'\*', {'is_italic': True}),
            ('md_i2', r'_', {'is_italic': True}),
            ('md_pre3', r'```', {'skip': True}),
            ('md_pre2', r'``', {'skip': True}),
            ('md_pre1', r'`', {'skip': True}),
            ('md_s', r'~~', {'is_strikethrough': True}),
            ('md_u', r'==', {'is_underline': True}),
        )
    ] + [
        Token(
            'md_link',
            markdown_link,
            text=MatchGroup('link'),
            link_target=MatchGroup('url', func=url_complete),
        ),
    ]

    # Same pattern as above: ``html`` here is the module-level helper.
    html = [
        Token(name, *html(tag), **params)
        for name, tag, params in (
            ('html_b1', r'b', {'is_bold': True}),
            ('html_b2', r'strong', {'is_bold': True}),
            ('html_i1', r'i', {'is_italic': True}),
            ('html_i2', r'em', {'is_italic': True}),
            ('html_s1', r's', {'is_strikethrough': True}),
            ('html_s2', r'strike', {'is_strikethrough': True}),
            ('html_s3', r'del', {'is_strikethrough': True}),
            ('html_u1', r'u', {'is_underline': True}),
            ('html_u2', r'ins', {'is_underline': True}),
            ('html_u3', r'mark', {'is_underline': True}),
            ('html_pre', r'pre', {'skip': True}),
        )
    ] + [
        Token(
            'html_link',
            html_link,
            text=MatchGroup('link'),
            link_target=MatchGroup('url', func=url_complete),
        ),
        # An image is rendered as a link whose text is its own URL.
        Token(
            'html_img',
            html_img,
            text=MatchGroup('url'),
            link_target=MatchGroup('url', func=url_complete),
        ),
        Token(
            'html_br',
            html_newline,
            text='\n',
            segment_type=hangouts_pb2.SEGMENT_TYPE_LINE_BREAK,
        ),
    ]
class Tokens:
    """Token groups that ChatMessageParser can be built from.

    ``basic`` holds auto-links and line breaks, ``markdown`` the Markdown
    tags and links, ``html`` the HTML tags, links, images and ``<br>``.
    """

    basic = [
        Token(auto_link, link_target=MatchGroup('text', func=url_complete)),
        Token(newline, text='\n', segment_type=SegmentType.LINE_BREAK),
    ]

    # Inside the comprehension ``markdown`` still names the module-level
    # pattern template; the class attribute is bound only afterwards.
    markdown = [
        Token(markdown.format(tag=tag), **params)
        for tag, params in (
            (r'\*\*\*', {'is_bold': True, 'is_italic': True}),
            (r'___', {'is_bold': True, 'is_italic': True}),
            (r'\*\*', {'is_bold': True}),
            (r'__', {'is_bold': True}),
            (r'\*', {'is_italic': True}),
            (r'_', {'is_italic': True}),
            (r'~~', {'is_strikethrough': True}),
            (r'==', {'is_underline': True}),
        )
    ] + [
        Token(markdown_link, link_target=MatchGroup('url', func=url_complete)),
    ]

    # Same pattern as above: ``html`` here is the module-level template.
    html = [
        Token(html.format(tag=tag), **params)
        for tag, params in (
            (r'b', {'is_bold': True}),
            (r'strong', {'is_bold': True}),
            (r'i', {'is_italic': True}),
            (r'em', {'is_italic': True}),
            (r's', {'is_strikethrough': True}),
            (r'strike', {'is_strikethrough': True}),
            (r'del', {'is_strikethrough': True}),
            (r'u', {'is_underline': True}),
            (r'ins', {'is_underline': True}),
            (r'mark', {'is_underline': True}),
        )
    ] + [
        Token(html_link, link_target=MatchGroup('url', func=url_complete)),
        # An image is rendered as a link whose text is its completed URL.
        Token(
            html_img,
            text=MatchGroup('url', func=url_complete),
            link_target=MatchGroup('url', func=url_complete),
        ),
        Token(html_newline, text='\n', segment_type=SegmentType.LINE_BREAK),
    ]
class SlackMessageParser(message_parser.ChatMessageParser):
    """Convert message formatting between Slack "mrkdwn" and hangups.

    With ``from_slack=True`` the parser reads Slack formatting and
    ``convert`` writes Markdown-style text (``**bold**``,
    ``[label](target)``). With ``from_slack=False`` the parent's default
    tokens are used for parsing and ``convert`` writes Slack formatting
    (``*bold*``, ``<target|label>``).
    """

    # Tokens to parse Slack's "mrkdwn" formatting into hangups segments.
    slack_tokens = [
        Token("sl_b", *tag(r"\*"), is_bold=True),
        Token("sl_i", *tag(r"_"), is_italic=True),
        Token("sl_s", *tag(r"~"), is_strikethrough=True),
        Token("sl_pre", *tag(r"```"), skip=True),
        Token("sl_code", *tag(r"`"), skip=True),
        # Don't use func=message_parser.url_complete here.
        # We want to preserve Slack-specific targets (e.g. user links).
        Token("sl_link1", r"<(?P<url>[^>]+?)\|(?P<text>.+?)>",
              text=MatchGroup("text"), link_target=MatchGroup("url")),
        Token("sl_link2", r"<(?P<url>.+?)>",
              text=MatchGroup("url"), link_target=MatchGroup("url")),
    ]

    def __init__(self, from_slack):
        """Set up the token set and the output tags for one direction.

        Args:
            from_slack: True to parse Slack text, False to parse
                hangups-style text destined for Slack.
        """
        if from_slack:
            # Take the basic token set, add Slack formatting.
            super().__init__(message_parser.Tokens.basic + self.slack_tokens)
        else:
            # Use hangups' standard tokens for HTML and Markdown.
            super().__init__()
        # Tags written by convert() for the *target* format.
        self.bold = "**" if from_slack else "*"
        self.italic = "_"
        # NOTE(review): "~" is emitted in both directions; confirm the
        # consumer of from_slack output does not expect "~~".
        self.strike = "~"
        self.from_slack = from_slack

    def convert(self, source, slack):
        """Render a message in the target platform's formatting.

        Args:
            source: Either a string, which is parsed with this parser, or
                an iterable of already-built segments exposing ``type_``,
                ``text``, ``link_target``, ``is_bold``, ``is_italic`` and
                ``is_strikethrough``.
            slack: Object whose ``users`` and ``channels`` mappings are
                keyed by ID and hold entries with a ``"name"`` item; used
                to resolve Slack user and channel links.

        Returns:
            The formatted message text.
        """
        if isinstance(source, str):
            # Parse, then convert reparser.Segment to
            # hangups.ChatMessageSegment.
            segments = [
                ChatMessageSegment(seg.text, **seg.params)
                for seg in self.parse(source)
            ]
        else:
            # We'll assume it's already a ChatMessageSegment list.
            segments = source

        parts = []      # Output fragments, joined once at the end.
        current = []    # Formatting tags currently open, in opening order.
        for seg in segments:
            if seg.type_ == hangouts_pb2.SEGMENT_TYPE_LINE_BREAK:
                # Insert closing tags for all current formatting, in
                # reverse order.
                parts.extend(reversed(current))
                # Start a new line.
                parts.append("\n")
                # Now reinsert the current formatting.
                parts.extend(current)
                continue

            if self.from_slack:
                # Slack escapes its three control characters as HTML
                # entities. Decode "&amp;" last so that "&amp;lt;" ends up
                # as the literal text "&lt;" rather than "<".
                text = (seg.text.replace("&gt;", ">")
                                .replace("&lt;", "<")
                                .replace("&amp;", "&"))
                if seg.link_target:
                    if seg.link_target[0] == "@":
                        # User link, just replace with the plain username.
                        user = seg.link_target[1:]
                        if user in slack.users:
                            user = slack.users[user]["name"]
                        text = "@{}".format(user)
                    elif seg.link_target[0] == "#":
                        # Channel link, just replace with the plain
                        # channel name.
                        channel = seg.link_target[1:]
                        if channel in slack.channels:
                            channel = slack.channels[channel]["name"]
                        text = "#{}".format(channel)
                    else:
                        # Markdown link: [label](target)
                        text = "[{}]({})".format(
                            text,
                            message_parser.url_complete(seg.link_target))
            else:
                # Escape for Slack. "&" must go first, otherwise the
                # ampersands of the entities added for ">" and "<" would
                # themselves be escaped.
                text = (seg.text.replace("&", "&amp;")
                                .replace(">", "&gt;")
                                .replace("<", "&lt;"))
                if seg.link_target:
                    if text == seg.link_target:
                        # Slack implicit link: <target>
                        text = "<{}>".format(seg.link_target)
                    else:
                        # Slack link with label: <target|label>
                        text = "<{}|{}>".format(seg.link_target, text)

            # Compare formatting of the previous segment to the current one.
            formatting = {
                self.bold: seg.is_bold,
                self.italic: seg.is_italic,
                self.strike: seg.is_strikethrough,
            }
            # Insert closing tags for any formatting that ends here.
            # Apply in reverse order to opened tags. Iterate over a copy
            # because tags are removed from ``current`` along the way.
            for chars in reversed(list(current)):
                if not formatting[chars]:
                    parts.append(chars)
                    current.remove(chars)
            # Insert opening tags for any formatting that starts here.
            for chars, condition in formatting.items():
                if condition and chars not in current:
                    parts.append(chars)
                    current.append(chars)
            # XXX: May generate tags closed in the wrong order:
            #   *bold _bold+italic* italic_
            # Testing suggests both Slack and Hangouts can cope with this
            # though.
            parts.append(text)

        # Close any remaining format tags.
        parts.extend(reversed(current))
        return "".join(parts)
def markdown(tag):
    """Return sequence of start and end regex patterns for simple Markdown tag

    ``tag`` is substituted into the module-level ``markdown_start`` and
    ``markdown_end`` templates, so it must already be regex-escaped
    (e.g. ``r'\*\*'``). Those templates are defined below; they only need
    to exist by the time this function is called.
    """
    return (markdown_start.format(tag=tag), markdown_end.format(tag=tag))


# Characters that may sit directly next to a tag for it to count as markup.
boundary_chars = r'\s`!\'".,<>?*_~='
b_left = r'(?:(?<=[' + boundary_chars + r'])|(?<=^))'  # Lookbehind
b_right = r'(?:(?=[' + boundary_chars + r'])|(?=$))'  # Lookahead
# Opening tag: after a boundary, not backslash-escaped, not followed by
# whitespace or by the same tag again.
markdown_start = b_left + r'(?<!\\){tag}(?!\s)(?!{tag})'
# Closing tag: not preceded by the same tag, whitespace or a backslash,
# and followed by a boundary.
markdown_end = r'(?<!{tag})(?<!\s)(?<!\\){tag}' + b_right

# Formatting recognised here: **bold**, _italic_ and `pre`.
tokens = [
    Token('b', *markdown(r'\*\*'), bold=True),
    Token('i', *markdown(r'_'), italic=True),
    Token('pre', *markdown(r'`'), pre=True)
]

parser = Parser(tokens)


def hangups_markdown_to_telegram(text, debug=False):
    # NOTE(review): only the opening statements of this function are
    # visible in this excerpt; the rest of the body lies outside it, so
    # its output and the use of ``debug`` are not documented here.
    # The input is split on "\n" and each line is parsed separately.
    lines = text.split("\n")
    nlines = []
    output = ""
    for line in lines:
        single_line = ""
        segments = parser.parse(line)
def markdown1(tag):
    """Build the (start, end) regex patterns for a simple Markdown tag.

    ``tag`` is substituted into the ``markdown1_start`` and
    ``markdown1_end`` templates, so it must already be regex-escaped.
    """
    return tuple(
        template.format(tag=tag)
        for template in (markdown1_start, markdown1_end)
    )


# Characters that may sit directly next to a tag for it to count as markup.
boundary1_chars = r'\s`!\'".,<>?*_~='

# slack to hangups
b1_left = r'(?:(?<=[' + boundary1_chars + r'])|(?<=^))'
b1_right = r'(?:(?=[' + boundary1_chars + r'])|(?=$))'
# Opening tag: after a boundary, not backslash-escaped, not followed by
# whitespace or by the same tag again.
markdown1_start = b1_left + r'(?<!\\){tag}(?!\s)(?!{tag})'
# Closing tag: not preceded by the same tag, whitespace or a backslash,
# and followed by a boundary.
markdown1_end = r'(?<!{tag})(?<!\s)(?<!\\){tag}' + b1_right

# (token name, tag pattern, segment parameters), in matching order.
tokens_slack_to_hangups = [
    Token(name, *markdown1(tag), **params)
    for name, tag, params in (
        ('b', r'\*', {'is_bold': True}),
        ('i', r'_', {'is_italic': True}),
        ('pre1', r'`', {'skip': True}),
        ('pre2', r'```', {'skip': True}),
    )
]

parser_slack_to_hangups = Parser(tokens_slack_to_hangups)


# hangups to slack
def markdown2(tag):
    """Build the (start, end) regex patterns for a simple Markdown tag.

    Same as ``markdown1`` but based on the ``markdown2_start`` and
    ``markdown2_end`` templates.
    """
    return tuple(
        template.format(tag=tag)
        for template in (markdown2_start, markdown2_end)
    )