Exemple #1
0
def get_parser():
    """Assemble a MarkdownParser for emphasis, code, link and newline tokens.

    Returns:
        A ``MarkdownParser`` built from the token list defined below.
    """
    has_protocol = re.compile(r'(?i)^[a-z][\w-]+:/{1,3}')
    link_pattern = r'(?<!\\)\[(?P<link>.+?)\]\((?P<url>.+?)\)'
    line_break_pattern = r'\n|\r\n'

    def url_complete(url):
        """Return *url* as-is when it has a protocol, else prefix http://"""
        if has_protocol.search(url):
            return url
        return 'http://' + url

    # (delimiter regex, keyword arguments) pairs. The order is significant
    # and kept as-is: longer delimiters come before their shorter prefixes.
    tag_specs = (
        (r'\*\*\*', {'is_bold': True, 'is_italic': True}),
        (r'___', {'is_bold': True, 'is_italic': True}),
        (r'\*\*', {'is_bold': True}),
        (r'__', {'is_bold': True}),
        (r'\*', {'is_italic': True}),
        (r'_', {'is_italic': True}),
        (r'```', {'skip': True}),
        (r'``', {'skip': True}),
        (r'`', {'skip': True}),
        (r'~~', {'is_strikethrough': True}),
        (r'==', {'is_underline': True}),
    )
    tokens = [MarkdownTag(delimiter, **flags) for delimiter, flags in tag_specs]

    # [label](target) links; the target gets a protocol if it lacks one.
    tokens.append(
        Token(
            'link',
            link_pattern,
            text=MatchGroup('link'),
            link_target=MatchGroup('url', func=url_complete),
        )
    )
    # Both "\n" and "\r\n" become a single line-break segment.
    tokens.append(
        Token('br', line_break_pattern, text='\n', segment_type="LINE_BREAK")
    )

    return MarkdownParser(tokens)
Exemple #2
0
def get_parser():
    """Assemble a Parser for Markdown emphasis, links and line breaks.

    Returns:
        A ``Parser`` built from the token list defined below.
    """
    # Braces are doubled because every pattern containing this class is later
    # passed through str.format(), which collapses them to single braces.
    boundary_chars = r'\s`!()\[\]{{}};:\'".,<>?«»“”‘’*_~='
    b_left = r'(?:(?<=[%s])|(?<=^))' % boundary_chars  # Lookbehind
    b_right = r'(?:(?=[%s])|(?=$))' % boundary_chars  # Lookahead

    markdown_start = ''.join((b_left, r'(?<!\\){tag}(?!\s)(?!{tag})'))
    markdown_end = ''.join((r'(?<!{tag})(?<!\s)(?<!\\){tag}', b_right))
    markdown_link = r'(?<!\\)\[(?P<link>.+?)\]\((?P<url>.+?)\)'
    newline = r'\n|\r\n'

    has_protocol = re.compile(r'(?i)^[a-z][\w-]+:/{1,3}')

    def markdown(tag):
        """Return the (start, end) regex pattern pair for a Markdown tag"""
        opening = markdown_start.format(tag=tag)
        closing = markdown_end.format(tag=tag)
        return opening, closing

    def url_complete(url):
        """Return *url* as-is when it has a protocol, else prefix http://"""
        if has_protocol.search(url):
            return url
        return 'http://' + url

    # (token name, delimiter regex, keyword arguments); order kept as-is so
    # longer delimiters are listed before their shorter prefixes.
    tag_specs = (
        ('bi1', r'\*\*\*', {'is_bold': True, 'is_italic': True}),
        ('bi2', r'___', {'is_bold': True, 'is_italic': True}),
        ('b1', r'\*\*', {'is_bold': True}),
        ('b2', r'__', {'is_bold': True}),
        ('i1', r'\*', {'is_italic': True}),
        ('i2', r'_', {'is_italic': True}),
        ('s', r'~~', {'is_strikethrough': True}),
        ('u', r'==', {'is_underline': True}),
    )
    tokens = [
        Token(name, *markdown(tag), **flags) for name, tag, flags in tag_specs
    ]
    tokens.append(
        Token(
            'link',
            markdown_link,
            text=MatchGroup('link'),
            link_target=MatchGroup('url', func=url_complete),
        )
    )
    tokens.append(Token('br', newline, text='\n', segment_type="LINE_BREAK"))

    return Parser(tokens)
Exemple #3
0
class Tokens:
    """Token groups for ChatMessageParser: ``basic``, ``markdown``, ``html``.

    Each attribute is a list of ``Token`` objects; the order inside each list
    is preserved exactly because longer delimiters precede shorter ones.
    """

    # Automatic links and plain newlines.
    basic = [
        Token(
            'link',
            auto_link,
            link_target=MatchGroup('start', func=url_complete),
        ),
        Token(
            'br',
            newline,
            text='\n',
            segment_type=hangouts_pb2.SEGMENT_TYPE_LINE_BREAK,
        ),
    ]

    # Paired Markdown delimiters first, then the [label](target) link token.
    markdown = [
        Token(name, *markdown(tag), **flags)
        for name, tag, flags in (
            ('md_bi1', r'\*\*\*', {'is_bold': True, 'is_italic': True}),
            ('md_bi2', r'___', {'is_bold': True, 'is_italic': True}),
            ('md_b1', r'\*\*', {'is_bold': True}),
            ('md_b2', r'__', {'is_bold': True}),
            ('md_i1', r'\*', {'is_italic': True}),
            ('md_i2', r'_', {'is_italic': True}),
            ('md_pre3', r'```', {'skip': True}),
            ('md_pre2', r'``', {'skip': True}),
            ('md_pre1', r'`', {'skip': True}),
            ('md_s', r'~~', {'is_strikethrough': True}),
            ('md_u', r'==', {'is_underline': True}),
        )
    ] + [
        Token(
            'md_link',
            markdown_link,
            text=MatchGroup('link'),
            link_target=MatchGroup('url', func=url_complete),
        ),
    ]

    # Paired HTML tags first, then link, image and <br>-style tokens.
    html = [
        Token(name, *html(tag), **flags)
        for name, tag, flags in (
            ('html_b1', r'b', {'is_bold': True}),
            ('html_b2', r'strong', {'is_bold': True}),
            ('html_i1', r'i', {'is_italic': True}),
            ('html_i2', r'em', {'is_italic': True}),
            ('html_s1', r's', {'is_strikethrough': True}),
            ('html_s2', r'strike', {'is_strikethrough': True}),
            ('html_s3', r'del', {'is_strikethrough': True}),
            ('html_u1', r'u', {'is_underline': True}),
            ('html_u2', r'ins', {'is_underline': True}),
            ('html_u3', r'mark', {'is_underline': True}),
            ('html_pre', r'pre', {'skip': True}),
        )
    ] + [
        Token(
            'html_link',
            html_link,
            text=MatchGroup('link'),
            link_target=MatchGroup('url', func=url_complete),
        ),
        # Images are rendered as a link whose text is the raw image URL.
        Token(
            'html_img',
            html_img,
            text=MatchGroup('url'),
            link_target=MatchGroup('url', func=url_complete),
        ),
        Token(
            'html_br',
            html_newline,
            text='\n',
            segment_type=hangouts_pb2.SEGMENT_TYPE_LINE_BREAK,
        ),
    ]
Exemple #4
0
class Tokens:
    """Token groups for ChatMessageParser: ``basic``, ``markdown``, ``html``.

    Each attribute is a list of ``Token`` objects; the order inside each list
    is preserved exactly because longer delimiters precede shorter ones.
    """

    # Automatic links and plain newlines.
    basic = [
        Token(auto_link, link_target=MatchGroup('text', func=url_complete)),
        Token(newline, text='\n', segment_type=SegmentType.LINE_BREAK),
    ]

    # Paired Markdown delimiters first, then the link token.
    markdown = [
        Token(markdown.format(tag=tag), **flags)
        for tag, flags in (
            (r'\*\*\*', {'is_bold': True, 'is_italic': True}),
            (r'___', {'is_bold': True, 'is_italic': True}),
            (r'\*\*', {'is_bold': True}),
            (r'__', {'is_bold': True}),
            (r'\*', {'is_italic': True}),
            (r'_', {'is_italic': True}),
            (r'~~', {'is_strikethrough': True}),
            (r'==', {'is_underline': True}),
        )
    ] + [
        Token(markdown_link, link_target=MatchGroup('url', func=url_complete)),
    ]

    # Paired HTML tags first, then link, image and line-break tokens.
    html = [
        Token(html.format(tag=tag), **flags)
        for tag, flags in (
            (r'b', {'is_bold': True}),
            (r'strong', {'is_bold': True}),
            (r'i', {'is_italic': True}),
            (r'em', {'is_italic': True}),
            (r's', {'is_strikethrough': True}),
            (r'strike', {'is_strikethrough': True}),
            (r'del', {'is_strikethrough': True}),
            (r'u', {'is_underline': True}),
            (r'ins', {'is_underline': True}),
            (r'mark', {'is_underline': True}),
        )
    ] + [
        Token(html_link, link_target=MatchGroup('url', func=url_complete)),
        # Images become a link whose text and target are the completed URL.
        Token(
            html_img,
            text=MatchGroup('url', func=url_complete),
            link_target=MatchGroup('url', func=url_complete),
        ),
        Token(html_newline, text='\n', segment_type=SegmentType.LINE_BREAK),
    ]
Exemple #5
0
class SlackMessageParser(message_parser.ChatMessageParser):
    """Convert message formatting between Slack "mrkdwn" and hangups Markdown.

    With ``from_slack=True`` the parser tokenizes Slack markup and
    ``convert`` emits Markdown (``**bold**``, ``[label](target)``).
    Otherwise the parent's default tokens are used and ``convert`` emits
    Slack markup (``*bold*``, ``<target|label>``).
    """

    # Tokens to parse Slack's "mrkdwn" formatting into hangups segments.
    slack_tokens = [
        Token("sl_b", *tag(r"\*"), is_bold=True),
        Token("sl_i", *tag(r"_"), is_italic=True),
        Token("sl_s", *tag(r"~"), is_strikethrough=True),
        Token("sl_pre", *tag(r"```"), skip=True),
        Token("sl_code", *tag(r"`"), skip=True),
        # Don't use func=message_parser.url_complete here.
        # We want to preserve Slack-specific targets (e.g. user links).
        Token("sl_link1",
              r"<(?P<url>[^>]+?)\|(?P<text>.+?)>",
              text=MatchGroup("text"),
              link_target=MatchGroup("url")),
        Token("sl_link2",
              r"<(?P<url>.+?)>",
              text=MatchGroup("url"),
              link_target=MatchGroup("url"))
    ]

    def __init__(self, from_slack):
        """Set up the token set and output markers for one direction.

        Args:
            from_slack: True to parse Slack markup and emit Markdown; False
                to parse with the parent's default tokens and emit Slack
                markup.
        """
        if from_slack:
            # Take the basic token set, add Slack formatting.
            super().__init__(message_parser.Tokens.basic + self.slack_tokens)
        else:
            # Use hangups' standard tokens for HTML and Markdown.
            super().__init__()
        # Markers written by convert() for the *target* format.
        self.bold = "**" if from_slack else "*"
        self.italic = "_"
        # Markdown strikethrough is delimited by "~~"; Slack uses a single
        # "~". A single "~" in Markdown output would stay literal.
        self.strike = "~~" if from_slack else "~"
        self.from_slack = from_slack

    def _text_from_slack(self, seg, slack):
        """Return the Markdown text for one segment parsed from Slack."""
        # Unescape "&amp;" last so "&amp;lt;" ends up as "&lt;", not "<".
        text = (seg.text.replace("&gt;", ">")
                .replace("&lt;", "<")
                .replace("&amp;", "&"))
        target = seg.link_target
        if not target:
            return text
        if target[0] == "@":
            # User link, just replace with the plain username.
            user = target[1:]
            if user in slack.users:
                user = slack.users[user]["name"]
            return "@{}".format(user)
        if target[0] == "#":
            # Channel link, just replace with the plain channel name.
            channel = target[1:]
            if channel in slack.channels:
                channel = slack.channels[channel]["name"]
            return "#{}".format(channel)
        # Markdown link: [label](target)
        return "[{}]({})".format(text, message_parser.url_complete(target))

    @staticmethod
    def _text_to_slack(seg):
        """Return the Slack-escaped text (or link markup) for one segment."""
        # Escape "&" first so the entities added afterwards stay intact.
        text = (seg.text.replace("&", "&amp;")
                .replace(">", "&gt;")
                .replace("<", "&lt;"))
        target = seg.link_target
        if not target:
            return text
        if text == target:
            # Slack implicit link: <target>
            return "<{}>".format(target)
        # Slack link with label: <target|label>
        return "<{}|{}>".format(target, text)

    def convert(self, source, slack):
        """Render *source* in the target markup for this parser's direction.

        Args:
            source: Either a string, which is parsed with this parser's
                tokens, or an iterable of already-built segments exposing
                ``type_``, ``text``, ``link_target``, ``is_bold``,
                ``is_italic`` and ``is_strikethrough``.
            slack: Object whose ``users`` and ``channels`` mappings are used
                to resolve ``@id`` / ``#id`` link targets to a ``"name"``.
                Only consulted when converting from Slack.

        Returns:
            The formatted message as a single string.
        """
        if isinstance(source, str):
            # Parse, then convert reparser.Segment to hangups.ChatMessageSegment.
            segments = [
                ChatMessageSegment(seg.text, **seg.params)
                for seg in self.parse(source)
            ]
        else:
            # We'll assume it's already a ChatMessageSegment list.
            segments = source
        parts = []
        # Currently open format markers, in the order they were opened.
        current = []
        for seg in segments:
            if seg.type_ == hangouts_pb2.SEGMENT_TYPE_LINE_BREAK:
                # Close all open formatting (innermost first), break the
                # line, then reopen the same formatting on the new line.
                parts.extend(reversed(current))
                parts.append("\n")
                parts.extend(current)
                continue
            if self.from_slack:
                text = self._text_from_slack(seg, slack)
            else:
                text = self._text_to_slack(seg)
            # Compare formatting of the previous segment to the current one.
            formatting = {
                self.bold: seg.is_bold,
                self.italic: seg.is_italic,
                self.strike: seg.is_strikethrough
            }
            # Insert closing tags for any formatting that ends here, in
            # reverse order to how the tags were opened.
            parts.extend(
                chars for chars in reversed(current) if not formatting[chars])
            current = [chars for chars in current if formatting[chars]]
            # Insert opening tags for any formatting that starts here.
            for chars, condition in formatting.items():
                if condition and chars not in current:
                    parts.append(chars)
                    current.append(chars)
            # XXX: May generate tags closed in the wrong order: *bold _bold+italic* italic_
            # Testing suggests both Slack and Hangouts can cope with this though.
            parts.append(text)
        # Close any remaining format tags.
        parts.extend(reversed(current))
        return "".join(parts)
Exemple #6
0
def markdown(tag):
    """Build the (start, end) regex pattern pair for a simple Markdown tag.

    *tag* is substituted into the module-level ``markdown_start`` and
    ``markdown_end`` templates.
    """
    opening = markdown_start.format(tag=tag)
    closing = markdown_end.format(tag=tag)
    return (opening, closing)


# Characters allowed directly outside a Markdown delimiter.
boundary_chars = r'\s`!\'".,<>?*_~='

# Zero-width checks: a boundary character (or string start/end) must sit
# immediately before an opening tag / after a closing tag.
b_left = r'(?:(?<=[%s])|(?<=^))' % boundary_chars  # Lookbehind
b_right = r'(?:(?=[%s])|(?=$))' % boundary_chars  # Lookahead

# Templates; "{tag}" is filled in by markdown().
markdown_start = ''.join((b_left, r'(?<!\\){tag}(?!\s)(?!{tag})'))
markdown_end = ''.join((r'(?<!{tag})(?<!\s)(?<!\\){tag}', b_right))

tokens = [
    Token(name, *markdown(delimiter), **flags)
    for name, delimiter, flags in (
        ('b', r'\*\*', {'bold': True}),
        ('i', r'_', {'italic': True}),
        ('pre', r'`', {'pre': True}),
    )
]

parser = Parser(tokens)


def hangups_markdown_to_telegram(text, debug=False):
    """Split *text* into lines and tokenize each with the module parser.

    NOTE(review): only the beginning of this function is visible here. The
    visible code produces no output and never reads ``debug``, ``nlines``,
    ``output`` or ``single_line``; presumably the conversion to Telegram
    markup follows in the part that is cut off — confirm against the full
    source before relying on this description.
    """
    lines = text.split("\n")
    nlines = []
    output = ""
    for line in lines:
        single_line = ""

        # Tokenize one line at a time with the module-level ``parser``.
        segments = parser.parse(line)
Exemple #7
0
def markdown1(tag):
    """Build the (start, end) regex pattern pair for a simple Markdown tag.

    *tag* is substituted into the module-level ``markdown1_start`` and
    ``markdown1_end`` templates.
    """
    opening = markdown1_start.format(tag=tag)
    closing = markdown1_end.format(tag=tag)
    return (opening, closing)


# Characters allowed directly outside a delimiter (slack to hangups).
boundary1_chars = r'\s`!\'".,<>?*_~='  # slack to hangups

# Zero-width checks: a boundary character (or string start/end) must sit
# immediately before an opening tag / after a closing tag.
b1_left = r'(?:(?<=[%s])|(?<=^))' % boundary1_chars
b1_right = r'(?:(?=[%s])|(?=$))' % boundary1_chars

# Templates; "{tag}" is filled in by markdown1().
markdown1_start = ''.join((b1_left, r'(?<!\\){tag}(?!\s)(?!{tag})'))
markdown1_end = ''.join((r'(?<!{tag})(?<!\s)(?<!\\){tag}', b1_right))

tokens_slack_to_hangups = [
    Token(name, *markdown1(delimiter), **flags)
    for name, delimiter, flags in (
        ('b', r'\*', {'is_bold': True}),
        ('i', r'_', {'is_italic': True}),
        ('pre1', r'`', {'skip': True}),
        ('pre2', r'```', {'skip': True}),
    )
]

parser_slack_to_hangups = Parser(tokens_slack_to_hangups)

# hangups to slack


def markdown2(tag):
    """Build the (start, end) regex pattern pair for a simple Markdown tag.

    *tag* is substituted into the module-level ``markdown2_start`` and
    ``markdown2_end`` templates.
    """
    opening = markdown2_start.format(tag=tag)
    closing = markdown2_end.format(tag=tag)
    return (opening, closing)