Esempio n. 1
0
def parse_url(word):
    """
    If word is url, return a parsed version of it.
    :param word: string
    :return: None or parsed url
    """
    url = None
    if simple_url_re.match(word):
        url = smart_urlquote(word)
    elif simple_url_2_re.match(word):
        url = smart_urlquote('http://%s' % word)
    elif not ':' in word and simple_email_re.match(word):
        local, domain = word.rsplit('@', 1)
        try:
            domain = domain.encode('idna').decode('ascii')
        except UnicodeError:
            return

        url = 'mailto:%s@%s' % (local, domain)

    if url:
        return urlparse.urlparse(url)
Esempio n. 2
0
def parse_url(word):
    """
    If word is url, return a parsed version of it.
    :param word: string
    :return: None or parsed url
    """
    url = None
    if simple_url_re.match(word):
        url = smart_urlquote(word)
    elif simple_url_2_re.match(word):
        url = smart_urlquote('http://%s' % word)
    elif not ':' in word and simple_email_re.match(word):
        local, domain = word.rsplit('@', 1)
        try:
            domain = domain.encode('idna').decode('ascii')
        except UnicodeError:
            return

        url = 'mailto:%s@%s' % (local, domain)

    if url:
        return urlparse.urlparse(url)
Esempio n. 3
0
def convert_links(text, trim_url_limit=None, nofollow=False, autoescape=False):
    """
    Finds URLs in text and attempts to handle correctly.
    Heavily based on django.utils.html.urlize
    With the additions of attempting to embed media links, particularly images.

    Works on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).

    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.

    """

    safe_input = isinstance(text, SafeData)
    words = word_split_re.split(force_text(text))
    for i, word in enumerate(words):
        if '.' in word or ':' in word:
            # Deal with punctuation.
            lead, middle, trail = '', word, ''
            for punctuation in TRAILING_PUNCTUATION:
                if middle.endswith(punctuation):
                    middle = middle[:-len(punctuation)]
                    trail = punctuation + trail
            for opening, closing in WRAPPING_PUNCTUATION:
                if middle.startswith(opening):
                    middle = middle[len(opening):]
                    lead = lead + opening
                # Keep parentheses at the end only if they're balanced.
                if (middle.endswith(closing)
                    and middle.count(closing) == middle.count(opening) + 1):
                    middle = middle[:-len(closing)]
                    trail = closing + trail

            # Make URL we want to point to.
            url = None
            if simple_url_re.match(middle):
                url = smart_urlquote(middle)
            elif simple_url_2_re.match(middle):
                url = smart_urlquote('http://%s' % middle)
            elif not ':' in middle and simple_email_re.match(middle):
                local, domain = middle.rsplit('@', 1)
                try:
                    domain = domain.encode('idna').decode('ascii')
                except UnicodeError:
                    continue
            if url:
                u = url.lower()
                if autoescape and not safe_input:
                    lead, trail = escape(lead), escape(trail)
                    url = escape(url)

                # Photos
                if u.endswith('.jpg') or u.endswith('.gif') or u.endswith('.png'):
                    middle = '<img src="%s">' % url

                # Youtube
                #'https://www.youtube.com/watch?v=gkqXgaUuxZg'
                elif 'youtube.com/watch' in url:
                    parsed = urlparse.urlsplit(url)
                    query  = urlparse.parse_qs(parsed.query)
                    token  = query.get('v')
                    if token and len(token) > 0:
                        middle = '<iframe src="http://www.youtube.com/embed/%s" height="320" width="100%%"></iframe>' % token[0]
                    else:
                        middle = url
                elif 'youtu.be/' in url:
                    try:
                        token = url.rsplit('/', 1)[1]
                        middle = '<iframe src="http://www.youtube.com/embed/' + token + '" height="320" width="100%%"></iframe>'
                    except IndexError:
                        middle = url

                words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
            else:
                if safe_input:
                    words[i] = mark_safe(word)
                elif autoescape:
                    words[i] = escape(word)
        elif safe_input:
            words[i] = mark_safe(word)
        elif autoescape:
            words[i] = escape(word)
    return ''.join(words)
Esempio n. 4
0
def convert_links(text, trim_url_limit=None, nofollow=False, autoescape=False):
    """
    Finds URLs in text and attempts to handle correctly.
    Heavily based on django.utils.html.urlize
    With the additions of attempting to embed media links, particularly images.

    Works on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).

    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.
    
    TO-DO: refactor to better leverage existing django.utils.html

    """

    safe_input = isinstance(text, SafeData)
    words = word_split_re.split(force_text(text))
    for i, word in enumerate(words):
        if '.' in word or ':' in word:
            # Deal with punctuation.
            lead, middle, trail = '', word, ''
            stripped = middle.rstrip(TRAILING_PUNCTUATION_CHARS)
            if middle != stripped:
                trail = middle[len(stripped):] + trail
                middle = stripped
            for opening, closing in WRAPPING_PUNCTUATION:
                if middle.startswith(opening):
                    middle = middle[len(opening):]
                    lead = lead + opening
                # Keep parentheses at the end only if they're balanced.
                if (middle.endswith(closing)
                    and middle.count(closing) == middle.count(opening) + 1):
                    middle = middle[:-len(closing)]
                    trail = closing + trail

            # Make URL we want to point to.
            url = None
            if simple_url_re.match(middle):
                url = smart_urlquote(middle)
            elif simple_url_2_re.match(middle):
                url = smart_urlquote('http://%s' % middle)
            elif ':' not in middle and is_email_simple(middle):
                local, domain = middle.rsplit('@', 1)
                try:
                    domain = domain.encode('idna').decode('ascii')
                except UnicodeError:
                    continue
            if url:
                u = url.lower()
                if autoescape and not safe_input:
                    lead, trail = escape(lead), escape(trail)
                    url = escape(url)

                # Photos
                if u.endswith('.jpg') or u.endswith('.gif') or u.endswith('.png'):
                    middle = '<img src="%s">' % url

                # Youtube
                #'https://www.youtube.com/watch?v=gkqXgaUuxZg'
                elif 'youtube.com/watch' in url:
                    parsed = urlparse.urlsplit(url)
                    query  = urlparse.parse_qs(parsed.query)
                    token  = query.get('v')
                    if token and len(token) > 0:
                        middle = '<iframe src="http://www.youtube.com/embed/%s" height="320" width="100%%"></iframe>' % token[0]
                    else:
                        middle = url
                elif 'youtu.be/' in url:
                    try:
                        token = url.rsplit('/', 1)[1]
                        middle = '<iframe src="http://www.youtube.com/embed/%s" height="320" width="100%%"></iframe>' % token
                    except IndexError:
                        middle = six.u(url)

                words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
            else:
                if safe_input:
                    words[i] = mark_safe(word)
                elif autoescape:
                    words[i] = escape(word)
        elif safe_input:
            words[i] = mark_safe(word)
        elif autoescape:
            words[i] = escape(word)
    return ''.join(words)
def _urlize_all_text(text,
                     trim_url_limit=None,
                     nofollow=False,
                     autoescape=False):
    """
    Convert any URLs in text into clickable links.

    Works on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.

    If trim_url_limit is not None, truncate the URLs in the link text longer
    than this limit to trim_url_limit - 1 characters and append an ellipsis.

    If nofollow is True, give the links a rel="nofollow" attribute.

    If autoescape is True, autoescape the link text and URLs.
    """
    safe_input = isinstance(text, SafeData)

    def trim_url(x, limit=trim_url_limit):
        if limit is None or len(x) <= limit:
            return x
        return '%s…' % x[:max(0, limit - 1)]

    def trim_punctuation(lead, middle, trail):
        """
        Trim trailing and wrapping punctuation from `middle`. Return the items
        of the new state.
        """
        # Continue trimming until middle remains unchanged.
        trimmed_something = True
        while trimmed_something:
            trimmed_something = False
            # Trim wrapping punctuation.
            for opening, closing in WRAPPING_PUNCTUATION:
                if middle.startswith(opening):
                    middle = middle[len(opening):]
                    lead += opening
                    trimmed_something = True
                # Keep parentheses at the end only if they're balanced.
                if middle.endswith(closing) and middle.count(
                        closing) == middle.count(opening) + 1:
                    middle = middle[:-len(closing)]
                    trail = closing + trail
                    trimmed_something = True
            # Trim trailing punctuation (after trimming wrapping punctuation,
            # as encoded entities contain ';'). Unescape entities to avoid
            # breaking them by removing ';'.
            middle_unescaped = html.unescape(middle)
            stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS)
            if middle_unescaped != stripped:
                trail = middle[len(stripped):] + trail
                middle = middle[:len(stripped) - len(middle_unescaped)]
                trimmed_something = True
        return lead, middle, trail

    def is_email_simple(value):
        """Return True if value looks like an email address."""
        # An @ must be in the middle of the value.
        if '@' not in value or value.startswith('@') or value.endswith('@'):
            return False
        try:
            p1, p2 = value.split('@')
        except ValueError:
            # value contains more than one @.
            return False
        # Dot must be in p2 (e.g. example.com)
        if '.' not in p2 or p2.startswith('.'):
            return False
        return True

    words = word_split_re.split(str(text))
    for i, word in enumerate(words):
        if '.' in word or '@' in word or ':' in word:
            # lead: Current punctuation trimmed from the beginning of the word.
            # middle: Current state of the word.
            # trail: Current punctuation trimmed from the end of the word.
            lead, middle, trail = '', word, ''
            # Deal with punctuation.
            lead, middle, trail = trim_punctuation(lead, middle, trail)

            # Make URL we want to point to.
            url = None
            nofollow_attr = ' rel="noopener noreferrer nofollow"' if nofollow else ''
            if simple_url_re.match(middle):
                url = smart_urlquote(html.unescape(middle))
            elif simple_url_2_re.match(middle):
                url = smart_urlquote('http://%s' % html.unescape(middle))
            elif ':' not in middle and is_email_simple(middle):
                local, domain = middle.rsplit('@', 1)
                try:
                    domain = punycode(domain)  # type: ignore
                except UnicodeError:
                    continue
                url = 'mailto:%s@%s' % (local, domain)
                nofollow_attr = ''

            # Make link.
            if url:
                trimmed = trim_url(middle)
                if autoescape and not safe_input:
                    lead, trail = html.escape(lead), html.escape(trail)
                    trimmed = html.escape(trimmed)
                middle = '<a href="%s"%s>%s</a>' % (html.escape(url),
                                                    nofollow_attr, trimmed)
                words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
            else:
                if safe_input:
                    words[i] = mark_safe(word)
                elif autoescape:
                    words[i] = html.escape(word)
        elif safe_input:
            words[i] = mark_safe(word)
        elif autoescape:
            words[i] = html.escape(word)
    return ''.join(words)
Esempio n. 6
0
def urlize_impl(text, trim_url_limit=None, nofollow=False, autoescape=False):
    """
    Converts any URLs in text into clickable links.

    Works on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.

    If trim_url_limit is not None, the URLs in link text longer than this limit
    will truncated to trim_url_limit-3 characters and appended with an elipsis.

    If nofollow is True, the URLs in link text will get a rel="nofollow"
    attribute.

    If autoescape is True, the link text and URLs will get autoescaped.
    """
    trim_url = lambda x, limit=trim_url_limit: limit is not None and (len(x) > limit and ('%s...' % x[:max(0, limit - 3)])) or x
    safe_input = isinstance(text, SafeData)
    words = word_split_re.split(force_text(text))
    for i, word in enumerate(words):
        match = None
        if '.' in word or '@' in word or ':' in word:
            # Deal with punctuation.
            lead, middle, trail = '', word, ''
            for punctuation in TRAILING_PUNCTUATION:
                if middle.endswith(punctuation):
                    middle = middle[:-len(punctuation)]
                    trail = punctuation + trail
            for opening, closing in WRAPPING_PUNCTUATION:
                if middle.startswith(opening):
                    middle = middle[len(opening):]
                    lead = lead + opening
                # Keep parentheses at the end only if they're balanced.
                if (middle.endswith(closing)
                    and middle.count(closing) == middle.count(opening) + 1):
                    middle = middle[:-len(closing)]
                    trail = closing + trail

            # Make URL we want to point to.
            url = None
            nofollow_attr = ' rel="nofollow"' if nofollow else ''
            if simple_url_re.match(middle):
                url = smart_urlquote(middle)
            elif simple_url_2_re.match(middle):
                url = smart_urlquote('http://%s' % middle)
            elif not ':' in middle and simple_email_re.match(middle):
                local, domain = middle.rsplit('@', 1)
                try:
                    domain = domain.encode('idna').decode('ascii')
                except UnicodeError:
                    continue
                url = 'mailto:%s@%s' % (local, domain)
                nofollow_attr = ''

            # Make link.
            if url:
                trimmed = trim_url(middle)
                if autoescape and not safe_input:
                    lead, trail = escape(lead), escape(trail)
                    url, trimmed = escape(url), escape(trimmed)
                #
                # Custom stuff for us
                #
                lowered = url.lower()
                is_image = (lowered.endswith('.jpg') or lowered.endswith('.gif')
                            or lowered.endswith('.png'))
                class_attr = is_image and ' class="image"' or ''
                middle = '<a href="%s"%s%s>%s</a>' % (url, nofollow_attr,
                                                      class_attr, trimmed)
                #
                # End custom stuff
                #
                words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
            else:
                if safe_input:
                    words[i] = mark_safe(word)
                elif autoescape:
                    words[i] = escape(word)
        elif safe_input:
            words[i] = mark_safe(word)
        elif autoescape:
            words[i] = escape(word)
    return ''.join(words)