def parse_url(word): """ If word is url, return a parsed version of it. :param word: string :return: None or parsed url """ url = None if simple_url_re.match(word): url = smart_urlquote(word) elif simple_url_2_re.match(word): url = smart_urlquote('http://%s' % word) elif not ':' in word and simple_email_re.match(word): local, domain = word.rsplit('@', 1) try: domain = domain.encode('idna').decode('ascii') except UnicodeError: return url = 'mailto:%s@%s' % (local, domain) if url: return urlparse.urlparse(url)
def convert_links(text, trim_url_limit=None, nofollow=False, autoescape=False): """ Finds URLs in text and attempts to handle correctly. Heavily based on django.utils.html.urlize With the additions of attempting to embed media links, particularly images. Works on http://, https://, www. links, and also on links ending in one of the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org). Links can have trailing punctuation (periods, commas, close-parens) and leading punctuation (opening parens) and it'll still do the right thing. """ safe_input = isinstance(text, SafeData) words = word_split_re.split(force_text(text)) for i, word in enumerate(words): if '.' in word or ':' in word: # Deal with punctuation. lead, middle, trail = '', word, '' for punctuation in TRAILING_PUNCTUATION: if middle.endswith(punctuation): middle = middle[:-len(punctuation)] trail = punctuation + trail for opening, closing in WRAPPING_PUNCTUATION: if middle.startswith(opening): middle = middle[len(opening):] lead = lead + opening # Keep parentheses at the end only if they're balanced. if (middle.endswith(closing) and middle.count(closing) == middle.count(opening) + 1): middle = middle[:-len(closing)] trail = closing + trail # Make URL we want to point to. url = None if simple_url_re.match(middle): url = smart_urlquote(middle) elif simple_url_2_re.match(middle): url = smart_urlquote('http://%s' % middle) elif not ':' in middle and simple_email_re.match(middle): local, domain = middle.rsplit('@', 1) try: domain = domain.encode('idna').decode('ascii') except UnicodeError: continue if url: u = url.lower() if autoescape and not safe_input: lead, trail = escape(lead), escape(trail) url = escape(url) # Photos if u.endswith('.jpg') or u.endswith('.gif') or u.endswith('.png'): middle = '<img src="%s">' % url # Youtube #'https://www.youtube.com/watch?v=gkqXgaUuxZg' elif 'youtube.com/watch' in url: parsed = urlparse.urlsplit(url) query = urlparse.parse_qs(parsed.query) token = query.get('v') if token and len(token) > 0: middle = '<iframe src="http://www.youtube.com/embed/%s" height="320" width="100%%"></iframe>' % token[0] else: middle = url elif 'youtu.be/' in url: try: token = url.rsplit('/', 1)[1] middle = '<iframe src="http://www.youtube.com/embed/' + token + '" height="320" width="100%%"></iframe>' except IndexError: middle = url words[i] = mark_safe('%s%s%s' % (lead, middle, trail)) else: if safe_input: words[i] = mark_safe(word) elif autoescape: words[i] = escape(word) elif safe_input: words[i] = mark_safe(word) elif autoescape: words[i] = escape(word) return ''.join(words)
def convert_links(text, trim_url_limit=None, nofollow=False, autoescape=False): """ Finds URLs in text and attempts to handle correctly. Heavily based on django.utils.html.urlize With the additions of attempting to embed media links, particularly images. Works on http://, https://, www. links, and also on links ending in one of the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org). Links can have trailing punctuation (periods, commas, close-parens) and leading punctuation (opening parens) and it'll still do the right thing. TO-DO: refactor to better leverage existing django.utils.html """ safe_input = isinstance(text, SafeData) words = word_split_re.split(force_text(text)) for i, word in enumerate(words): if '.' in word or ':' in word: # Deal with punctuation. lead, middle, trail = '', word, '' stripped = middle.rstrip(TRAILING_PUNCTUATION_CHARS) if middle != stripped: trail = middle[len(stripped):] + trail middle = stripped for opening, closing in WRAPPING_PUNCTUATION: if middle.startswith(opening): middle = middle[len(opening):] lead = lead + opening # Keep parentheses at the end only if they're balanced. if (middle.endswith(closing) and middle.count(closing) == middle.count(opening) + 1): middle = middle[:-len(closing)] trail = closing + trail # Make URL we want to point to. url = None if simple_url_re.match(middle): url = smart_urlquote(middle) elif simple_url_2_re.match(middle): url = smart_urlquote('http://%s' % middle) elif ':' not in middle and is_email_simple(middle): local, domain = middle.rsplit('@', 1) try: domain = domain.encode('idna').decode('ascii') except UnicodeError: continue if url: u = url.lower() if autoescape and not safe_input: lead, trail = escape(lead), escape(trail) url = escape(url) # Photos if u.endswith('.jpg') or u.endswith('.gif') or u.endswith('.png'): middle = '<img src="%s">' % url # Youtube #'https://www.youtube.com/watch?v=gkqXgaUuxZg' elif 'youtube.com/watch' in url: parsed = urlparse.urlsplit(url) query = urlparse.parse_qs(parsed.query) token = query.get('v') if token and len(token) > 0: middle = '<iframe src="http://www.youtube.com/embed/%s" height="320" width="100%%"></iframe>' % token[0] else: middle = url elif 'youtu.be/' in url: try: token = url.rsplit('/', 1)[1] middle = '<iframe src="http://www.youtube.com/embed/%s" height="320" width="100%%"></iframe>' % token except IndexError: middle = six.u(url) words[i] = mark_safe('%s%s%s' % (lead, middle, trail)) else: if safe_input: words[i] = mark_safe(word) elif autoescape: words[i] = escape(word) elif safe_input: words[i] = mark_safe(word) elif autoescape: words[i] = escape(word) return ''.join(words)
def _urlize_all_text(text, trim_url_limit=None, nofollow=False, autoescape=False): """ Convert any URLs in text into clickable links. Works on http://, https://, www. links, and also on links ending in one of the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org). Links can have trailing punctuation (periods, commas, close-parens) and leading punctuation (opening parens) and it'll still do the right thing. If trim_url_limit is not None, truncate the URLs in the link text longer than this limit to trim_url_limit - 1 characters and append an ellipsis. If nofollow is True, give the links a rel="nofollow" attribute. If autoescape is True, autoescape the link text and URLs. """ safe_input = isinstance(text, SafeData) def trim_url(x, limit=trim_url_limit): if limit is None or len(x) <= limit: return x return '%s…' % x[:max(0, limit - 1)] def trim_punctuation(lead, middle, trail): """ Trim trailing and wrapping punctuation from `middle`. Return the items of the new state. """ # Continue trimming until middle remains unchanged. trimmed_something = True while trimmed_something: trimmed_something = False # Trim wrapping punctuation. for opening, closing in WRAPPING_PUNCTUATION: if middle.startswith(opening): middle = middle[len(opening):] lead += opening trimmed_something = True # Keep parentheses at the end only if they're balanced. if middle.endswith(closing) and middle.count( closing) == middle.count(opening) + 1: middle = middle[:-len(closing)] trail = closing + trail trimmed_something = True # Trim trailing punctuation (after trimming wrapping punctuation, # as encoded entities contain ';'). Unescape entities to avoid # breaking them by removing ';'. middle_unescaped = html.unescape(middle) stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS) if middle_unescaped != stripped: trail = middle[len(stripped):] + trail middle = middle[:len(stripped) - len(middle_unescaped)] trimmed_something = True return lead, middle, trail def is_email_simple(value): """Return True if value looks like an email address.""" # An @ must be in the middle of the value. if '@' not in value or value.startswith('@') or value.endswith('@'): return False try: p1, p2 = value.split('@') except ValueError: # value contains more than one @. return False # Dot must be in p2 (e.g. example.com) if '.' not in p2 or p2.startswith('.'): return False return True words = word_split_re.split(str(text)) for i, word in enumerate(words): if '.' in word or '@' in word or ':' in word: # lead: Current punctuation trimmed from the beginning of the word. # middle: Current state of the word. # trail: Current punctuation trimmed from the end of the word. lead, middle, trail = '', word, '' # Deal with punctuation. lead, middle, trail = trim_punctuation(lead, middle, trail) # Make URL we want to point to. url = None nofollow_attr = ' rel="noopener noreferrer nofollow"' if nofollow else '' if simple_url_re.match(middle): url = smart_urlquote(html.unescape(middle)) elif simple_url_2_re.match(middle): url = smart_urlquote('http://%s' % html.unescape(middle)) elif ':' not in middle and is_email_simple(middle): local, domain = middle.rsplit('@', 1) try: domain = punycode(domain) # type: ignore except UnicodeError: continue url = 'mailto:%s@%s' % (local, domain) nofollow_attr = '' # Make link. if url: trimmed = trim_url(middle) if autoescape and not safe_input: lead, trail = html.escape(lead), html.escape(trail) trimmed = html.escape(trimmed) middle = '<a href="%s"%s>%s</a>' % (html.escape(url), nofollow_attr, trimmed) words[i] = mark_safe('%s%s%s' % (lead, middle, trail)) else: if safe_input: words[i] = mark_safe(word) elif autoescape: words[i] = html.escape(word) elif safe_input: words[i] = mark_safe(word) elif autoescape: words[i] = html.escape(word) return ''.join(words)
def urlize_impl(text, trim_url_limit=None, nofollow=False, autoescape=False): """ Converts any URLs in text into clickable links. Works on http://, https://, www. links, and also on links ending in one of the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org). Links can have trailing punctuation (periods, commas, close-parens) and leading punctuation (opening parens) and it'll still do the right thing. If trim_url_limit is not None, the URLs in link text longer than this limit will truncated to trim_url_limit-3 characters and appended with an elipsis. If nofollow is True, the URLs in link text will get a rel="nofollow" attribute. If autoescape is True, the link text and URLs will get autoescaped. """ trim_url = lambda x, limit=trim_url_limit: limit is not None and (len(x) > limit and ('%s...' % x[:max(0, limit - 3)])) or x safe_input = isinstance(text, SafeData) words = word_split_re.split(force_text(text)) for i, word in enumerate(words): match = None if '.' in word or '@' in word or ':' in word: # Deal with punctuation. lead, middle, trail = '', word, '' for punctuation in TRAILING_PUNCTUATION: if middle.endswith(punctuation): middle = middle[:-len(punctuation)] trail = punctuation + trail for opening, closing in WRAPPING_PUNCTUATION: if middle.startswith(opening): middle = middle[len(opening):] lead = lead + opening # Keep parentheses at the end only if they're balanced. if (middle.endswith(closing) and middle.count(closing) == middle.count(opening) + 1): middle = middle[:-len(closing)] trail = closing + trail # Make URL we want to point to. url = None nofollow_attr = ' rel="nofollow"' if nofollow else '' if simple_url_re.match(middle): url = smart_urlquote(middle) elif simple_url_2_re.match(middle): url = smart_urlquote('http://%s' % middle) elif not ':' in middle and simple_email_re.match(middle): local, domain = middle.rsplit('@', 1) try: domain = domain.encode('idna').decode('ascii') except UnicodeError: continue url = 'mailto:%s@%s' % (local, domain) nofollow_attr = '' # Make link. if url: trimmed = trim_url(middle) if autoescape and not safe_input: lead, trail = escape(lead), escape(trail) url, trimmed = escape(url), escape(trimmed) # # Custom stuff for us # lowered = url.lower() is_image = (lowered.endswith('.jpg') or lowered.endswith('.gif') or lowered.endswith('.png')) class_attr = is_image and ' class="image"' or '' middle = '<a href="%s"%s%s>%s</a>' % (url, nofollow_attr, class_attr, trimmed) # # End custom stuff # words[i] = mark_safe('%s%s%s' % (lead, middle, trail)) else: if safe_input: words[i] = mark_safe(word) elif autoescape: words[i] = escape(word) elif safe_input: words[i] = mark_safe(word) elif autoescape: words[i] = escape(word) return ''.join(words)