def translate_to_html(text):
    """Render user text as HTML.

    Pipeline: strip raw HTML, expand repo data, run Markdown, then
    linkify bare URLs, marking off-site links to open in a new tab.
    """

    def _external_links(attrs, new=False):
        # Links pointing anywhere but our own host open in a new tab
        # and carry rel="noopener nofollow".
        parsed = urlparse(attrs[(None, 'href')])
        if parsed.netloc not in ['localhost:8000']:
            attrs[(None, 'rel')] = 'noopener nofollow'
            attrs[(None, 'target')] = '_blank'
        return attrs

    rendered = markdown.markdown(
        get_repo_data(no_html(text)),
        extensions=[
            'markdown.extensions.codehilite',
            'markdown.extensions.fenced_code',
            'markdown.extensions.sane_lists',
            'markdown.extensions.tables',
            'markdown.extensions.nl2br',
        ],
    )
    return Linker(callbacks=[_external_links]).linkify(rendered)
def linkify(source):
    """Render URLs in the string as links."""

    def _force_new_tab(attrs, new=False):
        # Every link opens in a new tab with safe rel attributes.
        attrs[(None, "target")] = "_blank"
        attrs[(None, "rel")] = "noopener noreferrer"
        return attrs

    # Escape all tags
    return Linker(callbacks=[_force_new_tab]).linkify(source)
def test_url_re_arg():
    """Verifies that a specified url_re is used"""
    fred_re = re.compile(r"""(fred\.com)""")
    linker = Linker(url_re=fred_re)
    matched = linker.linkify("a b c fred.com d e f")
    unmatched = linker.linkify("a b c http://example.com d e f")
    # Only text matching the custom regex gets linkified.
    assert matched == 'a b c <a href="http://fred.com" rel="nofollow">fred.com</a> d e f'
    # A normal URL is left untouched when it doesn't match.
    assert unmatched == "a b c http://example.com d e f"
def test_email_re_arg():
    """Verifies that a specified email_re is used"""
    fred_re = re.compile(r"""(fred@example\.com)""")
    linker = Linker(parse_email=True, email_re=fred_re)
    matched = linker.linkify("a b c [email protected] d e f")
    unmatched = linker.linkify("a b c [email protected] d e f")
    # Only addresses matching the custom regex get linkified.
    assert matched == 'a b c <a href="mailto:[email protected]">[email protected]</a> d e f'
    assert unmatched == "a b c [email protected] d e f"
def test_url_re_arg():
    """Verifies that a specified url_re is used"""
    linker = Linker(url_re=re.compile(r"""(fred\.com)"""))
    expected = 'a b c <a href="http://fred.com" rel="nofollow">fred.com</a> d e f'
    assert linker.linkify('a b c fred.com d e f') == expected
    # URLs that don't match the custom regex are left untouched.
    assert (
        linker.linkify('a b c http://example.com d e f')
        == 'a b c http://example.com d e f'
    )
def test_email_re_arg():
    """Verifies that a specified email_re is used"""
    linker = Linker(parse_email=True, email_re=re.compile(r"""(fred@example\.com)"""))
    expected = 'a b c <a href="mailto:[email protected]">[email protected]</a> d e f'
    assert linker.linkify('a b c [email protected] d e f') == expected
    # Addresses not matching the custom regex are left untouched.
    assert (
        linker.linkify('a b c [email protected] d e f')
        == 'a b c [email protected] d e f'
    )
def sanitize(html):
    """Clean untrusted HTML, then turn bare URLs into links.

    Empty/None input is returned unchanged.
    """
    if not html:
        return html
    cleaned = bleach.clean(
        html,
        tags=allowed_tags,
        attributes=allowed_attrs,
        styles=allowed_styles,
        strip=True,
    )
    # Linkify only inside tags that survived cleaning.
    return Linker(recognized_tags=allowed_tags).linkify(cleaned)
def test_recognized_tags_arg():
    """Verifies that recognized_tags works"""
    source = '<p>http://example.com/</p><sarcasm>'
    # The html parser doesn't recognize "sarcasm" as a tag, so it escapes it
    linker = Linker(recognized_tags=['p'])
    assert (
        linker.linkify(source)
        == '<p><a href="http://example.com/" rel="nofollow">http://example.com/</a></p><sarcasm>'  # noqa
    )
    # The html parser recognizes "sarcasm" as a tag and fixes it
    linker = Linker(recognized_tags=['p', 'sarcasm'])
    assert (
        linker.linkify(source)
        == '<p><a href="http://example.com/" rel="nofollow">http://example.com/</a></p><sarcasm></sarcasm>'  # noqa
    )
def rate_blog_comment(comment):
    """Score a comment, returning {"good": {...}, "bad": {...}} signals."""
    result = {"good": {}, "bad": {}}
    text = comment.comment

    if len(text) > 500:
        result["bad"]["length"] = ">500 characters"

    # Exclude comments that have links in them unless the links are to
    # www.peterbe.com or songsear.ch.
    offsite = []

    def _collect_offsite(attrs, new=False):
        href = attrs[(None, u"href")]
        if urlparse(href).netloc not in ["www.peterbe.com", "songsear.ch"]:
            offsite.append(href)

    Linker(callbacks=[_collect_offsite]).linkify(text)
    if offsite:
        result["bad"]["links"] = offsite

    lowered = text.lower()
    good = [s for s in settings.PLOG_GOOD_STRINGS if s in text]
    maybe_good = [s for s in settings.PLOG_GOOD_STRINGS if s.lower() in lowered]
    bad = [s for s in settings.PLOG_BAD_STRINGS if s in text]
    maybe_bad = [s for s in settings.PLOG_BAD_STRINGS if s.lower() in lowered]

    # Exact matches win over case-insensitive ("maybe") matches.
    if good:
        result["good"]["strings"] = good
    elif maybe_good:
        result["good"]["maybe_strings"] = maybe_good
    if bad:
        result["bad"]["strings"] = bad
    elif maybe_bad:
        result["bad"]["maybe_strings"] = maybe_bad
    return result
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
    """Convert URL-like strings in an HTML fragment to links

    Strings that look like URLs, domain names and email addresses in a
    possibly-HTML fragment become links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    This is a best-effort pass that tries to recover from crazy text.

    .. Note::

       For many calls with the same arguments, or for more
       configurability, use a :py:class:`bleach.linkifier.Linker`
       instance directly.

    .. Note::

       To clean and then linkify, use
       :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the
       clean pass so the HTML is only parsed once.

    :arg str text: the text to linkify

    :arg list callbacks: callbacks run when adjusting tag attributes;
        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

    :arg list skip_tags: tags whose contents are not linkified; e.g.
        ``['pre']`` skips ``pre`` contents

    :arg bool parse_email: whether or not to linkify email addresses

    :returns: linkified text as unicode

    """
    return Linker(
        callbacks=callbacks, skip_tags=skip_tags, parse_email=parse_email
    ).linkify(text)
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
    """Convert URL-like strings in an HTML fragment to links

    Turns URL-ish strings, domain names and email addresses into links,
    preserving existing links, urls in attributes and email addresses.
    Best-effort: tries to recover from bad situations due to crazy text.

    .. Note::

       For repeated calls with identical arguments or more
       configurability, prefer a :py:class:`bleach.linkifier.Linker`
       instance.

    .. Note::

       When cleaning and linkifying, use
       :py:class:`bleach.linkifier.LinkifyFilter` in the clean pass to
       avoid parsing the HTML twice.

    :arg str text: the text to linkify

    :arg list callbacks: callbacks run when adjusting tag attributes;
        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

    :arg list skip_tags: tags whose contents should not be linkified,
        e.g. ``['pre']``

    :arg bool parse_email: whether or not to linkify email addresses

    :returns: linkified text as unicode

    """
    linker = Linker(callbacks=callbacks, skip_tags=skip_tags, parse_email=parse_email)
    return linker.linkify(text)
def rate_blog_comment(comment):
    """Score a comment, returning {"good": {...}, "bad": {...}} signals."""
    result = {"good": {}, "bad": {}}
    body = comment.comment

    if len(body) > 800:
        result["bad"]["length"] = ">800 characters"

    # Exclude comments that have links in them unless the links are to
    # www.peterbe.com or songsear.ch.
    links = []

    def find_links(attrs, new=False):
        href = attrs[(None, u"href")]
        if urlparse(href).netloc not in ["www.peterbe.com", "songsear.ch"]:
            links.append(href)

    Linker(callbacks=[find_links]).linkify(body)
    if links:
        result["bad"]["links"] = links

    lowered = body.lower()
    good_strings = [x for x in settings.PLOG_GOOD_STRINGS if x in body]
    maybe_good_strings = [
        x for x in settings.PLOG_GOOD_STRINGS if x.lower() in lowered
    ]
    bad_strings = [x for x in settings.PLOG_BAD_STRINGS if x in body]
    maybe_bad_strings = [
        x for x in settings.PLOG_BAD_STRINGS if x.lower() in lowered
    ]

    # Case-sensitive hits take precedence over case-insensitive ones.
    if good_strings:
        result["good"]["strings"] = good_strings
    elif maybe_good_strings:
        result["good"]["maybe_strings"] = maybe_good_strings
    if bad_strings:
        result["bad"]["strings"] = bad_strings
    elif maybe_bad_strings:
        result["bad"]["maybe_strings"] = maybe_bad_strings
    return result
def test_recognized_tags_arg():
    """Verifies that recognized_tags works"""
    fragment = "<p>http://example.com/</p><sarcasm>"
    # The html parser doesn't recognize "sarcasm" as a tag, so it escapes it
    result = Linker(recognized_tags=["p"]).linkify(fragment)
    assert (
        result
        == '<p><a href="http://example.com/" rel="nofollow">http://example.com/</a></p><sarcasm>'  # noqa
    )
    # The html parser recognizes "sarcasm" as a tag and fixes it
    result = Linker(recognized_tags=["p", "sarcasm"]).linkify(fragment)
    assert (
        result
        == '<p><a href="http://example.com/" rel="nofollow">http://example.com/</a></p><sarcasm></sarcasm>'  # noqa
    )
# Removing Attributes
def allowed_attrs(attrs, new=False):
    """Only allow href, target, rel and title."""
    keep = {
        (None, 'href'),
        (None, 'target'),
        (None, 'rel'),
        (None, 'title'),
        '_text',
    }
    return {k: v for k, v in attrs.items() if k in keep}


linker = Linker(callbacks=[allowed_attrs])
html = '<a style="font-weight: super bold;" href="http://example.com">link</a>'
print(linker.linkify(html))
# <a href="http://example.com">link</a>
print()


def remove_title(attrs, new=False):
    """Drop the title attribute, if present; keep everything else."""
    attrs.pop((None, 'title'), None)
    return attrs


linker = Linker(callbacks=[remove_title])
print(linker.linkify('<a href="http://example.com">link</a>'))
# <a href="http://example.com">link</a>
def run(self, text):
    """Linkify *text* with a Linker built from this instance's options."""
    return Linker(**self._linker_options).linkify(text)
import bleach
from bleach.linkifier import Linker

# Plain module-level linkify with default callbacks.
link1 = bleach.linkify('http://example.com example')
print(link1)


def set_title(attrs, _):
    """Attach a fixed title attribute to every link."""
    attrs[(None, 'title')] = 'example title'
    return attrs


linker = Linker(callbacks=[set_title])
link2 = linker.linkify('http://example.com example')
print(link2)


def allowed_attrs(attrs, _):
    """Keep only href, style and the link text."""
    keep = {(None, 'href'), (None, 'style'), '_text'}
    return {k: v for k, v in attrs.items() if k in keep}


linker = Linker(callbacks=[allowed_attrs])
link3 = linker.linkify(
    '<a style="font-weight: super bold;" href="http://example.com">example</a>'
)
print(link3)
# Altering Attributes def shorten_url(attrs, new=False): """Shorten overly-long URLs in the text.""" # Only adjust newly-created links if not new: return attrs # _text will be the same as the URL for new links text = attrs['_text'] if len(text) > 25: attrs['_text'] = text[:22] + '...' return attrs linker = Linker(callbacks=[shorten_url]) print(linker.linkify('http://example.com/longlonglonglonglongurl')) # <a href="http://example.com/longlonglonglonglongurl">http://example.com/lon...</a> print( linker.linkify( 'abc <a href="http://example.com/longlonglonglonglongurl">http://example.com/longlonglonglonglongurl</a> def' )) # abc <a href="http://example.com/longlonglonglonglongurl">http://example.com/longlonglonglonglongurl</a> def print() def outgoing_bouncer(attrs, new=False): """Send outgoing links through a bouncer.""" href_key = (None, 'href')
def dont_linkify_python(attrs, new=False):
    """Prevent linkification of tokens that look like Python files."""
    # This is an existing link, so leave it be
    if not new:
        return attrs
    # If the TLD is '.py', make sure it starts with http: or https:.
    # Use _text because that's the original text
    link_text = attrs['_text']
    looks_like_py_file = (
        link_text.endswith('.py')
        and not link_text.startswith(('http:', 'https:'))
    )
    if looks_like_py_file:
        # This looks like a Python file, not a URL. Don't make a link.
        return None
    # Everything checks out, keep going to the next callback.
    return attrs


linker = Linker(callbacks=[dont_linkify_python])
print(linker.linkify('abc http://example.com def'))
# abc <a href="http://example.com">http://example.com</a> def
print(linker.linkify('abc models.py def'))
# abc models.py def

print('\n' + '-' * 100 + '\n')

# skip_tags: leave the contents of <pre> tags untouched.
linker = Linker(skip_tags=['pre'])
print(linker.linkify('a b c http://example.com d e f'))
# a b c <a href="http://example.com" rel="nofollow">http://example.com</a> d e f
print(linker.linkify('<pre>http://example.com</pre>'))
# <pre>http://example.com</pre>
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = 'ipetrash'

# SOURCE: https://github.com/mozilla/bleach
# SOURCE: https://bleach.readthedocs.io/en/latest/linkify.html#removing-links

# pip install bleach
from bleach.linkifier import Linker


# Removing Links
def remove_mailto(attrs, new=False):
    """Drop mailto: links entirely; pass every other link through."""
    href = attrs[(None, 'href')]
    return None if href.startswith('mailto:') else attrs


linker = Linker(callbacks=[remove_mailto])
html = '''
<a href="mailto:[email protected]">mail janet!</a>
abc <a href="http://example.com">http://example.com</a> def
'''.strip()
print(linker.linkify(html))
# mail janet!
# abc <a href="http://example.com">http://example.com</a> def
from six.moves.urllib.parse import urlparse

from bleach.linkifier import Linker


def set_target(attrs, new=False):
    """Open off-site links in a new tab and tag them as external."""
    host = urlparse(attrs[(None, 'href')]).netloc
    if host in ['lidarts.org']:
        # Internal link: make sure no stale target attribute survives.
        attrs.pop((None, 'target'), None)
    else:
        attrs[(None, 'target')] = '_blank'
        attrs[(None, 'class')] = 'external'
    return attrs


linker = Linker(callbacks=[set_target])
from bleach.linkifier import Linker
from markdown import markdown

from .activitypub import get_backend
from .webfinger import get_actor_url


def _set_attrs(attrs, new=False):
    """Open every link in a new tab, style it external, title = href."""
    attrs[(None, "target")] = "_blank"
    attrs[(None, "class")] = "external"
    attrs[(None, "rel")] = "noopener"
    attrs[(None, "title")] = attrs[(None, "href")]
    return attrs


LINKER = Linker(callbacks=[_set_attrs])
HASHTAG_REGEX = re.compile(r"(#[\d\w\.]+)")
MENTION_REGEX = re.compile(r"@[\d\w_.+-]+@[\d\w-]+\.[\d\w\-.]+")


def hashtagify(content: str) -> Tuple[str, List[Dict[str, str]]]:
    """Replace #hashtags in *content* with tag links.

    Returns the rewritten content plus a list of Hashtag tag objects
    (href/name/type dicts) for each hashtag found.
    """
    base_url = get_backend().base_url()
    tags = []
    for hashtag in HASHTAG_REGEX.findall(content):
        tag = hashtag[1:]
        link = f'<a href="{base_url}/tags/{tag}" class="mention hashtag" rel="tag">#<span>{tag}</span></a>'
        tags.append(
            dict(href=f"{base_url}/tags/{tag}", name=hashtag, type="Hashtag"))
        content = content.replace(hashtag, link)
    return content, tags
from utils import * import re, os import numpy as np import shopify from bleach.linkifier import Linker from pyactiveresource.connection import ResourceNotFound from tqdm import tqdm import time yesPar = {'name': 'Participate in Bionet', 'value': 'Yes'} noPar = {'name': 'Participate in Bionet', 'value': 'No'} blankContact = {'name': 'Bionet Contact', 'value': 'NA'} linkify = Linker().linkify import shopifyLimitPatch from pyactiveresource.connection import ResourceNotFound from slack import WebClient from slack.errors import SlackApiError import json slackInfo = json.load(open("slack-token.json", "r")) channel = slackInfo["channel"] heatmap = slackInfo["heatmap"] shutdown = slackInfo["shutdown"] token = slackInfo["token"] client = WebClient(token=token) client.chat_postMessage( channel=channel,
# SOURCE: https://bleach.readthedocs.io/en/latest/linkify.html#callbacks-for-adjusting-attributes-callbacks
from urllib.parse import urlparse

# pip install bleach
from bleach.linkifier import Linker


# Setting Attributes
def set_title(attrs, new=False):
    """Give every link a fixed title attribute."""
    attrs[(None, 'title')] = 'link in user text'
    return attrs


linker = Linker(callbacks=[set_title])
print(linker.linkify('abc http://example.com def'))
# abc <a href="http://example.com" title="link in user text">http://example.com</a> def
print()


def set_target(attrs, new=False):
    """Open links to other domains in a new tab."""
    host = urlparse(attrs[(None, 'href')]).netloc
    if host in ['my-domain.com', 'other-domain.com']:
        # Our own domains: make sure no stale target survives.
        attrs.pop((None, 'target'), None)
    else:
        attrs[(None, 'target')] = '_blank'
        attrs[(None, 'class')] = 'external'
    return attrs
class Command(BaseCommand):
    """Management command that scrapes recent Telegram channel messages
    and stores them as tagged content objects, linkifying URLs in the
    message text.

    NOTE(review): the TelegramClient is created and connected at class
    definition time (i.e. on module import), not inside handle() —
    confirm this is intentional.
    """

    help = "PROVISN Telegram Bot"
    # Telegram API credentials, taken from settings by default.
    phone_number = TELEGRAM_PHONE_NUMBER
    api_id = TELEGRAM_API_ID
    api_hash = TELEGRAM_API_HASH
    name = TELEGRAM_NAME
    # Turns bare URLs in message text into <a> tags; set_target is a
    # callback defined elsewhere in this module.
    linker = Linker(callbacks=[set_target])
    if DEBUG:
        # Hard-coded development credentials override the settings above.
        phone_number = '4917631653701'
        api_id = 865426
        api_hash = 'cd826b6860816e3dedfc19d61b1eb9bd'
        name = 'DcPacky'
    # How many messages to pull from each channel per run.
    message_amount = 30
    client = TelegramClient(name, api_id, api_hash)
    client.connect()

    def error(bot, update, error):
        """Log Errors caused by Updates."""
        # NOTE(review): no `self` parameter — appears to be used as a
        # plain error-callback, not a bound method.
        logger.warning('Update "%s" caused error "%s"', update, error)

    def get_messages(self, chat, model, tag, *args, **kwargs):
        """Fetch up to message_amount messages from *chat* and save each
        as an instance of *model*, tagged with *tag*.

        Pass strip_a (any value) in kwargs to remove the first <a> tag
        from each message before saving.
        """
        with self.client:
            i = 0
            for message in self.client.iter_messages(chat):
                i += 1
                # NOTE(review): breaks *before* processing the
                # message_amount-th message — possible off-by-one.
                if i == self.message_amount:
                    break
                try:
                    # '' + message.text raises TypeError for messages
                    # without text (e.g. media-only) — caught below.
                    text = '' + message.text
                    text = self.linker.linkify(text)
                    if 'strip_a' in kwargs:
                        # Remove the first anchor tag from the message.
                        soup = BeautifulSoup(text, 'html.parser')
                        soup.select_one('a').decompose()
                        text = str(soup)
                    text = linebreaksbr(text)
                    if 'strip_a' in kwargs:
                        # Clean up leftover empty markdown-link syntax.
                        text = text.replace('[]( ', '')
                        text = text.replace('[](\n', '')
                        text = text.replace('[](', '')
                    # NOTE(review): the next three replaces are no-ops
                    # as written; they look like HTML-entity decoding
                    # ('&quot;' -> '"', '&lt;' -> '<', '&gt;' -> '>')
                    # whose entity text was lost in transit — confirm
                    # against the original source.
                    text = text.replace('"', '"')
                    text = text.replace('<', '<')
                    text = text.replace('>', '>')
                    # Use the first line (up to the first <br>) as the
                    # title, unless it contains a link.
                    title = text.partition('<br>')[0]
                    if '<a' in title:
                        title = ''
                    new_content = model(
                        title=title,
                        text=text,
                        # published_date as in post
                        # published_date=message.date.replace(tzinfo=None)
                        published_date=timezone.now(),
                    )
                    new_content.save()
                    new_content.tags.add(tag)
                    print("Added!")
                except IntegrityError:
                    # Duplicate content (unique constraint) — skip it.
                    print('Content Already Existing')
                    pass
                except TypeError:
                    # Message without text — skip it.
                    pass

    def handle(self, *args, **kwargs):
        """Entry point: scrape each enabled channel in turn."""
        logger.info('Start Scraping Messages')
        #logger.info('Daily HODL')
        #self.get_messages('thedailyhodl', ContentDailyHodl, 'DailyHODL')
        #sleep(5)
        logger.info('Coin Desk')
        self.get_messages('coindesk_news', ContentCoinDesk, 'CoinDesk')
        sleep(5)
        #logger.info('Coin Telegraph')
        #self.get_messages('cointelegraph', ContentCoinTelegraph, 'CoinTelegraph', strip_a=True)
        #sleep(5)
        #logger.info('Bitcoinist')
        # NOTE(review): the continuation "'Bitcoinist')" appeared
        # uncommented in the mangled source; folded into this comment
        # because it is not valid standalone code — confirm.
        #self.get_messages('bitcoinistnews', ContentBitcoinist, 'Bitcoinist')
        logger.info("Done Scraping Messages")
trim_urls, week_end as u_week_end, week_start as u_week_start, ) from standup.mdext.nixheaders import NixHeaderExtension BUG_RE = re.compile(r'(bug #?(\d+))', flags=re.I) PULL_RE = re.compile(r'((?:pull|pr) #?(\d+))', flags=re.I) ISSUE_RE = re.compile(r'(issue #?(\d+))', flags=re.I) USER_RE = re.compile(r'(?<=^|(?<=[^\w\-.]))@([\w-]+)', flags=re.I) TAG_RE = re.compile(r'(?:^|[^\w\\/])#([a-z][a-z0-9_.-]*)(?:\b|$)', flags=re.I) MD = Markdown(output_format='html5', extensions=[NixHeaderExtension(), 'nl2br', 'smart_strong']) CLEANER = Cleaner(tags=[]) LINKER = Linker(callbacks=[trim_urls, nofollow]) class StandupUser(models.Model): """A standup participant--tied to Django's User model.""" # Note: User provides "username", "is_superuser", "is_staff" and "email" user = models.OneToOneField(User, on_delete=models.CASCADE, related_name='profile') name = models.CharField(max_length=100, blank=True, null=True) slug = models.SlugField(max_length=100, blank=True, null=True, unique=True) irc_nick = models.CharField(max_length=100, blank=True, null=True, unique=True, help_text='IRC nick for this particular user')
def client_handler(websocket, path):
    """Handle one chat client connection for its whole lifetime.

    The client first sends an identity/connect message
    (rachni.js#L131); the socket is registered in the module-level
    ``channel_list`` under channel -> user -> [sockets], the
    session_limit is enforced, the join is announced, and then
    sanitized/linkified chat messages are relayed to every socket in
    the same channel until the connection closes.
    """
    # user sends their identity on connect (rachni.js#L131)
    connect_data = yield from websocket.recv()
    connect_message = json.loads(connect_data)
    if debug:
        print('New client: ', websocket, ' (', connect_message["user"], ')')
        print('connect_message: ', connect_message)
    log('New client: ' + str(websocket) + '(' + connect_message["user"] + ')\n')
    welcome_message = {
        "message": "Welcome to " + connect_message["channel_name"] + ".",
        "timestamp": connect_message["timestamp"],
        "user": "******",
        "channel": connect_message["channel"],
        "channel_name": connect_message["channel_name"],
        "type": "SYSTEM"
    }
    # Register this socket under channel -> user -> [sockets]
    # (setdefault replaces the original nested if/else ladder; the
    # resulting structure is identical).
    user_sockets = channel_list.setdefault(
        connect_message["channel"], {}).setdefault(connect_message['user'], [])
    user_sockets.append(websocket)
    # check to see if maximum session limit has been reached
    if len(user_sockets) > session_limit:
        if debug:
            print('Maximum connection limit reached!')
        maxlimit_message = {
            "message": "Maximum connection limit reached!",
            "timestamp": connect_message["timestamp"],
            "user": "******",
            "channel": connect_message["channel"],
            "channel_name": connect_message["channel_name"],
            "type": "SYSTEM"
        }
        yield from websocket.send(json.dumps(maxlimit_message))
        user_sockets.remove(websocket)
        # NOTE(review): in the websockets library close() is a
        # coroutine; without `yield from` it may not actually run —
        # confirm against the library version in use.
        websocket.close(code=1000, reason='Connection limit reached!')
        return
    else:
        user_count = len(channel_list[connect_message["channel"]]) - 1
        user_sync_message = user_sync(connect_message)
        join_message = {
            "message": "There are " + str(user_count) + " other users connected.",
            "timestamp": connect_message["timestamp"],
            "user": "******",
            "channel": connect_message["channel"],
            "channel_name": connect_message["channel_name"],
            "type": "SYSTEM"
        }
        if debug:
            print('channel_list: ', channel_list)
        yield from websocket.send(json.dumps(welcome_message))
        yield from websocket.send(json.dumps(join_message))
        # Announce the new arrival (and refreshed user list) to every
        # socket in the channel, including the new one.
        for user in channel_list[connect_message["channel"]]:
            if debug:
                print('user (connect_message): ', user)
            for socket in channel_list[connect_message["channel"]][user]:
                if debug:
                    print('socket: ', socket)
                yield from socket.send(json.dumps(connect_message))
                yield from socket.send(json.dumps(user_sync_message))

    # set up callback for _blank target; one Linker is reused for the
    # lifetime of the connection (hoisted out of the message loop —
    # behavior is unchanged, just avoids rebuilding it per message).
    linker = Linker(callbacks=[target_blank])

    # wait for messages
    try:
        while True:
            message_data = yield from websocket.recv()
            message_json = json.loads(message_data)
            # BUGFIX: was `is 0` — identity comparison on an int is
            # implementation-dependent (and a SyntaxWarning on 3.8+);
            # use equality instead.
            if len(message_json['message'].strip()) == 0:
                if debug:
                    print('Blank message detected! Not sent to clients.')
                continue
            # sanitize our input, then convert links to actual links
            message_json['message'] = bleach.clean(message_json['message'])
            message_json['message'] = linker.linkify(
                message_json['message'])
            if debug:
                print('message: ', message_json)
            # send message only to users in the same channel
            for user in channel_list[connect_message["channel"]]:
                if debug:
                    print('user (message): ', user)
                for socket in channel_list[
                        connect_message["channel"]][user]:
                    yield from socket.send(json.dumps(message_json))
    # probably a better way to handle disconnections, but this works
    except websockets.exceptions.ConnectionClosed:
        part_message = {
            "message": connect_message["user"] + " has left.",
            "timestamp": connect_message["timestamp"],
            "user": "******",
            "channel": connect_message["channel"],
            "channel_name": connect_message["channel_name"],
            "type": "SYSTEM"
        }
        channel_list[connect_message["channel"]][
            connect_message["user"]].remove(websocket)
        # remove the user from the list if they have no socket connections open
        if len(channel_list[connect_message["channel"]][
                connect_message["user"]]) == 0:
            del channel_list[connect_message["channel"]][
                connect_message["user"]]
        user_sync_message = user_sync(connect_message)
        if debug:
            print('Client closed connection', websocket)
        log('Client closed connection: ' + str(websocket) + '\n')
        # Tell everyone remaining in the channel who left and sync the
        # user list.
        for user in channel_list[connect_message["channel"]]:
            if debug:
                print('user (disconnect): ', user)
            for socket in channel_list[connect_message["channel"]][user]:
                yield from socket.send(json.dumps(part_message))
                yield from socket.send(json.dumps(user_sync_message))