def get_urls(body) -> List[str]:
    """Return the distinct URLs found in ``body``.

    A fresh ``URLExtract`` instance is created on every call. Its TLD list
    is refreshed first when the cached copy is more than 7 days old.
    """
    url_finder = URLExtract()
    # Keep the known-TLD list reasonably current before scanning.
    url_finder.update_when_older(7)
    # only_unique=True drops duplicate hits.
    return url_finder.find_urls(body, only_unique=True)
def __extract_domain_from_sent_field(self, sent: str) -> str:
    r"""Get the url out of a 'sent' field in a measurement.

    Parameters
    ----------
    sent: str
        Examples:

        * An empty string ("") meaning the sent packet wasn't recorded.
        * "GET / HTTP/1.1\r\nHost: example5718349450314.com\r\n"
          (echo/discard)
        * "GET www.bbc.co.uk HTTP/1.1\r\nHost: /content.html\r\n"
          (discard error)
        * or just "www.apple.com" (HTTP/S)

    Returns
    -------
    str
        Just the url, if found. An empty ``sent`` is returned unchanged.

    Raises
    ------
    Exception
        If ``sent`` does not match the expected request pattern and also
        contains a space, so it cannot be a bare domain.
    """
    # Nothing was recorded; there is nothing to parse.
    if sent == '':
        return sent

    match = re.search(QuackConstants.SENT_PATTERN.value, sent)
    if match:
        path = match.group(1)
        domain = match.group(2)

        # The extractor is only needed on this branch, so build it here
        # rather than paying for construction (and a possible TLD refresh)
        # on the empty and bare-domain inputs.
        extractor = URLExtract()
        # Refresh the known TLD list when it is older than 7 days.
        extractor.update_when_older(7)

        # This is a bug where the domain and path were reversed in the
        # content sent. We do our best to reconstruct the intended url
        # by swapping them to their intended position.
        if extractor.has_urls(path):
            domain, path = path, domain

        if path == '/':
            return domain
        return domain + path

    # No request line matched: a value without spaces is taken to be the
    # bare domain itself.
    if ' ' not in sent:
        return sent

    raise Exception(f"unknown sent field format: {sent}")
def incoming(update, context):
    """Check an incoming message for urls and reply with attempted bypasses.

    Every url found in ``update.effective_message.text`` whose domain (as
    returned by ``get_domain``) is a key of the chat's 'active domains'
    mapping gets a typing indicator, a bypass built by ``add_bypass`` and a
    reply sent through ``say``.

    When the message contained exactly one url, it is stored in
    ``context.chat_data['last url']`` and ``url_bookkeeping`` is called.
    """
    extractor = URLExtract()
    extractor.update_when_older(
        7)  # gets the latest list of TLDs from iana.org every 7 days
    # check_dns=True asks the extractor to validate hits via DNS
    # (see the urlextract documentation for the exact filtering rules).
    urls = extractor.find_urls(update.effective_message.text, check_dns=True)
    active_dict = context.chat_data.get(
        'active domains', {}
    )  # this could (or should) have been a set; kept as a dict for legacy reasons
    for url in urls:
        # Only urls on an active domain get a bypass.
        if get_domain(url) not in active_dict:
            continue
        # Show "typing..." in the chat while the bypass is being prepared.
        context.bot.send_chat_action(chat_id=update.effective_message.chat_id,
                                     action=ChatAction.TYPING)
        text = add_bypass(url, context=context)
        say(text, update, context)
    # Remember the url only when the message held exactly one.
    if len(urls) == 1:
        context.chat_data['last url'] = urls[0]
        url_bookkeeping(context)
import pajbot.utils
from pajbot.apiwrappers.safebrowsing import SafeBrowsingAPI
from pajbot.managers.adminlog import AdminLogManager
from pajbot.managers.db import Base
from pajbot.managers.db import DBManager
from pajbot.managers.handler import HandlerManager
from pajbot.models.command import Command
from pajbot.models.command import CommandExample
from pajbot.modules import BaseModule
from pajbot.modules import ModuleSetting

log = logging.getLogger(__name__)

# Module-wide URL extractor; its TLD list is refreshed at import time when
# the cached copy is more than 14 days old.
extractor = URLExtract()
extractor.update_when_older(14)


def is_subdomain(x, y):
    """Tell whether domain ``x`` equals ``y`` or is a subdomain of it.

    A leading "www." on ``y`` is ignored; ``x`` is compared as given.

    Examples:
        is_subdomain('pajlada.se', 'pajlada.se') = True
        is_subdomain('test.pajlada.se', 'pajlada.se') = True
        is_subdomain('test.pajlada.se', 'pajlada.com') = False
    """
    parent = y[4:] if y.startswith("www.") else y
    dotted_parent = "." + parent
    return x.endswith(dotted_parent) or x == parent
class EmailHandler(object):
    """Send e-mail through an SES client, one stackless tasklet per message.

    Callers use :meth:`send_email`, which queues a request on
    ``self.in_channel`` and blocks on the request's own channel until the
    worker tasklet posts the outcome. A missing HTML or plain-text body is
    derived from the other one.

    Configuration keys read from ``config``:

    * ``mud.region``    - region for the boto3 session (default "us-east-1")
    * ``email.endpoint`` - optional SES endpoint URL (default None)
    * ``email.useSsl``  - passed to the SES client as ``use_ssl`` (default True)
    * ``email.mocked``  - when true, the sender address is verified before
      every send (default False)
    """

    def __init__(self, config):
        self.config = config
        self.region = self.config.get("mud", {}).get("region", "us-east-1")

        email_config = self.config.get("email", {})
        self.endpoint = email_config.get("endpoint", None)
        self.use_ssl = email_config.get("useSsl", True)
        self.session = boto3.session.Session(region_name=self.region)
        self.ses = self.session.client('ses', endpoint_url=self.endpoint,
                                       use_ssl=self.use_ssl)
        self.mocked = email_config.get("mocked", False)

        self.extractor = URLExtract()
        # Replace the extractor's own updater with ours, so that
        # update_when_older() downloads the TLD list via _update_tlds.
        self.extractor.update = self._update_tlds
        self.extractor_lock = Lock()
        # Splits an optional "scheme://" prefix from the rest of a URL.
        self.url_re = re.compile(r'^(?P<type>.*://)?(?P<remainder>.*)$')

        self.html2text = HTML2Text()
        self.html2text.ignore_links = False
        self.html2text.ignore_images = False
        self.html2text.ignore_tables = False
        self.html2text.ignore_emphasis = False
        self.html2text_lock = Lock()

        self.in_channel = stackless.channel()
        # Start the dispatcher tasklet that drains in_channel.
        stackless.tasklet(self._send_email_loop)()

    def _update_tlds(self):
        """Download the IANA TLD list into the extractor's cache file.

        Returns True when the list was fetched and reloaded. Returns False,
        leaving the existing cache untouched, when the download fails.
        """
        try:
            # A timeout keeps a stalled download from blocking the caller
            # (and the extractor lock) indefinitely.
            response = requests.get(
                "https://data.iana.org/TLD/tlds-alpha-by-domain.txt",
                timeout=10)
            # Never overwrite the cache with an HTTP error page.
            response.raise_for_status()
        except requests.RequestException as e:
            logger.warning("Unable to refresh the TLD list: %s" % e)
            return False

        data = response.content
        # noinspection PyProtectedMember
        filename = self.extractor._get_cache_file_path(None)
        with open(filename, "wb") as f:
            f.write(data)
        # noinspection PyProtectedMember
        self.extractor._reload_tlds_from_file()
        return True

    def send_email(self, from_, to, subject, body_html=None, body_text=None):
        """Queue an e-mail and block until the worker reports the outcome.

        Returns whatever the worker tasklet sends back on the request's
        channel (see :meth:`_send_email_tasklet`).
        """
        request = EmailRequest(from_, to, subject, body_html, body_text)
        self.in_channel.send(request)
        response = request.channel.receive()
        return response

    def _send_email_loop(self):
        """Run forever, starting one worker tasklet per queued request."""
        while True:
            request = self.in_channel.receive()
            stackless.tasklet(self._send_email_tasklet)(request)

    @staticmethod
    def _log_email(request, message, level="info"):
        """Log ``message`` globally and once per account on the request.

        ``level`` names the logger method to use; an unknown name falls back
        to ``logger.debug``.
        """
        this_logger = getattr(logger, level, logger.debug)

        # First the global copy
        this_logger(message)

        # Then send it to the accounts involved
        for account in request.accounts:
            this_logger(AccountLogMessage(account, message))

    def _send_email_tasklet(self, request):
        """Send one e-mail via SES and post the outcome on request.channel.

        The value posted is the SES ``send_email`` response on success, the
        error message string for a ``ClientError``, or
        ``{"Error": {"Message": ...}}`` for any other exception.
        """
        response = {}
        try:
            self._log_email(
                request,
                "Sending email from %s to %s" % (request.from_, request.to))

            # Derive whichever body is missing from the one supplied.
            if not request.body_html:
                if not request.body_text:
                    raise Exception("No body given")
                request.body_html = self.htmlize_body(request.body_text)

            if not request.body_text:
                request.body_text = self.unhtmlize_body(request.body_html)

            email = {
                "Destination": {
                    "ToAddresses": [
                        request.to,
                    ],
                },
                "Source": request.from_,
                "Message": {
                    "Body": {
                        "Html": {
                            "Charset": "utf-8",
                            "Data": request.body_html,
                        },
                        "Text": {
                            "Charset": "utf-8",
                            "Data": request.body_text,
                        },
                    },
                    "Subject": {
                        "Charset": "utf-8",
                        "Data": request.subject,
                    },
                },
                "ConfigurationSetName": "havokmud",
            }

            # This is necessary for SES to send the email.
            # When using localstack, we "verify" as we go.
            # In real life, the domain should be verified before using this,
            # preferably by using Easy DKIM.
            if self.mocked:
                self.ses.verify_email_identity(EmailAddress=request.from_)

            response = self.ses.send_email(**email)
            self._log_email(
                request,
                "Email sent: MessageID: %s" % response.get("MessageId", None))
        except ClientError as e:
            # Look the message up once, defensively: a KeyError raised here
            # would skip the channel send below and leave the caller of
            # send_email() blocked forever.
            message = e.response.get("Error", {}).get("Message",
                                                      "unknown error")
            self._log_email(request, "Error sending email: %s" % message,
                            level="error")
            # NOTE(review): this branch reports a plain string while the
            # generic branch reports a dict; kept as-is for existing callers.
            response = message
        except Exception as e:
            self._log_email(request, "Error sending email: %s" % str(e),
                            level="exception")
            response = {"Error": {"Message": str(e)}}

        request.channel.send(response)

    def htmlize_body(self, text):
        """Wrap plain text in an HTML document, turning URLs into links.

        Everything outside the links is HTML-escaped. A URL without a
        "scheme://" prefix is linked as "http://<url>". Returns ``str``.
        """
        # Want to put links on URLs, and make the rest html-safe
        with self.extractor_lock:
            self.extractor.update_when_older(7)

            remaining = text
            parts = [b"<html><body>"]
            while remaining:
                urls = self.extractor.find_urls(remaining)
                if not urls:
                    # No more URLs: the tail is plain text.
                    parts.append(self._escape(remaining))
                    break

                # Handle the first URL, then rescan what follows it.
                url = urls[0]
                start_index = remaining.index(url)
                end_index = start_index + len(url)
                parts.append(self._escape(remaining[:start_index]))

                match = self.url_re.match(url)
                if not match:
                    parts.append(self._escape(url))
                else:
                    if not match.group("type"):
                        url = "http://%s" % url
                    # Escape and encode the URL before splicing it into the
                    # bytes output: formatting a str into a bytes template
                    # fails on Python 3, and an unescaped URL is not safe
                    # inside the href attribute.
                    link = self._escape(url)
                    parts.append(b'<a href="' + link + b'">' + link + b'</a>')

                remaining = remaining[end_index:]

            parts.append(b"</body></html>")

        return b"".join(parts).decode("utf-8")

    @staticmethod
    def _escape(text):
        """HTML-escape ``text`` and return it as ASCII bytes.

        Non-ASCII characters become numeric character references.
        """
        return html.escape(text).encode("ascii", "xmlcharrefreplace")

    def unhtmlize_body(self, html_):
        """Convert an HTML body to Markdown text with html2text."""
        # Strip this down to Markdown text
        with self.html2text_lock:
            return self.html2text.handle(html_)