class LinkChecker:
    def __init__(self, bot, run_later):
        if "safebrowsingapi" in bot.config["main"]:
            self.safeBrowsingAPI = SafeBrowsingAPI(bot.config["main"]["safebrowsingapi"], bot.nickname, bot.version)
        else:
            self.safeBrowsingAPI = None

        try:
            self.sqlconn = pymysql.connect(
                unix_socket=bot.config["sql"]["unix_socket"],
                user=bot.config["sql"]["user"],
                passwd=bot.config["sql"]["passwd"],
                charset="utf8",
            )
            self.sqlconn.select_db(bot.config["sql"]["db"])
        except pymysql.err.OperationalError as e:
            error_code, error_message = e.args
            if error_code == 1045:
                log.error(
                    "Access denied to database with user '{0}'. Review your config file.".format(
                        bot.config["sql"]["user"]
                    )
                )
            elif error_code == 1044:
                log.error(
                    "Access denied to database '{0}' with user '{1}'. Make sure the database '{0}' exists and user '{1}' has full access to it.".format(
                        bot.config["sql"]["db"], bot.config["sql"]["user"]
                    )
                )
            else:
                log.error(e)
            sys.exit(1)

        self.sqlconn.autocommit(True)

        self.regex = re.compile(
            r"((http:\/\/)|\b)(\w|\.)*\.(((aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|[a-zA-Z]{2})\/\S*)|((aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|[a-zA-Z]{2}))\b)",
            re.IGNORECASE,
        )
        self.run_later = run_later
        # cache[url] = True means the url is safe, False means the link is bad
        self.cache = LinkCheckerCache()

    def delete_from_cache(self, url):
        if url in self.cache:
            log.debug("LinkChecker: Removing url {0} from cache".format(url))
            del self.cache[url]

    def cache_url(self, url, safe):
        if url in self.cache and self.cache[url] == safe:
            return

        log.debug("LinkChecker: Caching url {0}".format(url))
        self.cache[url] = safe
        # evict the cached verdict after 20 seconds
        self.run_later(20, self.delete_from_cache, (url,))

    def counteract_bad_url(self, url, action=None, want_to_cache=True, want_to_blacklist=True):
        log.debug("LinkChecker: BAD URL FOUND {0}".format(url.url))
        if action:
            action.run()
        if want_to_cache:
            self.cache_url(url.url, False)
        if want_to_blacklist:
            self.blacklist_url(url.url, url.parsed)

    def unlist_url(self, url, list_type, parsed_url=None):
        """list_type is either 'blacklist' or 'whitelist'"""
        if not (url.startswith("http://") or url.startswith("https://")):
            url = "http://" + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        self.sqlconn.ping()
        cursor = self.sqlconn.cursor()

        domain = parsed_url.netloc
        path = parsed_url.path

        if domain.startswith("www."):
            domain = domain[4:]
        if path.endswith("/"):
            path = path[:-1]
        if path == "":
            path = "/"

        cursor.execute(
            "DELETE FROM `tb_link_" + list_type + "` WHERE `domain`=%s AND `path`=%s",
            (domain, path),
        )

    def blacklist_url(self, url, parsed_url=None, level=1):
        if not (url.lower().startswith("http://") or url.lower().startswith("https://")):
            url = "http://" + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        if self.is_blacklisted(url, parsed_url):
            return

        self.sqlconn.ping()
        cursor = self.sqlconn.cursor()

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith("www."):
            domain = domain[4:]
        if path.endswith("/"):
            path = path[:-1]
        if path == "":
            path = "/"

        cursor.execute("INSERT INTO `tb_link_blacklist` VALUES(%s, %s, %s)", (domain, path, level))

    def whitelist_url(self, url, parsed_url=None):
        if not (url.lower().startswith("http://") or url.lower().startswith("https://")):
            url = "http://" + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        if self.is_whitelisted(url, parsed_url):
            return

        self.sqlconn.ping()
        cursor = self.sqlconn.cursor()

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith("www."):
            domain = domain[4:]
        if path.endswith("/"):
            path = path[:-1]
        if path == "":
            path = "/"

        cursor.execute("INSERT INTO `tb_link_whitelist` VALUES(%s, %s)", (domain, path))

    def is_blacklisted(self, url, parsed_url=None, sublink=False):
        self.sqlconn.ping()
        cursor = self.sqlconn.cursor(pymysql.cursors.DictCursor)
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == "":
            path = "/"

        domain_split = domain.split(".")
        if len(domain_split) < 2:
            return False

        domain_tail = domain_split[-2] + "." + domain_split[-1]

        cursor.execute(
            "SELECT * FROM `tb_link_blacklist` WHERE `domain` LIKE %s OR `domain`=%s",
            ("%" + domain_tail, domain),
        )

        for row in cursor:
            if is_subdomain(domain, row["domain"]):
                if is_subpath(path, row["path"]):
                    if not sublink:
                        return True
                    elif row["level"] >= 1:
                        # if it's a sublink but the blacklisting level is 0, we don't consider it blacklisted
                        return True

        return False

    def is_whitelisted(self, url, parsed_url=None):
        self.sqlconn.ping()
        cursor = self.sqlconn.cursor(pymysql.cursors.DictCursor)
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == "":
            path = "/"

        domain_split = domain.split(".")
        if len(domain_split) < 2:
            return False

        domain_tail = domain_split[-2] + "." + domain_split[-1]

        cursor.execute(
            "SELECT * FROM `tb_link_whitelist` WHERE `domain` LIKE %s OR `domain`=%s",
            ("%" + domain_tail, domain),
        )

        for row in cursor:
            if is_subdomain(domain, row["domain"]):
                if is_subpath(path, row["path"]):
                    return True

        return False

    def basic_check(self, url, action, sublink=False):
        """
        Return values:
         1 = Link is OK
        -1 = Link is bad
         0 = Link needs further analysis
        """
        if url.url in self.cache:
            log.debug("LinkChecker: Url {0} found in cache".format(url.url))
            if not self.cache[url.url]:  # link is bad
                self.counteract_bad_url(url, action, False, False)
                return -1
            return 1

        if self.is_whitelisted(url.url, url.parsed):
            log.debug("LinkChecker: Url {0} allowed by the whitelist".format(url.url))
            self.cache_url(url.url, True)
            return 1

        if self.is_blacklisted(url.url, url.parsed, sublink):
            log.debug("LinkChecker: Url {0} is blacklisted".format(url.url))
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            return -1

        return 0

    def check_url(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split(".")) < 2:
            # The URL is broken, ignore it
            return

        try:
            self._check_url(url, action)
        except:
            log.exception("LinkChecker: unhandled exception in _check_url")

    def _check_url(self, url, action):
        self.sqlconn.ping()
        log.debug("LinkChecker: Checking url {0}".format(url.url))

        # basic_check returns 1 (ok) or -1 (bad, already handled); only 0 needs further analysis
        if self.basic_check(url, action):
            return

        connection_timeout = 2
        read_timeout = 1
        try:
            r = requests.head(url.url, allow_redirects=True, timeout=connection_timeout)
        except:
            self.cache_url(url.url, True)
            return

        checkcontenttype = "content-type" in r.headers and r.headers["content-type"] == "application/octet-stream"
        # the standard header for forced downloads is Content-Disposition: attachment
        checkdispotype = "content-disposition" in r.headers and r.headers["content-disposition"].startswith("attachment")

        if checkcontenttype or checkdispotype:  # triggering a download is not allowed
            self.counteract_bad_url(url, action)
            return

        redirected_url = Url(r.url)
        if not is_same_url(url, redirected_url):
            if self.basic_check(redirected_url, action):
                return

        if self.safeBrowsingAPI:
            if self.safeBrowsingAPI.check_url(redirected_url.url):  # harmful url detected
                log.debug("Bad url according to the google safe browsing api")
                self.counteract_bad_url(url, action)
                self.counteract_bad_url(redirected_url)
                return

        if "content-type" not in r.headers or not r.headers["content-type"].startswith("text/html"):
            return  # can't analyze non-html content

        maximum_size = 1024 * 1024 * 10  # 10 MB
        receive_timeout = 3

        html = ""
        try:
            response = requests.get(url=url.url, stream=True, timeout=(connection_timeout, read_timeout))

            content_length = response.headers.get("Content-Length")
            if content_length and int(content_length) > maximum_size:
                log.error("This file is too big!")
                return

            size = 0
            start = time.time()

            for chunk in response.iter_content(1024):
                if time.time() - start > receive_timeout:
                    log.error("The site took too long to load")
                    return

                size += len(chunk)
                if size > maximum_size:
                    log.error("This file is too big! (fake header)")
                    return
                # iter_content yields bytes; decode them so BeautifulSoup sees real markup
                html += chunk.decode("utf-8", errors="ignore")
        except requests.exceptions.ConnectTimeout:
            log.error("Connection timed out while checking {0}".format(url.url))
            self.cache_url(url.url, True)
            return
        except requests.exceptions.ReadTimeout:
            log.error("Reading timed out while checking {0}".format(url.url))
            self.cache_url(url.url, True)
            return
        except:
            log.exception("Unhandled exception")
            return

        try:
            soup = BeautifulSoup(html, "html.parser")
        except:
            return

        original_url = url
        original_redirected_url = redirected_url

        urls = []
        for link in soup.find_all("a"):  # get a list of links to external sites
            url = link.get("href")
            if url is None:
                continue

            if url.startswith("//"):
                urls.append("http:" + url)
            elif url.startswith("http://") or url.startswith("https://"):
                urls.append(url)

        for url in urls:  # check if the site links to anything dangerous
            url = Url(url)

            if is_subdomain(url.parsed.netloc, original_url.parsed.netloc):
                # skip internal links
                continue

            log.debug("Checking sublink {0}".format(url.url))
            res = self.basic_check(url, action, sublink=True)
            if res == -1:
                self.counteract_bad_url(url)
                self.counteract_bad_url(original_url, want_to_blacklist=False)
                self.counteract_bad_url(original_redirected_url, want_to_blacklist=False)
                return
            elif res == 1:
                continue

            try:
                r = requests.head(url.url, allow_redirects=True, timeout=connection_timeout)
            except:
                continue

            redirected_url = Url(r.url)
            if not is_same_url(url, redirected_url):
                res = self.basic_check(redirected_url, action, sublink=True)
                if res == -1:
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(original_url, want_to_blacklist=False)
                    self.counteract_bad_url(original_redirected_url, want_to_blacklist=False)
                    return
                elif res == 1:
                    continue

            if self.safeBrowsingAPI:
                if self.safeBrowsingAPI.check_url(redirected_url.url):  # harmful url detected
                    log.debug("Evil sublink {0} according to the google safe browsing api".format(url.url))
                    self.counteract_bad_url(original_url, action)
                    self.counteract_bad_url(original_redirected_url)
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(redirected_url)
                    return

        # if we got here, the site is clean by our standards
        self.cache_url(original_url.url, True)
        self.cache_url(original_redirected_url.url, True)

    def find_urls_in_message(self, msg_raw):
        _urls = self.regex.finditer(msg_raw)
        urls = []
        for i in _urls:
            url = i.group(0)
            if not (url.startswith("http://") or url.startswith("https://")):
                url = "http://" + url
            # strip a trailing character that is not part of the url (e.g. punctuation)
            if not (url[-1].isalpha() or url[-1].isnumeric() or url[-1] == "/"):
                url = url[:-1]

            urls.append(url)

        return set(urls)
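# --- Illustrative sketch, not part of the original module ---
# LinkChecker only assumes that `run_later(delay, callback, args)` schedules
# callback(*args) after `delay` seconds (see cache_url above). The bot
# presumably provides its own scheduler; the threading.Timer-based stand-in
# below is a minimal sketch of that contract, and the names `example_run_later`
# and the commented wiring (`bot`, `msg_raw`, `action`) are assumptions.

import threading


def example_run_later(delay, callback, args=()):
    """Illustrative scheduler matching the signature LinkChecker expects."""
    timer = threading.Timer(delay, callback, args=args)
    timer.daemon = True  # don't keep the process alive just for cache eviction
    timer.start()
    return timer


# Hypothetical wiring from a message handler; the `action` object only needs
# a .run() method (e.g. a timeout action):
#
#     link_checker = LinkChecker(bot, example_run_later)
#     for url in link_checker.find_urls_in_message(msg_raw):
#         link_checker.check_url(url, action)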