Example #1
0
    def enable(self, bot):
        self.bot = bot
        pajbot.managers.handler.HandlerManager.add_handler('on_message',
                                                           self.on_message,
                                                           priority=100)
        pajbot.managers.handler.HandlerManager.add_handler(
            'on_commit', self.on_commit)
        if bot:
            self.run_later = bot.execute_delayed

            if 'safebrowsingapi' in bot.config['main']:
                # XXX: This should be loaded as a setting instead.
                # Settings need a "password" type so values like this
                # aren't displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(
                    bot.config['main']['safebrowsingapi'], bot.nickname,
                    bot.version)
            else:
                self.safeBrowsingAPI = None

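        # Flush and discard any previous session, then reload both link lists
        # from the database so the in-memory state matches what is persisted.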
        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)
Example #2
0
    def enable(self, bot):
        self.bot = bot
        if bot:
            bot.add_handler('on_message', self.on_message, priority=100)
            bot.add_handler('on_commit', self.on_commit)
            self.run_later = bot.execute_delayed

            if 'safebrowsingapi' in bot.config['main']:
                # XXX: This should be loaded as a setting instead.
                # Settings need a "password" type so values like this
                # aren't displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(bot.config['main']['safebrowsingapi'], bot.nickname, bot.version)
            else:
                self.safeBrowsingAPI = None

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)
Example #3
0
class LinkCheckerModule(BaseModule):

    ID = __name__.split('.')[-1]
    NAME = 'Link Checker'
    DESCRIPTION = 'Checks if posted links are bad'
    ENABLED_DEFAULT = True
    SETTINGS = []

    def __init__(self):
        super().__init__()
        self.db_session = None
        self.links = {}

        self.blacklisted_links = []
        self.whitelisted_links = []

        self.cache = LinkCheckerCache()  # cache[url] = True means url is safe, False means the link is bad

        self.action_queue = ActionQueue()
        self.action_queue.start()

    def enable(self, bot):
        self.bot = bot
        if bot:
            bot.add_handler('on_message', self.on_message, priority=100)
            bot.add_handler('on_commit', self.on_commit)
            self.run_later = bot.execute_delayed

            if 'safebrowsingapi' in bot.config['main']:
                # XXX: This should be loaded as a setting instead.
                # Settings need a "password" type so values like this
                # aren't displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(bot.config['main']['safebrowsingapi'], bot.nickname, bot.version)
            else:
                self.safeBrowsingAPI = None

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)

    def disable(self, bot):
        if bot:
            bot.remove_handler('on_message', self.on_message)
            bot.remove_handler('on_commit', self.on_commit)

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
            self.blacklisted_links = []
            self.whitelisted_links = []

    def reload(self):

        log.info('Loaded {0} bad links and {1} good links'.format(len(self.blacklisted_links), len(self.whitelisted_links)))
        return self

    def on_message(self, source, message, emotes, whisper, urls):
        if not whisper and source.level < 500 and source.moderator is False:
            for url in urls:
                # Action which will be taken when a bad link is found
                action = Action(self.bot.timeout, args=[source.username, 20])
                # First we perform a basic check
                if self.simple_check(url, action) == self.RET_FURTHER_ANALYSIS:
                    # If the basic check returns no relevant data, we queue up a proper check on the URL
                    self.action_queue.add(self.check_url, args=[url, action])

    def on_commit(self):
        if self.db_session is not None:
            self.db_session.commit()

    def delete_from_cache(self, url):
        if url in self.cache:
            log.debug("LinkChecker: Removing url {0} from cache".format(url))
            del self.cache[url]

    def cache_url(self, url, safe):
        if url in self.cache and self.cache[url] == safe:
            return

        log.debug("LinkChecker: Caching url {0} as {1}".format(url, 'SAFE' if safe is True else 'UNSAFE'))
        self.cache[url] = safe
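        # Cached verdicts are short-lived: drop the entry again after 20 seconds.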
        self.run_later(20, self.delete_from_cache, (url, ))

    def counteract_bad_url(self, url, action=None, want_to_cache=True, want_to_blacklist=True):
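        # Run the punishment action, cache the URL as unsafe, and optionally
        # persist it to the blacklist.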
        log.debug("LinkChecker: BAD URL FOUND {0}".format(url.url))
        if action:
            action.run()
        if want_to_cache:
            self.cache_url(url.url, False)
        if want_to_blacklist:
            self.blacklist_url(url.url, url.parsed)

    def unlist_url(self, url, list_type, parsed_url=None):
        """ list_type is either 'blacklist' or 'whitelist' """
        if not (url.startswith('http://') or url.startswith('https://')):
            url = 'http://' + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        domain = parsed_url.netloc
        path = parsed_url.path

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        if list_type == 'blacklist':
            link = self.db_session.query(BlacklistedLink).filter_by(domain=domain, path=path).one_or_none()
            if link:
                self.blacklisted_links.remove(link)
                self.db_session.delete(link)
            else:
                log.warning('Unable to unlist {0}{1}'.format(domain, path))
        elif list_type == 'whitelist':
            link = self.db_session.query(WhitelistedLink).filter_by(domain=domain, path=path).one_or_none()
            if link:
                self.whitelisted_links.remove(link)
                self.db_session.delete(link)
            else:
                log.warning('Unable to unlist {0}{1}'.format(domain, path))

    def blacklist_url(self, url, parsed_url=None, level=1):
        if not (url.lower().startswith('http://') or url.lower().startswith('https://')):
            url = 'http://' + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        if self.is_blacklisted(url, parsed_url):
            return False

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

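        # Normalize: drop a leading 'www.' and a trailing slash so stored
        # entries match however the link was typed; an empty path becomes '/'.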
        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = BlacklistedLink(domain, path, level)
        self.db_session.add(link)
        self.blacklisted_links.append(link)
        return True

    def whitelist_url(self, url, parsed_url=None):
        if not (url.lower().startswith('http://') or url.lower().startswith('https://')):
            url = 'http://' + url
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        if self.is_whitelisted(url, parsed_url):
            return

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = WhitelistedLink(domain, path)
        self.db_session.add(link)
        self.whitelisted_links.append(link)

    def is_blacklisted(self, url, parsed_url=None, sublink=False):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

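        # A netloc without at least one dot cannot be a real domain; ignore it.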
        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.blacklisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    if not sublink:
                        return True
                    elif link.level >= 1:  # if it's a sublink, but the blacklisting level is 0, we don't consider it blacklisted
                        return True

        return False

    def is_whitelisted(self, url, parsed_url=None):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.whitelisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    return True

        return False

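    # Verdicts shared by basic_check, simple_check and check_url.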
    RET_BAD_LINK = -1
    RET_FURTHER_ANALYSIS = 0
    RET_GOOD_LINK = 1

    def basic_check(self, url, action, sublink=False):
        """
        Check whether the url is in the cache, blacklisted, or whitelisted.
        Return values:
        1 = Link is OK
        -1 = Link is bad
        0 = Link needs further analysis
        """
        if url.url in self.cache:
            log.debug("LinkChecker: Url {0} found in cache".format(url.url))
            if not self.cache[url.url]:  # link is bad
                self.counteract_bad_url(url, action, False, False)
                return self.RET_BAD_LINK
            return self.RET_GOOD_LINK

        log.info('Checking if link is blacklisted...')
        if self.is_blacklisted(url.url, url.parsed, sublink):
            log.debug("LinkChecker: Url {0} is blacklisted".format(url.url))
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            return self.RET_BAD_LINK

        log.info('Checking if link is whitelisted...')
        if self.is_whitelisted(url.url, url.parsed):
            log.debug("LinkChecker: Url {0} allowed by the whitelist".format(url.url))
            self.cache_url(url.url, True)
            return self.RET_GOOD_LINK

        return self.RET_FURTHER_ANALYSIS

    def simple_check(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return self.RET_FURTHER_ANALYSIS

        return self.basic_check(url, action)

    def check_url(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return

        try:
            self._check_url(url, action)
        except:
            log.exception("LinkChecker unhanled exception while _check_url")

    def _check_url(self, url, action):
        log.debug("LinkChecker: Checking url {0}".format(url.url))

        # XXX: The basic check is currently performed twice on links found in messages. Solve
        res = self.basic_check(url, action)
        if res == self.RET_GOOD_LINK:
            return
        elif res == self.RET_BAD_LINK:
            return

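        # HEAD follows redirects without downloading the body; if the request
        # fails, the link gets the benefit of the doubt and is cached as safe.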
        connection_timeout = 2
        read_timeout = 1
        try:
            r = requests.head(url.url, allow_redirects=True, timeout=connection_timeout)
        except:
            self.cache_url(url.url, True)
            return

        checkcontenttype = ('content-type' in r.headers and r.headers['content-type'] == 'application/octet-stream')
        # the standard header is 'content-disposition'; a value starting with
        # 'attachment' means following the link would trigger a file download
        checkdispotype = ('content-disposition' in r.headers and r.headers['content-disposition'].startswith('attachment'))

        if checkcontenttype or checkdispotype:  # triggering a download is not allowed
            self.counteract_bad_url(url, action)
            return

        redirected_url = Url(r.url)
        if is_same_url(url, redirected_url) is False:
            res = self.basic_check(redirected_url, action)
            if res == self.RET_GOOD_LINK:
                return
            elif res == self.RET_BAD_LINK:
                return

        if self.safeBrowsingAPI:
            if self.safeBrowsingAPI.check_url(redirected_url.url):  # harmful url detected
                log.debug("Bad url because google api")
                self.counteract_bad_url(url, action)
                self.counteract_bad_url(redirected_url)
                return

        if 'content-type' not in r.headers or not r.headers['content-type'].startswith('text/html'):
            return  # can't analyze non-html content
        maximum_size = 1024 * 1024 * 10  # 10 MB
        receive_timeout = 3

        html = ''
        try:
            response = requests.get(url=url.url, stream=True, timeout=(connection_timeout, read_timeout))

            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > maximum_size:
                log.error('This file is too big!')
                return

            size = 0
            start = time.time()

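            # Stream the body in 1 KB chunks so we can abort early on slow
            # responders and on bodies larger than the header advertised.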
            for chunk in response.iter_content(1024):
                if time.time() - start > receive_timeout:
                    log.error('The site took too long to load')
                    return

                size += len(chunk)
                if size > maximum_size:
                    log.error('This file is too big! (fake header)')
                    return
                html += chunk.decode('utf-8', errors='ignore')  # chunks arrive as bytes; decode before appending

        except requests.exceptions.ConnectTimeout:
            log.warning('Connection timed out while checking {0}'.format(url.url))
            self.cache_url(url.url, True)
            return
        except requests.exceptions.ReadTimeout:
            log.warning('Reading timed out while checking {0}'.format(url.url))
            self.cache_url(url.url, True)
            return
        except:
            log.exception('Unhandled exception')
            return

        try:
            soup = BeautifulSoup(html, 'html.parser')
        except:
            return

        original_url = url
        original_redirected_url = redirected_url
        urls = []
        for link in soup.find_all('a'):  # get a list of links to external sites
            url = link.get('href')
            if url is None:
                continue
            if url.startswith('//'):
                urls.append('http:' + url)
            elif url.startswith('http://') or url.startswith('https://'):
                urls.append(url)

        for url in urls:  # check if the site links to anything dangerous
            url = Url(url)

            if is_subdomain(url.parsed.netloc, original_url.parsed.netloc):
                # log.debug("Skipping because internal link")
                continue

            log.debug("Checking sublink {0}".format(url.url))
            res = self.basic_check(url, action, sublink=True)
            if res == self.RET_BAD_LINK:
                self.counteract_bad_url(url)
                self.counteract_bad_url(original_url, want_to_blacklist=False)
                self.counteract_bad_url(original_redirected_url, want_to_blacklist=False)
                return
            elif res == self.RET_GOOD_LINK:
                continue

            try:
                r = requests.head(url.url, allow_redirects=True, timeout=connection_timeout)
            except:
                continue

            redirected_url = Url(r.url)
            if not is_same_url(url, redirected_url):
                res = self.basic_check(redirected_url, action, sublink=True)
                if res == self.RET_BAD_LINK:
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(original_url, want_to_blacklist=False)
                    self.counteract_bad_url(original_redirected_url, want_to_blacklist=False)
                    return
                elif res == self.RET_GOOD_LINK:
                    continue

            if self.safeBrowsingAPI:
                if self.safeBrowsingAPI.check_url(redirected_url.url):  # harmful url detected
                    log.debug("Evil sublink {0} by google API".format(url))
                    self.counteract_bad_url(original_url, action)
                    self.counteract_bad_url(original_redirected_url)
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(redirected_url)
                    return

        # if we got here, the site is clean for our standards
        self.cache_url(original_url.url, True)
        self.cache_url(original_redirected_url.url, True)
        return

    def load_commands(self, **options):
        self.commands['add'] = Command.multiaction_command(
                level=100,
                delay_all=0,
                delay_user=0,
                default=None,
                command='add',
                commands={
                    'link': Command.multiaction_command(
                        level=500,
                        delay_all=0,
                        delay_user=0,
                        default=None,
                        commands={
                            'blacklist': Command.raw_command(self.add_link_blacklist,
                                level=500,
                                description='Blacklist a link',
                                examples=[
                                    CommandExample(None, 'Add a link to the blacklist for shallow search',
                                        chat='user:!add link blacklist 0 scamlink.lonk/\n'
                                        'bot>user:Successfully added your links',
                                        description='Added the link scamlink.lonk/ to the blacklist for a shallow search').parse(),
                                    CommandExample(None, 'Add a link to the blacklist for deep search',
                                        chat='user:!add link blacklist 1 scamlink.lonk/\n'
                                        'bot>user:Successfully added your links',
                                        description='Added the link scamlink.lonk/ to the blacklist for a deep search').parse(),
                                    ]),
                            'whitelist': Command.raw_command(self.add_link_whitelist,
                                level=500,
                                description='Whitelist a link',
                                examples=[
                                    CommandExample(None, 'Add a link to the whitelist',
                                        chat='user:!add link whitelist safelink.lonk/\n'
                                        'bot>user:Successfully added your links',
                                        description='Added the link safelink.lonk/ to the whitelist').parse(),
                                    ]),
                            }
                        )
                    }
                )

        self.commands['remove'] = Command.multiaction_command(
                level=100,
                delay_all=0,
                delay_user=0,
                default=None,
                command='remove',
                commands={
                    'link': Command.multiaction_command(
                        level=500,
                        delay_all=0,
                        delay_user=0,
                        default=None,
                        commands={
                            'blacklist': Command.raw_command(self.remove_link_blacklist,
                                level=500,
                                description='Unblacklist a link',
                                examples=[
                                    CommandExample(None, 'Remove a blacklist link',
                                        chat='user:!remove link blacklist scamtwitch.scam\n'
                                        'bot>user:Successfully removed your links',
                                        description='Removes scamtwitch.scam as a blacklisted link').parse(),
                                    ]),
                            'whitelist': Command.raw_command(self.remove_link_whitelist,
                                level=500,
                                description='Unwhitelist a link',
                                examples=[
                                    CommandExample(None, 'Remove a whitelist link',
                                        chat='user:!remove link whitelist twitch.safe\n'
                                        'bot>user:Successfully removed your links',
                                        description='Removes twitch.safe as a whitelisted link').parse(),
                                    ]),
                            }
                        ),
                    }
                )

    def add_link_blacklist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            if not parts[0].isnumeric():
                for link in parts:
                    self.blacklist_url(link)
            else:
                for link in parts[1:]:
                    self.blacklist_url(link, level=int(parts[0]))
        except:
            log.exception('Unhandled exception in add_link_blacklist')
            bot.whisper(source.username, 'Some error occurred while adding your links')
            return False

        bot.whisper(source.username, 'Successfully added your links')

    def add_link_whitelist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.whitelist_url(link)
        except:
            log.exception('Unhandled exception in add_link_whitelist')
            bot.whisper(source.username, 'Some error occurred while adding your links')
            return False

        bot.whisper(source.username, 'Successfully added your links')

    def remove_link_blacklist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.unlist_url(link, 'blacklist')
        except:
            log.exception('Unhandled exception in remove_link_blacklist')
            bot.whisper(source.username, 'Some error occurred while removing your links')
            return False

        bot.whisper(source.username, 'Successfully removed your links')

    def remove_link_whitelist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.unlist_url(link, 'whitelist')
        except:
            log.exception('Unhandled exception in remove_link_whitelist')
            bot.whisper(source.username, 'Some error occurred while removing your links')
            return False

        bot.whisper(source.username, 'Successfully removed your links')
Example #4
0
class LinkCheckerModule(BaseModule):

    ID = __name__.split('.')[-1]
    NAME = 'Link Checker'
    DESCRIPTION = 'Checks if posted links are bad'
    ENABLED_DEFAULT = True
    CATEGORY = 'Filter'
    SETTINGS = [
        ModuleSetting(key='ban_pleb_links',
                      label='Disallow links from non-subscribers',
                      type='boolean',
                      required=True,
                      default=False)
    ]

    def __init__(self):
        super().__init__()
        self.db_session = None
        self.links = {}

        self.blacklisted_links = []
        self.whitelisted_links = []

        self.cache = LinkCheckerCache()  # cache[url] = True means url is safe, False means the link is bad

        self.action_queue = ActionQueue()
        self.action_queue.start()

    def enable(self, bot):
        self.bot = bot
        pajbot.managers.handler.HandlerManager.add_handler('on_message',
                                                           self.on_message,
                                                           priority=100)
        pajbot.managers.handler.HandlerManager.add_handler(
            'on_commit', self.on_commit)
        if bot:
            self.run_later = bot.execute_delayed

            if 'safebrowsingapi' in bot.config['main']:
                # XXX: This should be loaded as a setting instead.
                # Settings need a "password" type so values like this
                # aren't displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(
                    bot.config['main']['safebrowsingapi'], bot.nickname,
                    bot.version)
            else:
                self.safeBrowsingAPI = None

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)

    def disable(self, bot):
        pajbot.managers.handler.HandlerManager.remove_handler(
            'on_message', self.on_message)
        pajbot.managers.handler.HandlerManager.remove_handler(
            'on_commit', self.on_commit)

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
            self.blacklisted_links = []
            self.whitelisted_links = []

    def reload(self):

        log.info('Loaded {0} bad links and {1} good links'.format(
            len(self.blacklisted_links), len(self.whitelisted_links)))
        return self

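    # Domains non-subscribers may always link, even with ban_pleb_links enabled.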
    super_whitelist = ['pajlada.se', 'pajlada.com', 'forsen.tv', 'pajbot.com']

    def on_message(self, source, message, emotes, whisper, urls, event):
        if not whisper and source.level < 500 and source.moderator is False:
            if self.settings['ban_pleb_links'] is True and source.subscriber is False and len(urls) > 0:
                # Check if the links are in our super-whitelist, e.g. on the pajlada.se domain or forsen.tv
                for url in urls:
                    parsed_url = Url(url)
                    if len(parsed_url.parsed.netloc.split('.')) < 2:
                        continue
                    whitelisted = False
                    for whitelist in self.super_whitelist:
                        if is_subdomain(parsed_url.parsed.netloc, whitelist):
                            whitelisted = True
                            break
                    if whitelisted is False:
                        self.bot.timeout(source.username,
                                         30,
                                         reason='Non-subs cannot post links')
                        if source.minutes_in_chat_online > 60:
                            self.bot.whisper(
                                source.username,
                                'You cannot post non-verified links in chat if you\'re not a subscriber.'
                            )
                        return False

            for url in urls:
                # Action which will be taken when a bad link is found
                action = Action(self.bot.timeout,
                                args=[source.username, 20],
                                kwargs={'reason': 'Banned link'})
                # First we perform a basic check
                if self.simple_check(url, action) == self.RET_FURTHER_ANALYSIS:
                    # If the basic check returns no relevant data, we queue up a proper check on the URL
                    self.action_queue.add(self.check_url, args=[url, action])

    def on_commit(self):
        if self.db_session is not None:
            self.db_session.commit()

    def delete_from_cache(self, url):
        if url in self.cache:
            log.debug('LinkChecker: Removing url {0} from cache'.format(url))
            del self.cache[url]

    def cache_url(self, url, safe):
        if url in self.cache and self.cache[url] == safe:
            return

        log.debug('LinkChecker: Caching url {0} as {1}'.format(
            url, 'SAFE' if safe is True else 'UNSAFE'))
        self.cache[url] = safe
        self.run_later(20, self.delete_from_cache, (url, ))

    def counteract_bad_url(self,
                           url,
                           action=None,
                           want_to_cache=True,
                           want_to_blacklist=False):
        log.debug('LinkChecker: BAD URL FOUND {0}'.format(url.url))
        if action:
            action.run()
        if want_to_cache:
            self.cache_url(url.url, False)
        if want_to_blacklist:
            self.blacklist_url(url.url, url.parsed)
            return True

    def blacklist_url(self, url, parsed_url=None, level=0):
        if not (url.lower().startswith('http://')
                or url.lower().startswith('https://')):
            url = 'http://' + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        if self.is_blacklisted(url, parsed_url):
            return False

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = BlacklistedLink(domain, path, level)
        self.db_session.add(link)
        self.blacklisted_links.append(link)
        self.db_session.commit()

    def whitelist_url(self, url, parsed_url=None):
        if not (url.lower().startswith('http://')
                or url.lower().startswith('https://')):
            url = 'http://' + url
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        if self.is_whitelisted(url, parsed_url):
            return

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = WhitelistedLink(domain, path)
        self.db_session.add(link)
        self.whitelisted_links.append(link)
        self.db_session.commit()

    def is_blacklisted(self, url, parsed_url=None, sublink=False):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.blacklisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    if not sublink:
                        return True
                    elif link.level >= 1:  # if it's a sublink, but the blacklisting level is 0, we don't consider it blacklisted
                        return True

        return False

    def is_whitelisted(self, url, parsed_url=None):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.whitelisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    return True

        return False

    RET_BAD_LINK = -1
    RET_FURTHER_ANALYSIS = 0
    RET_GOOD_LINK = 1

    def basic_check(self, url, action, sublink=False):
        """
        Check whether the url is in the cache, blacklisted, or whitelisted.
        Return values:
        1 = Link is OK
        -1 = Link is bad
        0 = Link needs further analysis
        """
        if url.url in self.cache:
            log.debug('LinkChecker: Url {0} found in cache'.format(url.url))
            if not self.cache[url.url]:  # link is bad
                self.counteract_bad_url(url, action, False, False)
                return self.RET_BAD_LINK
            return self.RET_GOOD_LINK

        log.info('Checking if link is blacklisted...')
        if self.is_blacklisted(url.url, url.parsed, sublink):
            log.debug('LinkChecker: Url {0} is blacklisted'.format(url.url))
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            return self.RET_BAD_LINK

        log.info('Checking if link is whitelisted...')
        if self.is_whitelisted(url.url, url.parsed):
            log.debug('LinkChecker: Url {0} allowed by the whitelist'.format(
                url.url))
            self.cache_url(url.url, True)
            return self.RET_GOOD_LINK

        return self.RET_FURTHER_ANALYSIS

    def simple_check(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return self.RET_FURTHER_ANALYSIS

        return self.basic_check(url, action)

    def check_url(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return

        try:
            self._check_url(url, action)
        except:
            log.exception('LinkChecker: unhandled exception in _check_url')

    def _check_url(self, url, action):
        log.debug('LinkChecker: Checking url {0}'.format(url.url))

        # XXX: The basic check is currently performed twice on links found in messages. Solve
        res = self.basic_check(url, action)
        if res == self.RET_GOOD_LINK:
            return
        elif res == self.RET_BAD_LINK:
            return

        connection_timeout = 2
        read_timeout = 1
        try:
            r = requests.head(url.url,
                              allow_redirects=True,
                              timeout=connection_timeout)
        except:
            self.cache_url(url.url, True)
            return

        checkcontenttype = ('content-type' in r.headers and r.headers['content-type'] == 'application/octet-stream')
        # the standard header is 'content-disposition'; a value starting with
        # 'attachment' means following the link would trigger a file download
        checkdispotype = ('content-disposition' in r.headers and r.headers['content-disposition'].startswith('attachment'))

        if checkcontenttype or checkdispotype:  # triggering a download is not allowed
            self.counteract_bad_url(url, action)
            return

        redirected_url = Url(r.url)
        if is_same_url(url, redirected_url) is False:
            res = self.basic_check(redirected_url, action)
            if res == self.RET_GOOD_LINK:
                return
            elif res == self.RET_BAD_LINK:
                return

        if self.safeBrowsingAPI:
            if self.safeBrowsingAPI.check_url(
                    redirected_url.url):  # harmful url detected
                log.debug('Bad url according to the Google Safe Browsing API')
                self.counteract_bad_url(url, action, want_to_blacklist=False)
                self.counteract_bad_url(redirected_url,
                                        want_to_blacklist=False)
                return

        if 'content-type' not in r.headers or not r.headers['content-type'].startswith('text/html'):
            return  # can't analyze non-html content
        maximum_size = 1024 * 1024 * 10  # 10 MB
        receive_timeout = 3

        html = ''
        try:
            response = requests.get(url=url.url,
                                    stream=True,
                                    timeout=(connection_timeout, read_timeout))

            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > maximum_size:
                log.error('This file is too big!')
                return

            size = 0
            start = time.time()

            for chunk in response.iter_content(1024):
                if time.time() - start > receive_timeout:
                    log.error('The site took too long to load')
                    return

                size += len(chunk)
                if size > maximum_size:
                    log.error('This file is too big! (fake header)')
                    return
                html += chunk.decode('utf-8', errors='ignore')  # chunks arrive as bytes; decode before appending

        except requests.exceptions.ConnectTimeout:
            log.warning('Connection timed out while checking {0}'.format(
                url.url))
            self.cache_url(url.url, True)
            return
        except requests.exceptions.ReadTimeout:
            log.warning('Reading timed out while checking {0}'.format(url.url))
            self.cache_url(url.url, True)
            return
        except:
            log.exception('Unhandled exception')
            return

        try:
            soup = BeautifulSoup(html, 'html.parser')
        except:
            return

        original_url = url
        original_redirected_url = redirected_url
        urls = []
        for link in soup.find_all('a'):  # get a list of links to external sites
            url = link.get('href')
            if url is None:
                continue
            if url.startswith('//'):
                urls.append('http:' + url)
            elif url.startswith('http://') or url.startswith('https://'):
                urls.append(url)

        for url in urls:  # check if the site links to anything dangerous
            url = Url(url)

            if is_subdomain(url.parsed.netloc, original_url.parsed.netloc):
                # log.debug('Skipping because internal link')
                continue

            log.debug('Checking sublink {0}'.format(url.url))
            res = self.basic_check(url, action, sublink=True)
            if res == self.RET_BAD_LINK:
                self.counteract_bad_url(url)
                self.counteract_bad_url(original_url, want_to_blacklist=False)
                self.counteract_bad_url(original_redirected_url,
                                        want_to_blacklist=False)
                return
            elif res == self.RET_GOOD_LINK:
                continue

            try:
                r = requests.head(url.url,
                                  allow_redirects=True,
                                  timeout=connection_timeout)
            except:
                continue

            redirected_url = Url(r.url)
            if not is_same_url(url, redirected_url):
                res = self.basic_check(redirected_url, action, sublink=True)
                if res == self.RET_BAD_LINK:
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(original_url,
                                            want_to_blacklist=False)
                    self.counteract_bad_url(original_redirected_url,
                                            want_to_blacklist=False)
                    return
                elif res == self.RET_GOOD_LINK:
                    continue

            if self.safeBrowsingAPI:
                if self.safeBrowsingAPI.check_url(
                        redirected_url.url):  # harmful url detected
                    log.debug('Evil sublink {0} flagged by the Google Safe Browsing API'.format(url.url))
                    self.counteract_bad_url(original_url, action)
                    self.counteract_bad_url(original_redirected_url)
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(redirected_url)
                    return

        # if we got here, the site is clean for our standards
        self.cache_url(original_url.url, True)
        self.cache_url(original_redirected_url.url, True)
        return

    def load_commands(self, **options):
        self.commands['add'] = pajbot.models.command.Command.multiaction_command(
            level=100,
            delay_all=0,
            delay_user=0,
            default=None,
            command='add',
            commands={
                'link': pajbot.models.command.Command.multiaction_command(
                    level=500,
                    delay_all=0,
                    delay_user=0,
                    default=None,
                    commands={
                        'blacklist': pajbot.models.command.Command.raw_command(
                            self.add_link_blacklist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description='Blacklist a link',
                            examples=[
                                pajbot.models.command.CommandExample(
                                    None,
                                    'Add a link to the blacklist for a shallow search',
                                    chat='user:!add link blacklist --shallow scamlink.lonk/\n'
                                    'bot>user:Successfully added your links',
                                    description='Added the link scamlink.lonk/ to the blacklist for a shallow search').parse(),
                                pajbot.models.command.CommandExample(
                                    None,
                                    'Add a link to the blacklist for a deep search',
                                    chat='user:!add link blacklist --deep scamlink.lonk/\n'
                                    'bot>user:Successfully added your links',
                                    description='Added the link scamlink.lonk/ to the blacklist for a deep search').parse(),
                            ]),
                        'whitelist': pajbot.models.command.Command.raw_command(
                            self.add_link_whitelist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description='Whitelist a link',
                            examples=[
                                pajbot.models.command.CommandExample(
                                    None,
                                    'Add a link to the whitelist',
                                    chat='user:!add link whitelist safelink.lonk/\n'
                                    'bot>user:Successfully added your links',
                                    description='Added the link safelink.lonk/ to the whitelist').parse(),
                            ]),
                    }),
            })

        self.commands['remove'] = pajbot.models.command.Command.multiaction_command(
            level=100,
            delay_all=0,
            delay_user=0,
            default=None,
            command='remove',
            commands={
                'link': pajbot.models.command.Command.multiaction_command(
                    level=500,
                    delay_all=0,
                    delay_user=0,
                    default=None,
                    commands={
                        'blacklist': pajbot.models.command.Command.raw_command(
                            self.remove_link_blacklist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description='Remove a link from the blacklist.',
                            examples=[
                                pajbot.models.command.CommandExample(
                                    None,
                                    'Remove a link from the blacklist.',
                                    chat='user:!remove link blacklist 20\n'
                                    'bot>user:Successfully removed blacklisted link with id 20',
                                    description='Remove a link from the blacklist with an ID').parse(),
                            ]),
                        'whitelist': pajbot.models.command.Command.raw_command(
                            self.remove_link_whitelist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description='Remove a link from the whitelist.',
                            examples=[
                                pajbot.models.command.CommandExample(
                                    None,
                                    'Remove a link from the whitelist.',
                                    chat='user:!remove link whitelist 12\n'
                                    'bot>user:Successfully removed whitelisted link with id 12',
                                    description='Remove a link from the whitelist with an ID').parse(),
                            ]),
                    }),
            })

    def add_link_blacklist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        options, new_links = self.parse_link_blacklist_arguments(message)

        if new_links:
            parts = new_links.split(' ')
            try:
                for link in parts:
                    if len(link) > 1:
                        self.blacklist_url(link, **options)
                        AdminLogManager.post('Blacklist link added', source,
                                             link)
                bot.whisper(source.username, 'Successfully added your links')
                return True
            except:
                log.exception('Unhandled exception in add_link_blacklist')
                bot.whisper(source.username,
                            'Some error occurred while adding your links')
                return False
        else:
            bot.whisper(source.username, 'Usage: !add link blacklist LINK')
            return False

    def add_link_whitelist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.whitelist_url(link)
                AdminLogManager.post('Whitelist link added', source, link)
        except:
            log.exception('Unhandled exception in add_link_whitelist')
            bot.whisper(source.username,
                        'Some error occurred while adding your links')
            return False

        bot.whisper(source.username, 'Successfully added your links')

    def remove_link_blacklist(self, **options):
        message = options['message']
        bot = options['bot']
        source = options['source']

        if message:
            id = None
            try:
                id = int(message)
            except ValueError:
                pass

            link = self.db_session.query(BlacklistedLink).filter_by(
                id=id).one_or_none()

            if link:
                self.blacklisted_links.remove(link)
                self.db_session.delete(link)
                self.db_session.commit()
            else:
                bot.whisper(source.username, 'No link with the given id found')
                return False

            AdminLogManager.post('Blacklist link removed', source, link.domain)
            bot.whisper(
                source.username,
                'Successfully removed blacklisted link with id {0}'.format(
                    link.id))
        else:
            bot.whisper(source.username, 'Usage: !remove link blacklist ID')
            return False

    def remove_link_whitelist(self, **options):
        message = options['message']
        bot = options['bot']
        source = options['source']

        if message:
            id = None
            try:
                id = int(message)
            except ValueError:
                pass

            link = self.db_session.query(WhitelistedLink).filter_by(
                id=id).one_or_none()

            if link:
                self.whitelisted_links.remove(link)
                self.db_session.delete(link)
                self.db_session.commit()
            else:
                bot.whisper(source.username, 'No link with the given id found')
                return False

            AdminLogManager.post('Whitelist link removed', source, link.domain)
            bot.whisper(
                source.username,
                'Successfully removed whitelisted link with id {0}'.format(
                    link.id))
        else:
            bot.whisper(source.username, 'Usage: !remove link whitelist ID')
            return False

    def parse_link_blacklist_arguments(self, message):
        parser = argparse.ArgumentParser()
        parser.add_argument('--deep', dest='level', action='store_true')
        parser.add_argument('--shallow', dest='level', action='store_false')
        parser.set_defaults(level=False)
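        # e.g. '--deep scamlink.lonk/' parses to options={'level': True} and
        # response='scamlink.lonk/'; unrecognized tokens are the links themselves.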

        try:
            args, unknown = parser.parse_known_args(message.split())
        except SystemExit:
            return False, False
        except:
            log.exception('Unhandled exception in add_link_blacklist')
            return False, False

        # Strip options of any values that are set as None
        options = {k: v for k, v in vars(args).items() if v is not None}
        response = ' '.join(unknown)

        return options, response
Example #5
0
class LinkCheckerModule(BaseModule):

    ID = __name__.split('.')[-1]
    NAME = 'Link Checker'
    DESCRIPTION = 'Checks if posted links are bad'
    ENABLED_DEFAULT = True
    SETTINGS = []

    def __init__(self):
        super().__init__()
        self.db_session = None
        self.links = {}

        self.blacklisted_links = []
        self.whitelisted_links = []

        self.cache = LinkCheckerCache()  # cache[url] = True means url is safe, False means the link is bad

        self.action_queue = ActionQueue()
        self.action_queue.start()

    def enable(self, bot):
        self.bot = bot
        if bot:
            bot.add_handler('on_message', self.on_message, priority=100)
            bot.add_handler('on_commit', self.on_commit)
            self.run_later = bot.execute_delayed

            if 'safebrowsingapi' in bot.config['main']:
                # XXX: This should be loaded as a setting instead.
                # Settings need a "password" type so values like this
                # aren't displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(
                    bot.config['main']['safebrowsingapi'], bot.nickname,
                    bot.version)
            else:
                self.safeBrowsingAPI = None

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)

    def disable(self, bot):
        if bot:
            bot.remove_handler('on_message', self.on_message)
            bot.remove_handler('on_commit', self.on_commit)

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
            self.blacklisted_links = []
            self.whitelisted_links = []

    def reload(self):

        log.info('Loaded {0} bad links and {1} good links'.format(
            len(self.blacklisted_links), len(self.whitelisted_links)))
        return self

    def on_message(self, source, message, emotes, whisper, urls):
        if not whisper and source.level < 500 and source.moderator is False:
            for url in urls:
                # Action which will be taken when a bad link is found
                action = Action(self.bot.timeout, args=[source.username, 20])
                # First we perform a basic check
                if self.simple_check(url, action) == self.RET_FURTHER_ANALYSIS:
                    # If the basic check returns no relevant data, we queue up a proper check on the URL
                    self.action_queue.add(self.check_url, args=[url, action])

    def on_commit(self):
        if self.db_session is not None:
            self.db_session.commit()

    def delete_from_cache(self, url):
        if url in self.cache:
            log.debug("LinkChecker: Removing url {0} from cache".format(url))
            del self.cache[url]

    def cache_url(self, url, safe):
        if url in self.cache and self.cache[url] == safe:
            return

        log.debug("LinkChecker: Caching url {0} as {1}".format(
            url, 'SAFE' if safe is True else 'UNSAFE'))
        self.cache[url] = safe
        self.run_later(20, self.delete_from_cache, (url, ))

    def counteract_bad_url(self,
                           url,
                           action=None,
                           want_to_cache=True,
                           want_to_blacklist=True):
        log.debug("LinkChecker: BAD URL FOUND {0}".format(url.url))
        if action:
            action.run()
        if want_to_cache:
            self.cache_url(url.url, False)
        if want_to_blacklist:
            self.blacklist_url(url.url, url.parsed)

    def unlist_url(self, url, list_type, parsed_url=None):
        """ list_type is either 'blacklist' or 'whitelist' """
        if not (url.startswith('http://') or url.startswith('https://')):
            url = 'http://' + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        domain = parsed_url.netloc
        path = parsed_url.path

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        if list_type == 'blacklist':
            link = self.db_session.query(BlacklistedLink).filter_by(
                domain=domain, path=path).one_or_none()
            if link:
                self.blacklisted_links.remove(link)
                self.db_session.delete(link)
            else:
                log.warning('Unable to unlist {0}{1}'.format(domain, path))
        elif list_type == 'whitelist':
            link = self.db_session.query(WhitelistedLink).filter_by(
                domain=domain, path=path).one_or_none()
            if link:
                self.whitelisted_links.remove(link)
                self.db_session.delete(link)
            else:
                log.warning('Unable to unlist {0}{1}'.format(domain, path))

    def blacklist_url(self, url, parsed_url=None, level=1):
        if not (url.lower().startswith('http://')
                or url.lower().startswith('https://')):
            url = 'http://' + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        if self.is_blacklisted(url, parsed_url):
            return False

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = BlacklistedLink(domain, path, level)
        self.db_session.add(link)
        self.blacklisted_links.append(link)
        return True

    def whitelist_url(self, url, parsed_url=None):
        if not (url.lower().startswith('http://')
                or url.lower().startswith('https://')):
            url = 'http://' + url
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        if self.is_whitelisted(url, parsed_url):
            return

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = WhitelistedLink(domain, path)
        self.db_session.add(link)
        self.whitelisted_links.append(link)

    def is_blacklisted(self, url, parsed_url=None, sublink=False):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.blacklisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    if not sublink:
                        return True
                    elif link.level >= 1:  # if it's a sublink, but the blacklisting level is 0, we don't consider it blacklisted
                        return True

        return False

    def is_whitelisted(self, url, parsed_url=None):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.whitelisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    return True

        return False

    RET_BAD_LINK = -1
    RET_FURTHER_ANALYSIS = 0
    RET_GOOD_LINK = 1

    def basic_check(self, url, action, sublink=False):
        """
        Check if the url is in the cache, or if it's blacklisted or whitelisted.
        Return values:
        1 = Link is OK
        -1 = Link is bad
        0 = Link needs further analysis
        """
        if url.url in self.cache:
            log.debug("LinkChecker: Url {0} found in cache".format(url.url))
            if not self.cache[url.url]:  # link is bad
                self.counteract_bad_url(url, action, False, False)
                return self.RET_BAD_LINK
            return self.RET_GOOD_LINK

        log.info('Checking if link is blacklisted...')
        if self.is_blacklisted(url.url, url.parsed, sublink):
            log.debug("LinkChecker: Url {0} is blacklisted".format(url.url))
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            return self.RET_BAD_LINK

        log.info('Checking if link is whitelisted...')
        if self.is_whitelisted(url.url, url.parsed):
            log.debug("LinkChecker: Url {0} allowed by the whitelist".format(
                url.url))
            self.cache_url(url.url, True)
            return self.RET_GOOD_LINK

        return self.RET_FURTHER_ANALYSIS

    def simple_check(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return self.RET_FURTHER_ANALYSIS

        return self.basic_check(url, action)

    def check_url(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return

        try:
            self._check_url(url, action)
        except:
            log.exception("LinkChecker unhanled exception while _check_url")

    def _check_url(self, url, action):
        log.debug("LinkChecker: Checking url {0}".format(url.url))

        # XXX: The basic check is currently performed twice on links found in messages; this should be deduplicated.
        res = self.basic_check(url, action)
        if res == self.RET_GOOD_LINK:
            return
        elif res == self.RET_BAD_LINK:
            return

        connection_timeout = 2
        read_timeout = 1
        try:
            r = requests.head(url.url,
                              allow_redirects=True,
                              timeout=connection_timeout)
        except:
            self.cache_url(url.url, True)
            return

        checkcontenttype = ('content-type' in r.headers
                            and r.headers['content-type']
                            == 'application/octet-stream')
        # the download header is Content-Disposition; its value can carry
        # parameters, e.g. 'attachment; filename="file.bin"'
        checkdispotype = ('content-disposition' in r.headers
                          and r.headers['content-disposition'].startswith('attachment'))

        if checkcontenttype or checkdispotype:  # triggering a download not allowed
            self.counteract_bad_url(url, action)
            return

        redirected_url = Url(r.url)
        if is_same_url(url, redirected_url) is False:
            res = self.basic_check(redirected_url, action)
            if res == self.RET_GOOD_LINK:
                return
            elif res == self.RET_BAD_LINK:
                return

        if self.safeBrowsingAPI:
            if self.safeBrowsingAPI.check_url(
                    redirected_url.url):  # harmful url detected
                log.debug("Bad url because google api")
                self.counteract_bad_url(url, action)
                self.counteract_bad_url(redirected_url)
                return

        if 'content-type' not in r.headers or not r.headers[
                'content-type'].startswith('text/html'):
            return  # can't analyze non-html content
        maximum_size = 1024 * 1024 * 10  # 10 MB
        receive_timeout = 3

        html = ''
        try:
            response = requests.get(url=url.url,
                                    stream=True,
                                    timeout=(connection_timeout, read_timeout))

            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > maximum_size:
                log.error('This file is too big!')
                return

            size = 0
            start = time.time()

            for chunk in response.iter_content(1024):
                if time.time() - start > receive_timeout:
                    log.error('The site took too long to load')
                    return

                size += len(chunk)
                if size > maximum_size:
                    log.error('This file is too big! (fake header)')
                    return
                # chunk is bytes; str(chunk) would produce literal "b'...'" text
                html += chunk.decode('utf-8', errors='ignore')

        except requests.exceptions.ConnectTimeout:
            log.warning('Connection timed out while checking {0}'.format(
                url.url))
            self.cache_url(url.url, True)
            return
        except requests.exceptions.ReadTimeout:
            log.warning('Reading timed out while checking {0}'.format(url.url))
            self.cache_url(url.url, True)
            return
        except:
            log.exception('Unhandled exception')
            return

        try:
            soup = BeautifulSoup(html, 'html.parser')
        except:
            return

        original_url = url
        original_redirected_url = redirected_url
        urls = []
        for link in soup.find_all(
                'a'):  # get a list of links to external sites
            url = link.get('href')
            if url is None:
                continue
            if url.startswith('//'):
                urls.append('http:' + url)
            elif url.startswith('http://') or url.startswith('https://'):
                urls.append(url)

        for url in urls:  # check if the site links to anything dangerous
            url = Url(url)

            if is_subdomain(url.parsed.netloc, original_url.parsed.netloc):
                # log.debug("Skipping because internal link")
                continue

            log.debug("Checking sublink {0}".format(url.url))
            res = self.basic_check(url, action, sublink=True)
            if res == self.RET_BAD_LINK:
                self.counteract_bad_url(url)
                self.counteract_bad_url(original_url, want_to_blacklist=False)
                self.counteract_bad_url(original_redirected_url,
                                        want_to_blacklist=False)
                return
            elif res == self.RET_GOOD_LINK:
                continue

            try:
                r = requests.head(url.url,
                                  allow_redirects=True,
                                  timeout=connection_timeout)
            except:
                continue

            redirected_url = Url(r.url)
            if not is_same_url(url, redirected_url):
                res = self.basic_check(redirected_url, action, sublink=True)
                if res == self.RET_BAD_LINK:
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(original_url,
                                            want_to_blacklist=False)
                    self.counteract_bad_url(original_redirected_url,
                                            want_to_blacklist=False)
                    return
                elif res == self.RET_GOOD_LINK:
                    continue

            if self.safeBrowsingAPI:
                if self.safeBrowsingAPI.check_url(
                        redirected_url.url):  # harmful url detected
                    log.debug("Evil sublink {0} by google API".format(url))
                    self.counteract_bad_url(original_url, action)
                    self.counteract_bad_url(original_redirected_url)
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(redirected_url)
                    return

        # if we got here, the site is clean for our standards
        self.cache_url(original_url.url, True)
        self.cache_url(original_redirected_url.url, True)
        return
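
    # Taken together, _check_url screens a link in stages: basic_check
    # consults the cache and the black/white lists; a HEAD request follows
    # redirects and rejects responses that would trigger a download; the
    # redirect target is re-checked; the Safe Browsing API (when configured)
    # is queried; finally the page body is fetched under size/time limits and
    # every external <a href> it contains is screened as a sublink before the
    # url is cached as safe.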

    def load_commands(self, **options):
        self.commands['add'] = Command.multiaction_command(
            level=100,
            delay_all=0,
            delay_user=0,
            default=None,
            command='add',
            commands={
                'link':
                Command.multiaction_command(
                    level=500,
                    delay_all=0,
                    delay_user=0,
                    default=None,
                    commands={
                        'blacklist':
                        Command.raw_command(
                            self.add_link_blacklist,
                            level=500,
                            description='Blacklist a link',
                            examples=[
                                CommandExample(
                                    None,
                                    'Add a link to the blacklist for shallow search',
                                    chat=
                                    'user:!add link blacklist 0 scamlink.lonk/\n'
                                    'bot>user:Successfully added your links',
                                    description=
                                    'Added the link scamlink.lonk/ to the blacklist for a shallow search'
                                ).parse(),
                                CommandExample(
                                    None,
                                    'Add a link to the blacklist for deep search',
                                    chat=
                                    'user:!add link blacklist 1 scamlink.lonk/\n'
                                    'bot>user:Successfully added your links',
                                    description=
                                    'Added the link scamlink.lonk/ to the blacklist for a deep search'
                                ).parse(),
                            ]),
                        'whitelist':
                        Command.raw_command(
                            self.add_link_whitelist,
                            level=500,
                            description='Whitelist a link',
                            examples=[
                                CommandExample(
                                    None,
                                    'Add a link to the whitelist',
                                    chat=
                                    'user:!add link whitelist safelink.lonk/\n'
                                    'bot>user:Successfully added your links',
                                    description=
                                    'Added the link safelink.lonk/ to the whitelist'
                                ).parse(),
                            ]),
                    })
            })

        self.commands['remove'] = Command.multiaction_command(
            level=100,
            delay_all=0,
            delay_user=0,
            default=None,
            command='remove',
            commands={
                'link':
                Command.multiaction_command(
                    level=500,
                    delay_all=0,
                    delay_user=0,
                    default=None,
                    commands={
                        'blacklist':
                        Command.raw_command(
                            self.remove_link_blacklist,
                            level=500,
                            description='Unblacklist a link',
                            examples=[
                                CommandExample(
                                    None,
                                    'Remove a blacklist link',
                                    chat=
                                    'user:!remove link blacklist scamtwitch.scam\n'
                                    'bot>user:Successfully removed your links',
                                    description=
                                    'Removes scamtwitch.scam as a blacklisted link'
                                ).parse(),
                            ]),
                        'whitelist':
                        Command.raw_command(
                            self.remove_link_whitelist,
                            level=500,
                            description='Unwhitelist a link',
                            examples=[
                                CommandExample(
                                    None,
                                    'Remove a whitelist link',
                                    chat=
                                    'user:!remove link whitelist twitch.safe\n'
                                    'bot>user:Successfully removed your links',
                                    description=
                                    'Removes twitch.safe as a whitelisted link'
                                ).parse(),
                            ]),
                    }),
            })

    def add_link_blacklist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            if not parts[0].isnumeric():
                for link in parts:
                    self.blacklist_url(link)
            else:
                for link in parts[1:]:
                    self.blacklist_url(link, level=int(parts[0]))
        except:
            log.exception('Unhandled exception in add_link_blacklist')
            bot.whisper(source.username,
                        'Some error occurred while adding your links')
            return False

        bot.whisper(source.username, 'Successfully added your links')

    def add_link_whitelist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.whitelist_url(link)
        except:
            log.exception('Unhandled exception in add_link_whitelist')
            bot.whisper(source.username,
                        'Some error occurred while adding your links')
            return False

        bot.whisper(source.username, 'Successfully added your links')

    def remove_link_blacklist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.unlist_url(link, 'blacklist')
        except:
            log.exception('Unhandled exception in remove_link_blacklist')
            bot.whisper(source.username,
                        'Some error occurred while removing your links')
            return False

        bot.whisper(source.username, 'Successfully removed your links')

    def remove_link_whitelist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.unlist_url(link, 'whitelist')
        except:
            log.exception('Unhandled exception in remove_link_whitelist')
            bot.whisper(source.username,
                        'Some error occurred while removing your links')
            return False

        bot.whisper(source.username, 'Successfully removed your links')
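
A note on the blacklist levels used in this example: blacklist_url stores a level on each BlacklistedLink, and is_blacklisted only honours an entry for a sublink (a link found inside a checked page) when its level is at least 1; a directly posted link matches at any level. Below is a minimal, self-contained sketch of that rule, using a hypothetical FakeLink stand-in for BlacklistedLink with guessed is_subdomain/is_subpath semantics:

class FakeLink:
    """Hypothetical stand-in for BlacklistedLink."""

    def __init__(self, domain, path, level):
        self.domain, self.path, self.level = domain, path, level

    def is_subdomain(self, domain):
        # the stored domain itself or any subdomain of it (assumed behaviour)
        return domain == self.domain or domain.endswith('.' + self.domain)

    def is_subpath(self, path):
        return path.startswith(self.path)


def is_blacklisted(links, domain, path, sublink=False):
    for link in links:
        if link.is_subdomain(domain) and link.is_subpath(path):
            # a level-0 entry only blocks direct posts, not sublinks
            if not sublink or link.level >= 1:
                return True
    return False


links = [FakeLink('scamlink.lonk', '/', level=0)]
print(is_blacklisted(links, 'scamlink.lonk', '/'))                # True
print(is_blacklisted(links, 'scamlink.lonk', '/', sublink=True))  # False
links[0].level = 1
print(is_blacklisted(links, 'scamlink.lonk', '/', sublink=True))  # True

In other words, a level-0 ("shallow") entry only punishes posting the link itself, while a level-1 ("deep") entry also condemns pages that merely link to it.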
Example #6
0
class LinkCheckerModule(BaseModule):

    ID = __name__.split('.')[-1]
    NAME = 'Link Checker'
    DESCRIPTION = 'Checks links if they\'re bad'
    ENABLED_DEFAULT = True
    CATEGORY = 'Filter'
    SETTINGS = [
            ModuleSetting(
                key='ban_pleb_links',
                label='Disallow links from non-subscribers',
                type='boolean',
                required=True,
                default=False)
            ]

    def __init__(self):
        super().__init__()
        self.db_session = None
        self.links = {}

        self.blacklisted_links = []
        self.whitelisted_links = []

        self.cache = LinkCheckerCache()  # cache[url] = True means url is safe, False means the link is bad

        self.action_queue = ActionQueue()
        self.action_queue.start()

    def enable(self, bot):
        self.bot = bot
        pajbot.managers.handler.HandlerManager.add_handler('on_message', self.on_message, priority=100)
        pajbot.managers.handler.HandlerManager.add_handler('on_commit', self.on_commit)
        if bot:
            self.run_later = bot.execute_delayed

            if 'safebrowsingapi' in bot.config['main']:
                # XXX: This should be loaded as a setting instead.
                # There needs to be a setting for settings to have them as "passwords"
                # so they're not displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(bot.config['main']['safebrowsingapi'], bot.nickname, bot.version)
            else:
                self.safeBrowsingAPI = None

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)

    def disable(self, bot):
        pajbot.managers.handler.HandlerManager.remove_handler('on_message', self.on_message)
        pajbot.managers.handler.HandlerManager.remove_handler('on_commit', self.on_commit)

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
            self.blacklisted_links = []
            self.whitelisted_links = []

    def reload(self):
        log.info('Loaded {0} bad links and {1} good links'.format(len(self.blacklisted_links), len(self.whitelisted_links)))
        return self

    super_whitelist = ['pajlada.se', 'pajlada.com', 'forsen.tv', 'pajbot.com']

    def on_message(self, source, message, emotes, whisper, urls, event):
        if not whisper and source.level < 500 and source.moderator is False:
            if self.settings['ban_pleb_links'] is True and source.subscriber is False and len(urls) > 0:
                # Check if the links are in our super-whitelist, e.g. on the pajlada.se domain or forsen.tv
                for url in urls:
                    parsed_url = Url(url)
                    if len(parsed_url.parsed.netloc.split('.')) < 2:
                        continue
                    whitelisted = False
                    for whitelist in self.super_whitelist:
                        if is_subdomain(parsed_url.parsed.netloc, whitelist):
                            whitelisted = True
                            break
                    if whitelisted is False:
                        self.bot.timeout(source.username, 30, reason='Non-subs cannot post links')
                        if source.minutes_in_chat_online > 60:
                            self.bot.whisper(source.username, 'You cannot post non-verified links in chat if you\'re not a subscriber.')
                        return False

            for url in urls:
                # Action which will be taken when a bad link is found
                action = Action(self.bot.timeout, args=[source.username, 20], kwargs={'reason': 'Banned link'})
                # First we perform a basic check
                if self.simple_check(url, action) == self.RET_FURTHER_ANALYSIS:
                    # If the basic check returns no relevant data, we queue up a proper check on the URL
                    self.action_queue.add(self.check_url, args=[url, action])
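
        # Action here appears to be a deferred-call wrapper: it captures
        # self.bot.timeout plus its args/kwargs at construction time and only
        # invokes them when a later check calls action.run() (see
        # counteract_bad_url below), so the timeout fires only once the link
        # has actually been judged bad.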

    def on_commit(self):
        if self.db_session is not None:
            self.db_session.commit()

    def delete_from_cache(self, url):
        if url in self.cache:
            log.debug('LinkChecker: Removing url {0} from cache'.format(url))
            del self.cache[url]

    def cache_url(self, url, safe):
        if url in self.cache and self.cache[url] == safe:
            return

        log.debug('LinkChecker: Caching url {0} as {1}'.format(url, 'SAFE' if safe is True else 'UNSAFE'))
        self.cache[url] = safe
        self.run_later(20, self.delete_from_cache, (url, ))

    def counteract_bad_url(self, url, action=None, want_to_cache=True, want_to_blacklist=False):
        log.debug('LinkChecker: BAD URL FOUND {0}'.format(url.url))
        if action:
            action.run()
        if want_to_cache:
            self.cache_url(url.url, False)
        if want_to_blacklist:
            self.blacklist_url(url.url, url.parsed)
            return True

    def blacklist_url(self, url, parsed_url=None, level=0):
        if not (url.lower().startswith('http://') or url.lower().startswith('https://')):
            url = 'http://' + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        if self.is_blacklisted(url, parsed_url):
            return False

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = BlacklistedLink(domain, path, level)
        self.db_session.add(link)
        self.blacklisted_links.append(link)
        self.db_session.commit()

    def whitelist_url(self, url, parsed_url=None):
        if not (url.lower().startswith('http://') or url.lower().startswith('https://')):
            url = 'http://' + url
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        if self.is_whitelisted(url, parsed_url):
            return

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = WhitelistedLink(domain, path)
        self.db_session.add(link)
        self.whitelisted_links.append(link)
        self.db_session.commit()

    def is_blacklisted(self, url, parsed_url=None, sublink=False):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.blacklisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    if not sublink:
                        return True
                    elif link.level >= 1:  # if it's a sublink, but the blacklisting level is 0, we don't consider it blacklisted
                        return True

        return False

    def is_whitelisted(self, url, parsed_url=None):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.whitelisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    return True

        return False

    RET_BAD_LINK = -1
    RET_FURTHER_ANALYSIS = 0
    RET_GOOD_LINK = 1

    def basic_check(self, url, action, sublink=False):
        """
        Check if the url is in the cache, or if it's blacklisted or whitelisted.
        Return values:
        1 = Link is OK
        -1 = Link is bad
        0 = Link needs further analysis
        """
        if url.url in self.cache:
            log.debug('LinkChecker: Url {0} found in cache'.format(url.url))
            if not self.cache[url.url]:  # link is bad
                self.counteract_bad_url(url, action, False, False)
                return self.RET_BAD_LINK
            return self.RET_GOOD_LINK

        log.info('Checking if link is blacklisted...')
        if self.is_blacklisted(url.url, url.parsed, sublink):
            log.debug('LinkChecker: Url {0} is blacklisted'.format(url.url))
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            return self.RET_BAD_LINK

        log.info('Checking if link is whitelisted...')
        if self.is_whitelisted(url.url, url.parsed):
            log.debug('LinkChecker: Url {0} allowed by the whitelist'.format(url.url))
            self.cache_url(url.url, True)
            return self.RET_GOOD_LINK

        return self.RET_FURTHER_ANALYSIS

    def simple_check(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return self.RET_FURTHER_ANALYSIS

        return self.basic_check(url, action)

    def check_url(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return

        try:
            self._check_url(url, action)
        except:
            log.exception('LinkChecker: unhandled exception in _check_url')

    def _check_url(self, url, action):
        log.debug('LinkChecker: Checking url {0}'.format(url.url))

        # XXX: The basic check is currently performed twice on links found in messages; this should be deduplicated.
        res = self.basic_check(url, action)
        if res == self.RET_GOOD_LINK:
            return
        elif res == self.RET_BAD_LINK:
            return

        connection_timeout = 2
        read_timeout = 1
        try:
            r = requests.head(url.url, allow_redirects=True, timeout=connection_timeout)
        except:
            self.cache_url(url.url, True)
            return

        checkcontenttype = ('content-type' in r.headers and r.headers['content-type'] == 'application/octet-stream')
        # the download header is Content-Disposition; its value can carry parameters, e.g. 'attachment; filename="file.bin"'
        checkdispotype = ('content-disposition' in r.headers and r.headers['content-disposition'].startswith('attachment'))

        if checkcontenttype or checkdispotype:  # triggering a download not allowed
            self.counteract_bad_url(url, action)
            return

        redirected_url = Url(r.url)
        if is_same_url(url, redirected_url) is False:
            res = self.basic_check(redirected_url, action)
            if res == self.RET_GOOD_LINK:
                return
            elif res == self.RET_BAD_LINK:
                return

        if self.safeBrowsingAPI:
            if self.safeBrowsingAPI.check_url(redirected_url.url):  # harmful url detected
                log.debug('Bad url according to the Google Safe Browsing API')
                self.counteract_bad_url(url, action, want_to_blacklist=False)
                self.counteract_bad_url(redirected_url, want_to_blacklist=False)
                return

        if 'content-type' not in r.headers or not r.headers['content-type'].startswith('text/html'):
            return  # can't analyze non-html content
        maximum_size = 1024 * 1024 * 10  # 10 MB
        receive_timeout = 3

        html = ''
        try:
            response = requests.get(url=url.url, stream=True, timeout=(connection_timeout, read_timeout))

            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > maximum_size:
                log.error('This file is too big!')
                return

            size = 0
            start = time.time()

            for chunk in response.iter_content(1024):
                if time.time() - start > receive_timeout:
                    log.error('The site took too long to load')
                    return

                size += len(chunk)
                if size > maximum_size:
                    log.error('This file is too big! (fake header)')
                    return
                # chunk is bytes; str(chunk) would produce literal "b'...'" text
                html += chunk.decode('utf-8', errors='ignore')

        except requests.exceptions.ConnectTimeout:
            log.warning('Connection timed out while checking {0}'.format(url.url))
            self.cache_url(url.url, True)
            return
        except requests.exceptions.ReadTimeout:
            log.warning('Reading timed out while checking {0}'.format(url.url))
            self.cache_url(url.url, True)
            return
        except:
            log.exception('Unhandled exception')
            return

        try:
            soup = BeautifulSoup(html, 'html.parser')
        except:
            return

        original_url = url
        original_redirected_url = redirected_url
        urls = []
        for link in soup.find_all('a'):  # get a list of links to external sites
            url = link.get('href')
            if url is None:
                continue
            if url.startswith('//'):
                urls.append('http:' + url)
            elif url.startswith('http://') or url.startswith('https://'):
                urls.append(url)

        for url in urls:  # check if the site links to anything dangerous
            url = Url(url)

            if is_subdomain(url.parsed.netloc, original_url.parsed.netloc):
                # log.debug('Skipping because internal link')
                continue

            log.debug('Checking sublink {0}'.format(url.url))
            res = self.basic_check(url, action, sublink=True)
            if res == self.RET_BAD_LINK:
                self.counteract_bad_url(url)
                self.counteract_bad_url(original_url, want_to_blacklist=False)
                self.counteract_bad_url(original_redirected_url, want_to_blacklist=False)
                return
            elif res == self.RET_GOOD_LINK:
                continue

            try:
                r = requests.head(url.url, allow_redirects=True, timeout=connection_timeout)
            except:
                continue

            redirected_url = Url(r.url)
            if not is_same_url(url, redirected_url):
                res = self.basic_check(redirected_url, action, sublink=True)
                if res == self.RET_BAD_LINK:
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(original_url, want_to_blacklist=False)
                    self.counteract_bad_url(original_redirected_url, want_to_blacklist=False)
                    return
                elif res == self.RET_GOOD_LINK:
                    continue

            if self.safeBrowsingAPI:
                if self.safeBrowsingAPI.check_url(redirected_url.url):  # harmful url detected
                    log.debug('Evil sublink {0} by google API'.format(url))
                    self.counteract_bad_url(original_url, action)
                    self.counteract_bad_url(original_redirected_url)
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(redirected_url)
                    return

        # if we got here, the site is clean for our standards
        self.cache_url(original_url.url, True)
        self.cache_url(original_redirected_url.url, True)
        return

    def load_commands(self, **options):
        self.commands['add'] = pajbot.models.command.Command.multiaction_command(
                level=100,
                delay_all=0,
                delay_user=0,
                default=None,
                command='add',
                commands={
                    'link': pajbot.models.command.Command.multiaction_command(
                        level=500,
                        delay_all=0,
                        delay_user=0,
                        default=None,
                        commands={
                            'blacklist': pajbot.models.command.Command.raw_command(self.add_link_blacklist,
                                level=500,
                                delay_all=0,
                                delay_user=0,
                                description='Blacklist a link',
                                examples=[
                                    pajbot.models.command.CommandExample(None, 'Add a link to the blacklist for a shallow search',
                                        chat='user:!add link blacklist --shallow scamlink.lonk/\n'
                                        'bot>user:Successfully added your links',
                                        description='Added the link scamlink.lonk/ to the blacklist for a shallow search').parse(),
                                    pajbot.models.command.CommandExample(None, 'Add a link to the blacklist for a deep search',
                                        chat='user:!add link blacklist --deep scamlink.lonk/\n'
                                        'bot>user:Successfully added your links',
                                        description='Added the link scamlink.lonk/ to the blacklist for a deep search').parse(),
                                    ]),
                            'whitelist': pajbot.models.command.Command.raw_command(self.add_link_whitelist,
                                level=500,
                                delay_all=0,
                                delay_user=0,
                                description='Whitelist a link',
                                examples=[
                                    pajbot.models.command.CommandExample(None, 'Add a link to the whitelist',
                                        chat='user:!add link whitelist safelink.lonk/\n'
                                        'bot>user:Successfully added your links',
                                        description='Added the link safelink.lonk/ to the whitelist').parse(),
                                    ]),
                            }
                        )
                    }
                )

        self.commands['remove'] = pajbot.models.command.Command.multiaction_command(
                level=100,
                delay_all=0,
                delay_user=0,
                default=None,
                command='remove',
                commands={
                    'link': pajbot.models.command.Command.multiaction_command(
                        level=500,
                        delay_all=0,
                        delay_user=0,
                        default=None,
                        commands={
                            'blacklist': pajbot.models.command.Command.raw_command(self.remove_link_blacklist,
                                level=500,
                                delay_all=0,
                                delay_user=0,
                                description='Remove a link from the blacklist.',
                                examples=[
                                    pajbot.models.command.CommandExample(None, 'Remove a link from the blacklist.',
                                        chat='user:!remove link blacklist 20\n'
                                        'bot>user:Successfully removed blacklisted link with id 20',
                                        description='Remove a link from the blacklist with an ID').parse(),
                                    ]),
                            'whitelist': pajbot.models.command.Command.raw_command(self.remove_link_whitelist,
                                level=500,
                                delay_all=0,
                                delay_user=0,
                                description='Remove a link from the whitelist.',
                                examples=[
                                    pajbot.models.command.CommandExample(None, 'Remove a link from the whitelist.',
                                        chat='user:!remove link whitelist 12\n'
                                        'bot>user:Successfully removed whitelisted link with id 12',
                                        description='Remove a link from the whitelist with an ID').parse(),
                                    ]),
                            }
                        ),
                    }
                )

    def add_link_blacklist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        options, new_links = self.parse_link_blacklist_arguments(message)

        if new_links:
            parts = new_links.split(' ')
            try:
                for link in parts:
                    if len(link) > 1:
                        self.blacklist_url(link, **options)
                        AdminLogManager.post('Blacklist link added', source, link)
                bot.whisper(source.username, 'Successfully added your links')
                return True
            except:
                log.exception('Unhandled exception in add_link_blacklist')
                bot.whisper(source.username, 'Some error occurred while adding your links')
                return False
        else:
            bot.whisper(source.username, 'Usage: !add link blacklist LINK')
            return False

    def add_link_whitelist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.whitelist_url(link)
                AdminLogManager.post('Whitelist link added', source, link)
        except:
            log.exception('Unhandled exception in add_link_whitelist')
            bot.whisper(source.username, 'Some error occurred while adding your links')
            return False

        bot.whisper(source.username, 'Successfully added your links')

    def remove_link_blacklist(self, **options):
        message = options['message']
        bot = options['bot']
        source = options['source']

        if message:
            id = None
            try:
                id = int(message)
            except ValueError:
                pass

            link = self.db_session.query(BlacklistedLink).filter_by(id=id).one_or_none()

            if link:
                self.blacklisted_links.remove(link)
                self.db_session.delete(link)
                self.db_session.commit()
            else:
                bot.whisper(source.username, 'No link with the given id found')
                return False

            AdminLogManager.post('Blacklist link removed', source, link.domain)
            bot.whisper(source.username, 'Successfully removed blacklisted link with id {0}'.format(link.id))
        else:
            bot.whisper(source.username, 'Usage: !remove link blacklist ID')
            return False

    def remove_link_whitelist(self, **options):
        message = options['message']
        bot = options['bot']
        source = options['source']

        if message:
            id = None
            try:
                id = int(message)
            except ValueError:
                pass

            link = self.db_session.query(WhitelistedLink).filter_by(id=id).one_or_none()

            if link:
                self.whitelisted_links.remove(link)
                self.db_session.delete(link)
                self.db_session.commit()
            else:
                bot.whisper(source.username, 'No link with the given id found')
                return False

            AdminLogManager.post('Whitelist link removed', source, link.domain)
            bot.whisper(source.username, 'Successfully removed whitelisted link with id {0}'.format(link.id))
        else:
            bot.whisper(source.username, 'Usage: !remove link whitelist ID')
            return False

    def parse_link_blacklist_arguments(self, message):
        parser = argparse.ArgumentParser()
        parser.add_argument('--deep', dest='level', action='store_true')
        parser.add_argument('--shallow', dest='level', action='store_false')
        parser.set_defaults(level=False)

        try:
            args, unknown = parser.parse_known_args(message.split())
        except SystemExit:
            return False, False
        except:
            log.exception('Unhandled exception in add_link_blacklist')
            return False, False

        # Strip options of any values that are set as None
        options = {k: v for k, v in vars(args).items() if v is not None}
        response = ' '.join(unknown)

        return options, response
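
One subtlety in parse_link_blacklist_arguments above: --deep and --shallow both write to the same level destination via store_true/store_false, so options['level'] always comes out as a boolean (default False) that add_link_blacklist forwards to blacklist_url(link, **options); since True >= 1 in Python, --deep effectively creates a level-1 ("deep") entry. A self-contained sketch of just the parsing step, using only the standard-library argparse the module already relies on:

import argparse


def parse_link_blacklist_arguments(message):
    parser = argparse.ArgumentParser()
    parser.add_argument('--deep', dest='level', action='store_true')
    parser.add_argument('--shallow', dest='level', action='store_false')
    parser.set_defaults(level=False)

    # any token that is not a recognised flag is treated as a link
    args, unknown = parser.parse_known_args(message.split())
    return vars(args), ' '.join(unknown)


print(parse_link_blacklist_arguments('--deep scamlink.lonk/'))
# ({'level': True}, 'scamlink.lonk/')
print(parse_link_blacklist_arguments('scamlink.lonk/ other.lonk/'))
# ({'level': False}, 'scamlink.lonk/ other.lonk/')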
Example #7
0
class LinkCheckerModule(BaseModule):

    ID = __name__.split(".")[-1]
    NAME = "Link Checker"
    DESCRIPTION = "Checks links if they're bad"
    ENABLED_DEFAULT = True
    CATEGORY = "Filter"
    SETTINGS = [
        ModuleSetting(
            key="ban_pleb_links",
            label="Disallow links from non-subscribers",
            type="boolean",
            required=True,
            default=False,
        ),
        ModuleSetting(key="ban_sub_links",
                      label="Disallow links from subscribers",
                      type="boolean",
                      required=True,
                      default=False),
        ModuleSetting(
            key="timeout_length",
            label="Timeout length",
            type="number",
            required=True,
            placeholder="Timeout length in seconds",
            default=60,
            constraints={
                "min_value": 1,
                "max_value": 3600
            },
        ),
    ]

    def __init__(self, bot):
        super().__init__(bot)
        self.db_session = None
        self.links = {}

        self.blacklisted_links = []
        self.whitelisted_links = []

        self.cache = LinkCheckerCache(
        )  # cache[url] = True means url is safe, False means the link is bad

        self.action_queue = ActionQueue()
        self.action_queue.start()

    def enable(self, bot):
        HandlerManager.add_handler("on_message", self.on_message, priority=100)
        HandlerManager.add_handler("on_commit", self.on_commit)
        if bot:
            self.run_later = bot.execute_delayed

            if "safebrowsingapi" in bot.config["main"]:
                # XXX: This should be loaded as a setting instead.
                # There needs to be a setting for settings to have them as "passwords"
                # so they're not displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(
                    bot.config["main"]["safebrowsingapi"], bot.nickname,
                    bot.version)
            else:
                self.safeBrowsingAPI = None

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)

    def disable(self, bot):
        pajbot.managers.handler.HandlerManager.remove_handler(
            "on_message", self.on_message)
        pajbot.managers.handler.HandlerManager.remove_handler(
            "on_commit", self.on_commit)

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
            self.blacklisted_links = []
            self.whitelisted_links = []

    def reload(self):
        log.info("Loaded {0} bad links and {1} good links".format(
            len(self.blacklisted_links), len(self.whitelisted_links)))
        return self

    super_whitelist = ["pajlada.se", "pajlada.com", "forsen.tv", "pajbot.com"]

    def on_message(self, source, whisper, urls, **rest):
        if whisper:
            return

        if source.level >= 500 or source.moderator is True:
            return

        if len(urls) > 0:
            do_timeout = False
            ban_reason = "You are not allowed to post links in chat"
            whisper_reason = "??? KKona"

            if self.settings[
                    "ban_pleb_links"] is True and source.subscriber is False:
                do_timeout = True
                whisper_reason = "You cannot post non-verified links in chat if you're not a subscriber."
            elif self.settings[
                    "ban_sub_links"] is True and source.subscriber is True:
                do_timeout = True
                whisper_reason = "You cannot post non-verified links in chat."

            if do_timeout is True:
                # Check if the links are in our super-whitelist, e.g. on the pajlada.se domain or forsen.tv
                for url in urls:
                    parsed_url = Url(url)
                    if len(parsed_url.parsed.netloc.split(".")) < 2:
                        continue
                    whitelisted = False
                    for whitelist in self.super_whitelist:
                        if is_subdomain(parsed_url.parsed.netloc, whitelist):
                            whitelisted = True
                            break
                    if whitelisted is False:
                        self.bot.timeout(source.username,
                                         30,
                                         reason=ban_reason)
                        if source.minutes_in_chat_online > 60:
                            self.bot.whisper(source.username, whisper_reason)
                        return False

        for url in urls:
            # Action which will be taken when a bad link is found
            action = Action(
                self.bot.timeout,
                args=[source.username, self.settings["timeout_length"]],
                kwargs={"reason": "Banned link"},
            )
            # First we perform a basic check
            if self.simple_check(url, action) == self.RET_FURTHER_ANALYSIS:
                # If the basic check returns no relevant data, we queue up a proper check on the URL
                self.action_queue.add(self.check_url, args=[url, action])

    def on_commit(self, **rest):
        if self.db_session is not None:
            self.db_session.commit()

    def delete_from_cache(self, url):
        if url in self.cache:
            log.debug("LinkChecker: Removing url {0} from cache".format(url))
            del self.cache[url]

    def cache_url(self, url, safe):
        if url in self.cache and self.cache[url] == safe:
            return

        log.debug("LinkChecker: Caching url {0} as {1}".format(
            url, "SAFE" if safe is True else "UNSAFE"))
        self.cache[url] = safe
        self.run_later(20, self.delete_from_cache, (url, ))

    def counteract_bad_url(self,
                           url,
                           action=None,
                           want_to_cache=True,
                           want_to_blacklist=False):
        log.debug("LinkChecker: BAD URL FOUND {0}".format(url.url))
        if action:
            action.run()
        if want_to_cache:
            self.cache_url(url.url, False)
        if want_to_blacklist:
            self.blacklist_url(url.url, url.parsed)
            return True

    def blacklist_url(self, url, parsed_url=None, level=0):
        if not (url.lower().startswith("http://")
                or url.lower().startswith("https://")):
            url = "http://" + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        if self.is_blacklisted(url, parsed_url):
            return False

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith("www."):
            domain = domain[4:]
        if path.endswith("/"):
            path = path[:-1]
        if path == "":
            path = "/"

        link = BlacklistedLink(domain, path, level)
        self.db_session.add(link)
        self.blacklisted_links.append(link)
        self.db_session.commit()

    def whitelist_url(self, url, parsed_url=None):
        if not (url.lower().startswith("http://")
                or url.lower().startswith("https://")):
            url = "http://" + url
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        if self.is_whitelisted(url, parsed_url):
            return

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith("www."):
            domain = domain[4:]
        if path.endswith("/"):
            path = path[:-1]
        if path == "":
            path = "/"

        link = WhitelistedLink(domain, path)
        self.db_session.add(link)
        self.whitelisted_links.append(link)
        self.db_session.commit()

    def is_blacklisted(self, url, parsed_url=None, sublink=False):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == "":
            path = "/"

        domain_split = domain.split(".")
        if len(domain_split) < 2:
            return False

        for link in self.blacklisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    if not sublink:
                        return True
                    elif (
                            link.level >= 1
                    ):  # if it's a sublink, but the blacklisting level is 0, we don't consider it blacklisted
                        return True

        return False

    def is_whitelisted(self, url, parsed_url=None):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == "":
            path = "/"

        domain_split = domain.split(".")
        if len(domain_split) < 2:
            return False

        for link in self.whitelisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    return True

        return False

    RET_BAD_LINK = -1
    RET_FURTHER_ANALYSIS = 0
    RET_GOOD_LINK = 1

    def basic_check(self, url, action, sublink=False):
        """
        Check if the url is in the cache, or if it's blacklisted or whitelisted.
        Return values:
        1 = Link is OK
        -1 = Link is bad
        0 = Link needs further analysis
        """
        if url.url in self.cache:
            log.debug("LinkChecker: Url {0} found in cache".format(url.url))
            if not self.cache[url.url]:  # link is bad
                self.counteract_bad_url(url, action, False, False)
                return self.RET_BAD_LINK
            return self.RET_GOOD_LINK

        log.info("Checking if link is blacklisted...")
        if self.is_blacklisted(url.url, url.parsed, sublink):
            log.debug("LinkChecker: Url {0} is blacklisted".format(url.url))
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            return self.RET_BAD_LINK

        log.info("Checking if link is whitelisted...")
        if self.is_whitelisted(url.url, url.parsed):
            log.debug("LinkChecker: Url {0} allowed by the whitelist".format(
                url.url))
            self.cache_url(url.url, True)
            return self.RET_GOOD_LINK

        return self.RET_FURTHER_ANALYSIS

    def simple_check(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split(".")) < 2:
            # The URL is broken, ignore it
            return self.RET_FURTHER_ANALYSIS

        return self.basic_check(url, action)

    def check_url(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split(".")) < 2:
            # The URL is broken, ignore it
            return

        try:
            self._check_url(url, action)
        except:
            log.exception("LinkChecker unhanled exception while _check_url")

    def _check_url(self, url, action):
        log.debug("LinkChecker: Checking url {0}".format(url.url))

        # XXX: The basic check is currently performed twice on links found in messages; this should be deduplicated.
        res = self.basic_check(url, action)
        if res == self.RET_GOOD_LINK:
            return
        elif res == self.RET_BAD_LINK:
            return

        connection_timeout = 2
        read_timeout = 1
        try:
            r = requests.head(url.url,
                              allow_redirects=True,
                              timeout=connection_timeout)
        except:
            self.cache_url(url.url, True)
            return

        checkcontenttype = "content-type" in r.headers and r.headers[
            "content-type"] == "application/octet-stream"
        checkdispotype = "disposition-type" in r.headers and r.headers[
            "disposition-type"] == "attachment"

        if checkcontenttype or checkdispotype:  # triggering a download not allowed
            self.counteract_bad_url(url, action)
            return

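        # Re-run the cache/blacklist/whitelist checks against the final URL in
        # case a redirect landed on a known-bad domain.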
        redirected_url = Url(r.url)
        if not is_same_url(url, redirected_url):
            res = self.basic_check(redirected_url, action)
            if res != self.RET_FURTHER_ANALYSIS:
                return

        if self.safeBrowsingAPI and self.safeBrowsingAPI.check_url(
                redirected_url.url):  # harmful url detected
            log.debug("Bad URL flagged by the Google Safe Browsing API")
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            self.counteract_bad_url(redirected_url,
                                    want_to_blacklist=False)
            return

        if "content-type" not in r.headers or not r.headers[
                "content-type"].startswith("text/html"):
            return  # can't analyze non-html content
        maximum_size = 1024 * 1024 * 10  # 10 MB
        receive_timeout = 3
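        # Stream the body so we can abort early when the page is too large or
        # loads too slowly, even if the Content-Length header lies.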

        html = ""
        try:
            response = requests.get(url=url.url,
                                    stream=True,
                                    timeout=(connection_timeout, read_timeout))

            content_length = response.headers.get("Content-Length")
            if content_length and int(
                    response.headers.get("Content-Length")) > maximum_size:
                log.error("This file is too big!")
                return

            size = 0
            start = pajbot.utils.now().timestamp()

            for chunk in response.iter_content(1024):
                if pajbot.utils.now().timestamp() - start > receive_timeout:
                    log.error("The site took too long to load")
                    return

                size += len(chunk)
                if size > maximum_size:
                    log.error("This file is too big! (fake header)")
                    return
                # chunks arrive as bytes; decode before appending
                html += chunk.decode("utf-8", errors="ignore")

        except requests.exceptions.ConnectTimeout:
            log.warning("Connection timed out while checking {0}".format(
                url.url))
            self.cache_url(url.url, True)
            return
        except requests.exceptions.ReadTimeout:
            log.warning("Reading timed out while checking {0}".format(url.url))
            self.cache_url(url.url, True)
            return
        except Exception:
            log.exception("Unhandled exception")
            return

        try:
            soup = BeautifulSoup(html, "html.parser")
        except Exception:
            return

        original_url = url
        original_redirected_url = redirected_url
        urls = []
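        # Scheme-relative ("//host/path") links are normalized to http;
        # everything else (relative paths, anchors, other schemes) is skipped.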
        for link in soup.find_all(
                "a"):  # get a list of links to external sites
            url = link.get("href")
            if url is None:
                continue
            if url.startswith("//"):
                urls.append("http:" + url)
            elif url.startswith("http://") or url.startswith("https://"):
                urls.append(url)

        for url in urls:  # check if the site links to anything dangerous
            url = Url(url)

            if is_subdomain(url.parsed.netloc, original_url.parsed.netloc):
                # log.debug('Skipping because internal link')
                continue

            log.debug("Checking sublink {0}".format(url.url))
            res = self.basic_check(url, action, sublink=True)
            if res == self.RET_BAD_LINK:
                self.counteract_bad_url(url)
                self.counteract_bad_url(original_url, want_to_blacklist=False)
                self.counteract_bad_url(original_redirected_url,
                                        want_to_blacklist=False)
                return
            elif res == self.RET_GOOD_LINK:
                continue

            try:
                r = requests.head(url.url,
                                  allow_redirects=True,
                                  timeout=connection_timeout)
            except requests.exceptions.RequestException:
                continue

            redirected_url = Url(r.url)
            if not is_same_url(url, redirected_url):
                res = self.basic_check(redirected_url, action, sublink=True)
                if res == self.RET_BAD_LINK:
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(original_url,
                                            want_to_blacklist=False)
                    self.counteract_bad_url(original_redirected_url,
                                            want_to_blacklist=False)
                    return
                elif res == self.RET_GOOD_LINK:
                    continue

            if self.safeBrowsingAPI and self.safeBrowsingAPI.check_url(
                    redirected_url.url):  # harmful url detected
                log.debug(
                    "Evil sublink {0} flagged by the Google Safe Browsing API".
                    format(url.url))
                self.counteract_bad_url(original_url, action)
                self.counteract_bad_url(original_redirected_url)
                self.counteract_bad_url(url)
                self.counteract_bad_url(redirected_url)
                return

        # if we got here, the site is clean for our standards
        self.cache_url(original_url.url, True)
        self.cache_url(original_redirected_url.url, True)
        return

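    # The command tree below wires the chat commands to the handlers above,
    # e.g. "!add link blacklist --deep scamlink.lonk/" -> add_link_blacklist().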
    def load_commands(self, **options):
        self.commands["add"] = Command.multiaction_command(
            level=100,
            delay_all=0,
            delay_user=0,
            default=None,
            command="add",
            commands={
                "link":
                Command.multiaction_command(
                    level=500,
                    delay_all=0,
                    delay_user=0,
                    default=None,
                    commands={
                        "blacklist":
                        Command.raw_command(
                            self.add_link_blacklist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description="Blacklist a link",
                            examples=[
                                CommandExample(
                                    None,
                                    "Add a link to the blacklist for a shallow search",
                                    chat=
                                    "user:!add link blacklist --shallow scamlink.lonk/\n"
                                    "bot>user:Successfully added your links",
                                    description=
                                    "Added the link scamlink.lonk/ to the blacklist for a shallow search",
                                ).parse(),
                                CommandExample(
                                    None,
                                    "Add a link to the blacklist for a deep search",
                                    chat=
                                    "user:!add link blacklist --deep scamlink.lonk/\n"
                                    "bot>user:Successfully added your links",
                                    description=
                                    "Added the link scamlink.lonk/ to the blacklist for a deep search",
                                ).parse(),
                            ],
                        ),
                        "whitelist":
                        Command.raw_command(
                            self.add_link_whitelist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description="Whitelist a link",
                            examples=[
                                CommandExample(
                                    None,
                                    "Add a link to the whitelist",
                                    chat=
                                    "user:!add link whitelink safelink.lonk/\n"
                                    "bot>user:Successfully added your links",
                                    description=
                                    "Added the link safelink.lonk/ to the whitelist",
                                ).parse()
                            ],
                        ),
                    },
                )
            },
        )

        self.commands["remove"] = Command.multiaction_command(
            level=100,
            delay_all=0,
            delay_user=0,
            default=None,
            command="remove",
            commands={
                "link":
                Command.multiaction_command(
                    level=500,
                    delay_all=0,
                    delay_user=0,
                    default=None,
                    commands={
                        "blacklist":
                        Command.raw_command(
                            self.remove_link_blacklist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description="Remove a link from the blacklist.",
                            examples=[
                                CommandExample(
                                    None,
                                    "Remove a link from the blacklist.",
                                    chat="user:!remove link blacklist 20\n"
                                    "bot>user:Successfully removed blacklisted link with id 20",
                                    description=
                                    "Remove a link from the blacklist with an ID",
                                ).parse()
                            ],
                        ),
                        "whitelist":
                        Command.raw_command(
                            self.remove_link_whitelist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description="Remove a link from the whitelist.",
                            examples=[
                                CommandExample(
                                    None,
                                    "Remove a link from the whitelist.",
                                    chat="user:!remove link whitelist 12\n"
                                    "bot>user:Successfully removed blacklisted link with id 12",
                                    description=
                                    "Remove a link from the whitelist with an ID",
                                ).parse()
                            ],
                        ),
                    },
                )
            },
        )

    def add_link_blacklist(self, **options):
        bot = options["bot"]
        message = options["message"]
        source = options["source"]

        options, new_links = self.parse_link_blacklist_arguments(message)

        if new_links:
            parts = new_links.split(" ")
            try:
                for link in parts:
                    if len(link) > 1:
                        self.blacklist_url(link, **options)
                        AdminLogManager.post("Blacklist link added", source,
                                             link)
                bot.whisper(source.username, "Successfully added your links")
                return True
            except Exception:
                log.exception("Unhandled exception in add_link_blacklist")
                bot.whisper(source.username,
                            "Some error occurred while adding your links")
                return False
        else:
            bot.whisper(source.username, "Usage: !add link blacklist LINK")
            return False

    def add_link_whitelist(self, **options):
        bot = options["bot"]
        message = options["message"]
        source = options["source"]

        parts = message.split(" ")
        try:
            for link in parts:
                self.whitelist_url(link)
                AdminLogManager.post("Whitelist link added", source, link)
        except Exception:
            log.exception("Unhandled exception in add_link_whitelist")
            bot.whisper(source.username,
                        "Some error occurred while adding your links")
            return False

        bot.whisper(source.username, "Successfully added your links")

    def remove_link_blacklist(self, **options):
        message = options["message"]
        bot = options["bot"]
        source = options["source"]

        if message:
            link_id = None
            try:
                link_id = int(message)
            except ValueError:
                pass

            link = self.db_session.query(BlacklistedLink).filter_by(
                id=link_id).one_or_none()

            if link:
                # capture fields now; the instance may be expired after commit
                domain = link.domain
                self.blacklisted_links.remove(link)
                self.db_session.delete(link)
                self.db_session.commit()
            else:
                bot.whisper(source.username, "No link with the given id found")
                return False

            AdminLogManager.post("Blacklist link removed", source, domain)
            bot.whisper(
                source.username,
                "Successfully removed blacklisted link with id {0}".format(
                    link_id))
            return True
        else:
            bot.whisper(source.username, "Usage: !remove link blacklist ID")
            return False

    def remove_link_whitelist(self, **options):
        message = options["message"]
        bot = options["bot"]
        source = options["source"]

        if message:
            link_id = None
            try:
                link_id = int(message)
            except ValueError:
                pass

            link = self.db_session.query(WhitelistedLink).filter_by(
                id=link_id).one_or_none()

            if link:
                # capture fields now; the instance may be expired after commit
                domain = link.domain
                self.whitelisted_links.remove(link)
                self.db_session.delete(link)
                self.db_session.commit()
            else:
                bot.whisper(source.username, "No link with the given id found")
                return False

            AdminLogManager.post("Whitelist link removed", source, domain)
            bot.whisper(
                source.username,
                "Successfully removed whitelisted link with id {0}".format(
                    link_id))
            return True
        else:
            bot.whisper(source.username, "Usage: !remove link whitelist ID")
            return False

    @staticmethod
    def parse_link_blacklist_arguments(message):
        parser = argparse.ArgumentParser()
        parser.add_argument("--deep", dest="level", action="store_true")
        parser.add_argument("--shallow", dest="level", action="store_false")
        parser.set_defaults(level=False)

        try:
            args, unknown = parser.parse_known_args(message.split())
        except SystemExit:
            return False, False
        except Exception:
            log.exception(
                "Unhandled exception in parse_link_blacklist_arguments")
            return False, False

        # Strip options of any values that are set as None
        options = {k: v for k, v in vars(args).items() if v is not None}
        response = " ".join(unknown)

        return options, response
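    # A hypothetical call, for illustration:
    #   parse_link_blacklist_arguments("--deep scamlink.lonk/ other.lonk/")
    #   -> ({"level": True}, "scamlink.lonk/ other.lonk/")
    # where level=True corresponds to the --deep flag.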