Example #1
0
    def enable(self, bot):
        self.bot = bot
        pajbot.managers.handler.HandlerManager.add_handler('on_message',
                                                           self.on_message,
                                                           priority=100)
        pajbot.managers.handler.HandlerManager.add_handler(
            'on_commit', self.on_commit)
        if bot:
            self.run_later = bot.execute_delayed

            if 'safebrowsingapi' in bot.config['main']:
                # XXX: This should be loaded as a setting instead.
                # Settings need a "password" type so values like this
                # aren't displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(
                    bot.config['main']['safebrowsingapi'], bot.nickname,
                    bot.version)
            else:
                self.safeBrowsingAPI = None

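        # Flush and discard any previous session, then reload both link lists
        # from the database so the in-memory state matches what is persisted.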
        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)
Example #2
0
    def enable(self, bot):
        self.bot = bot
        if bot:
            bot.add_handler('on_message', self.on_message, priority=100)
            bot.add_handler('on_commit', self.on_commit)
            self.run_later = bot.execute_delayed

            if 'safebrowsingapi' in bot.config['main']:
                # XXX: This should be loaded as a setting instead.
                # Settings need a "password" type so values like this
                # aren't displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(bot.config['main']['safebrowsingapi'], bot.nickname, bot.version)
            else:
                self.safeBrowsingAPI = None

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)
Example #3
0
class LinkCheckerModule(BaseModule):

    ID = __name__.split('.')[-1]
    NAME = 'Link Checker'
    DESCRIPTION = 'Checks if posted links are bad'
    ENABLED_DEFAULT = True
    SETTINGS = []

    def __init__(self):
        super().__init__()
        self.db_session = None
        self.links = {}

        self.blacklisted_links = []
        self.whitelisted_links = []

        self.cache = LinkCheckerCache()  # cache[url] = True means url is safe, False means the link is bad

        self.action_queue = ActionQueue()
        self.action_queue.start()

    def enable(self, bot):
        self.bot = bot
        if bot:
            bot.add_handler('on_message', self.on_message, priority=100)
            bot.add_handler('on_commit', self.on_commit)
            self.run_later = bot.execute_delayed

            if 'safebrowsingapi' in bot.config['main']:
                # XXX: This should be loaded as a setting instead.
                # Settings need a "password" type so values like this
                # aren't displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(bot.config['main']['safebrowsingapi'], bot.nickname, bot.version)
            else:
                self.safeBrowsingAPI = None

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)

    def disable(self, bot):
        if bot:
            bot.remove_handler('on_message', self.on_message)
            bot.remove_handler('on_commit', self.on_commit)

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
            self.blacklisted_links = []
            self.whitelisted_links = []

    def reload(self):

        log.info('Loaded {0} bad links and {1} good links'.format(len(self.blacklisted_links), len(self.whitelisted_links)))
        return self

    def on_message(self, source, message, emotes, whisper, urls):
        if not whisper and source.level < 500 and source.moderator is False:
            for url in urls:
                # Action which will be taken when a bad link is found
                action = Action(self.bot.timeout, args=[source.username, 20])
                # First we perform a basic check
                if self.simple_check(url, action) == self.RET_FURTHER_ANALYSIS:
                    # If the basic check returns no relevant data, we queue up a proper check on the URL
                    self.action_queue.add(self.check_url, args=[url, action])

    def on_commit(self):
        if self.db_session is not None:
            self.db_session.commit()

    def delete_from_cache(self, url):
        if url in self.cache:
            log.debug("LinkChecker: Removing url {0} from cache".format(url))
            del self.cache[url]

    def cache_url(self, url, safe):
        if url in self.cache and self.cache[url] == safe:
            return

        log.debug("LinkChecker: Caching url {0} as {1}".format(url, 'SAFE' if safe is True else 'UNSAFE'))
        self.cache[url] = safe
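        # Cached verdicts are short-lived: drop the entry again after 20 seconds.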
        self.run_later(20, self.delete_from_cache, (url, ))

    def counteract_bad_url(self, url, action=None, want_to_cache=True, want_to_blacklist=True):
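        # Run the punishment action, cache the URL as unsafe, and optionally
        # persist it to the blacklist.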
        log.debug("LinkChecker: BAD URL FOUND {0}".format(url.url))
        if action:
            action.run()
        if want_to_cache:
            self.cache_url(url.url, False)
        if want_to_blacklist:
            self.blacklist_url(url.url, url.parsed)

    def unlist_url(self, url, list_type, parsed_url=None):
        """ list_type is either 'blacklist' or 'whitelist' """
        if not (url.startswith('http://') or url.startswith('https://')):
            url = 'http://' + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        domain = parsed_url.netloc
        path = parsed_url.path

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        if list_type == 'blacklist':
            link = self.db_session.query(BlacklistedLink).filter_by(domain=domain, path=path).one_or_none()
            if link:
                self.blacklisted_links.remove(link)
                self.db_session.delete(link)
            else:
                log.warning('Unable to unlist {0}{1}'.format(domain, path))
        elif list_type == 'whitelist':
            link = self.db_session.query(WhitelistedLink).filter_by(domain=domain, path=path).one_or_none()
            if link:
                self.whitelisted_links.remove(link)
                self.db_session.delete(link)
            else:
                log.warning('Unable to unlist {0}{1}'.format(domain, path))

    def blacklist_url(self, url, parsed_url=None, level=1):
        if not (url.lower().startswith('http://') or url.lower().startswith('https://')):
            url = 'http://' + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        if self.is_blacklisted(url, parsed_url):
            return False

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

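        # Normalize: drop a leading 'www.' and a trailing slash so stored
        # entries match however the link was typed; an empty path becomes '/'.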
        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = BlacklistedLink(domain, path, level)
        self.db_session.add(link)
        self.blacklisted_links.append(link)
        return True

    def whitelist_url(self, url, parsed_url=None):
        if not (url.lower().startswith('http://') or url.lower().startswith('https://')):
            url = 'http://' + url
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        if self.is_whitelisted(url, parsed_url):
            return

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = WhitelistedLink(domain, path)
        self.db_session.add(link)
        self.whitelisted_links.append(link)

    def is_blacklisted(self, url, parsed_url=None, sublink=False):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

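        # A netloc without at least one dot cannot be a real domain; ignore it.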
        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.blacklisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    if not sublink:
                        return True
                    elif link.level >= 1:  # if it's a sublink, but the blacklisting level is 0, we don't consider it blacklisted
                        return True

        return False

    def is_whitelisted(self, url, parsed_url=None):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.whitelisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    return True

        return False

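    # Verdicts shared by basic_check, simple_check and check_url.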
    RET_BAD_LINK = -1
    RET_FURTHER_ANALYSIS = 0
    RET_GOOD_LINK = 1

    def basic_check(self, url, action, sublink=False):
        """
        Check whether the url is in the cache, blacklisted, or whitelisted.
        Return values:
        1 = Link is OK
        -1 = Link is bad
        0 = Link needs further analysis
        """
        if url.url in self.cache:
            log.debug("LinkChecker: Url {0} found in cache".format(url.url))
            if not self.cache[url.url]:  # link is bad
                self.counteract_bad_url(url, action, False, False)
                return self.RET_BAD_LINK
            return self.RET_GOOD_LINK

        log.info('Checking if link is blacklisted...')
        if self.is_blacklisted(url.url, url.parsed, sublink):
            log.debug("LinkChecker: Url {0} is blacklisted".format(url.url))
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            return self.RET_BAD_LINK

        log.info('Checking if link is whitelisted...')
        if self.is_whitelisted(url.url, url.parsed):
            log.debug("LinkChecker: Url {0} allowed by the whitelist".format(url.url))
            self.cache_url(url.url, True)
            return self.RET_GOOD_LINK

        return self.RET_FURTHER_ANALYSIS

    def simple_check(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return self.RET_FURTHER_ANALYSIS

        return self.basic_check(url, action)

    def check_url(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return

        try:
            self._check_url(url, action)
        except:
            log.exception("LinkChecker unhanled exception while _check_url")

    def _check_url(self, url, action):
        log.debug("LinkChecker: Checking url {0}".format(url.url))

        # XXX: The basic check is currently performed twice on links found in messages. Solve
        res = self.basic_check(url, action)
        if res == self.RET_GOOD_LINK:
            return
        elif res == self.RET_BAD_LINK:
            return

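        # HEAD follows redirects without downloading the body; if the request
        # fails, the link gets the benefit of the doubt and is cached as safe.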
        connection_timeout = 2
        read_timeout = 1
        try:
            r = requests.head(url.url, allow_redirects=True, timeout=connection_timeout)
        except:
            self.cache_url(url.url, True)
            return

        checkcontenttype = ('content-type' in r.headers and r.headers['content-type'] == 'application/octet-stream')
        # the standard header is 'content-disposition'; a value starting with
        # 'attachment' means following the link would trigger a file download
        checkdispotype = ('content-disposition' in r.headers and r.headers['content-disposition'].startswith('attachment'))

        if checkcontenttype or checkdispotype:  # triggering a download is not allowed
            self.counteract_bad_url(url, action)
            return

        redirected_url = Url(r.url)
        if is_same_url(url, redirected_url) is False:
            res = self.basic_check(redirected_url, action)
            if res == self.RET_GOOD_LINK:
                return
            elif res == self.RET_BAD_LINK:
                return

        if self.safeBrowsingAPI:
            if self.safeBrowsingAPI.check_url(redirected_url.url):  # harmful url detected
                log.debug("Bad url because google api")
                self.counteract_bad_url(url, action)
                self.counteract_bad_url(redirected_url)
                return

        if 'content-type' not in r.headers or not r.headers['content-type'].startswith('text/html'):
            return  # can't analyze non-html content
        maximum_size = 1024 * 1024 * 10  # 10 MB
        receive_timeout = 3

        html = ''
        try:
            response = requests.get(url=url.url, stream=True, timeout=(connection_timeout, read_timeout))

            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > maximum_size:
                log.error('This file is too big!')
                return

            size = 0
            start = time.time()

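            # Stream the body in 1 KB chunks so we can abort early on slow
            # responders and on bodies larger than the header advertised.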
            for chunk in response.iter_content(1024):
                if time.time() - start > receive_timeout:
                    log.error('The site took too long to load')
                    return

                size += len(chunk)
                if size > maximum_size:
                    log.error('This file is too big! (fake header)')
                    return
                html += chunk.decode('utf-8', errors='ignore')  # chunks arrive as bytes; decode before appending

        except requests.exceptions.ConnectTimeout:
            log.warning('Connection timed out while checking {0}'.format(url.url))
            self.cache_url(url.url, True)
            return
        except requests.exceptions.ReadTimeout:
            log.warning('Reading timed out while checking {0}'.format(url.url))
            self.cache_url(url.url, True)
            return
        except:
            log.exception('Unhandled exception')
            return

        try:
            soup = BeautifulSoup(html, 'html.parser')
        except:
            return

        original_url = url
        original_redirected_url = redirected_url
        urls = []
        for link in soup.find_all('a'):  # get a list of links to external sites
            url = link.get('href')
            if url is None:
                continue
            if url.startswith('//'):
                urls.append('http:' + url)
            elif url.startswith('http://') or url.startswith('https://'):
                urls.append(url)

        for url in urls:  # check if the site links to anything dangerous
            url = Url(url)

            if is_subdomain(url.parsed.netloc, original_url.parsed.netloc):
                # log.debug("Skipping because internal link")
                continue

            log.debug("Checking sublink {0}".format(url.url))
            res = self.basic_check(url, action, sublink=True)
            if res == self.RET_BAD_LINK:
                self.counteract_bad_url(url)
                self.counteract_bad_url(original_url, want_to_blacklist=False)
                self.counteract_bad_url(original_redirected_url, want_to_blacklist=False)
                return
            elif res == self.RET_GOOD_LINK:
                continue

            try:
                r = requests.head(url.url, allow_redirects=True, timeout=connection_timeout)
            except:
                continue

            redirected_url = Url(r.url)
            if not is_same_url(url, redirected_url):
                res = self.basic_check(redirected_url, action, sublink=True)
                if res == self.RET_BAD_LINK:
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(original_url, want_to_blacklist=False)
                    self.counteract_bad_url(original_redirected_url, want_to_blacklist=False)
                    return
                elif res == self.RET_GOOD_LINK:
                    continue

            if self.safeBrowsingAPI:
                if self.safeBrowsingAPI.check_url(redirected_url.url):  # harmful url detected
                    log.debug("Evil sublink {0} by google API".format(url))
                    self.counteract_bad_url(original_url, action)
                    self.counteract_bad_url(original_redirected_url)
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(redirected_url)
                    return

        # if we got here, the site is clean for our standards
        self.cache_url(original_url.url, True)
        self.cache_url(original_redirected_url.url, True)
        return

    def load_commands(self, **options):
        self.commands['add'] = Command.multiaction_command(
                level=100,
                delay_all=0,
                delay_user=0,
                default=None,
                command='add',
                commands={
                    'link': Command.multiaction_command(
                        level=500,
                        delay_all=0,
                        delay_user=0,
                        default=None,
                        commands={
                            'blacklist': Command.raw_command(self.add_link_blacklist,
                                level=500,
                                description='Blacklist a link',
                                examples=[
                                    CommandExample(None, 'Add a link to the blacklist for shallow search',
                                        chat='user:!add link blacklist 0 scamlink.lonk/\n'
                                        'bot>user:Successfully added your links',
                                        description='Added the link scamlink.lonk/ to the blacklist for a shallow search').parse(),
                                    CommandExample(None, 'Add a link to the blacklist for deep search',
                                        chat='user:!add link blacklist 1 scamlink.lonk/\n'
                                        'bot>user:Successfully added your links',
                                        description='Added the link scamlink.lonk/ to the blacklist for a deep search').parse(),
                                    ]),
                            'whitelist': Command.raw_command(self.add_link_whitelist,
                                level=500,
                                description='Whitelist a link',
                                examples=[
                                    CommandExample(None, 'Add a link to the whitelist',
                                        chat='user:!add link whitelist safelink.lonk/\n'
                                        'bot>user:Successfully added your links',
                                        description='Added the link safelink.lonk/ to the whitelist').parse(),
                                    ]),
                            }
                        )
                    }
                )

        self.commands['remove'] = Command.multiaction_command(
                level=100,
                delay_all=0,
                delay_user=0,
                default=None,
                command='remove',
                commands={
                    'link': Command.multiaction_command(
                        level=500,
                        delay_all=0,
                        delay_user=0,
                        default=None,
                        commands={
                            'blacklist': Command.raw_command(self.remove_link_blacklist,
                                level=500,
                                description='Unblacklist a link',
                                examples=[
                                    CommandExample(None, 'Remove a blacklist link',
                                        chat='user:!remove link blacklist scamtwitch.scam\n'
                                        'bot>user:Successfully removed your links',
                                        description='Removes scamtwitch.scam as a blacklisted link').parse(),
                                    ]),
                            'whitelist': Command.raw_command(self.remove_link_whitelist,
                                level=500,
                                description='Unwhitelist a link',
                                examples=[
                                    CommandExample(None, 'Remove a whitelist link',
                                        chat='user:!remove link whitelist twitch.safe\n'
                                        'bot>user:Successfully removed your links',
                                        description='Removes twitch.safe as a whitelisted link').parse(),
                                    ]),
                            }
                        ),
                    }
                )

    def add_link_blacklist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            if not parts[0].isnumeric():
                for link in parts:
                    self.blacklist_url(link)
            else:
                for link in parts[1:]:
                    self.blacklist_url(link, level=int(parts[0]))
        except:
            log.exception('Unhandled exception in add_link_blacklist')
            bot.whisper(source.username, 'Some error occurred while adding your links')
            return False

        bot.whisper(source.username, 'Successfully added your links')

    def add_link_whitelist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.whitelist_url(link)
        except:
            log.exception('Unhandled exception in add_link_whitelist')
            bot.whisper(source.username, 'Some error occurred while adding your links')
            return False

        bot.whisper(source.username, 'Successfully added your links')

    def remove_link_blacklist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.unlist_url(link, 'blacklist')
        except:
            log.exception('Unhandled exception in remove_link_blacklist')
            bot.whisper(source.username, 'Some error occurred while removing your links')
            return False

        bot.whisper(source.username, 'Successfully removed your links')

    def remove_link_whitelist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.unlist_url(link, 'whitelist')
        except:
            log.exception('Unhandled exception in remove_link_whitelist')
            bot.whisper(source.username, 'Some error occurred while removing your links')
            return False

        bot.whisper(source.username, 'Successfully removed your links')
Example #4
0
class LinkCheckerModule(BaseModule):

    ID = __name__.split('.')[-1]
    NAME = 'Link Checker'
    DESCRIPTION = 'Checks if posted links are bad'
    ENABLED_DEFAULT = True
    CATEGORY = 'Filter'
    SETTINGS = [
        ModuleSetting(key='ban_pleb_links',
                      label='Disallow links from non-subscribers',
                      type='boolean',
                      required=True,
                      default=False)
    ]

    def __init__(self):
        super().__init__()
        self.db_session = None
        self.links = {}

        self.blacklisted_links = []
        self.whitelisted_links = []

        self.cache = LinkCheckerCache()  # cache[url] = True means url is safe, False means the link is bad

        self.action_queue = ActionQueue()
        self.action_queue.start()

    def enable(self, bot):
        self.bot = bot
        pajbot.managers.handler.HandlerManager.add_handler('on_message',
                                                           self.on_message,
                                                           priority=100)
        pajbot.managers.handler.HandlerManager.add_handler(
            'on_commit', self.on_commit)
        if bot:
            self.run_later = bot.execute_delayed

            if 'safebrowsingapi' in bot.config['main']:
                # XXX: This should be loaded as a setting instead.
                # Settings need a "password" type so values like this
                # aren't displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(
                    bot.config['main']['safebrowsingapi'], bot.nickname,
                    bot.version)
            else:
                self.safeBrowsingAPI = None

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)

    def disable(self, bot):
        pajbot.managers.handler.HandlerManager.remove_handler(
            'on_message', self.on_message)
        pajbot.managers.handler.HandlerManager.remove_handler(
            'on_commit', self.on_commit)

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
            self.blacklisted_links = []
            self.whitelisted_links = []

    def reload(self):

        log.info('Loaded {0} bad links and {1} good links'.format(
            len(self.blacklisted_links), len(self.whitelisted_links)))
        return self

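    # Domains non-subscribers may always link, even with ban_pleb_links enabled.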
    super_whitelist = ['pajlada.se', 'pajlada.com', 'forsen.tv', 'pajbot.com']

    def on_message(self, source, message, emotes, whisper, urls, event):
        if not whisper and source.level < 500 and source.moderator is False:
            if self.settings['ban_pleb_links'] is True and source.subscriber is False and len(urls) > 0:
                # Check if the links are in our super-whitelist, e.g. on the pajlada.se domain or forsen.tv
                for url in urls:
                    parsed_url = Url(url)
                    if len(parsed_url.parsed.netloc.split('.')) < 2:
                        continue
                    whitelisted = False
                    for whitelist in self.super_whitelist:
                        if is_subdomain(parsed_url.parsed.netloc, whitelist):
                            whitelisted = True
                            break
                    if whitelisted is False:
                        self.bot.timeout(source.username,
                                         30,
                                         reason='Non-subs cannot post links')
                        if source.minutes_in_chat_online > 60:
                            self.bot.whisper(
                                source.username,
                                'You cannot post non-verified links in chat if you\'re not a subscriber.'
                            )
                        return False

            for url in urls:
                # Action which will be taken when a bad link is found
                action = Action(self.bot.timeout,
                                args=[source.username, 20],
                                kwargs={'reason': 'Banned link'})
                # First we perform a basic check
                if self.simple_check(url, action) == self.RET_FURTHER_ANALYSIS:
                    # If the basic check returns no relevant data, we queue up a proper check on the URL
                    self.action_queue.add(self.check_url, args=[url, action])

    def on_commit(self):
        if self.db_session is not None:
            self.db_session.commit()

    def delete_from_cache(self, url):
        if url in self.cache:
            log.debug('LinkChecker: Removing url {0} from cache'.format(url))
            del self.cache[url]

    def cache_url(self, url, safe):
        if url in self.cache and self.cache[url] == safe:
            return

        log.debug('LinkChecker: Caching url {0} as {1}'.format(
            url, 'SAFE' if safe is True else 'UNSAFE'))
        self.cache[url] = safe
        self.run_later(20, self.delete_from_cache, (url, ))

    def counteract_bad_url(self,
                           url,
                           action=None,
                           want_to_cache=True,
                           want_to_blacklist=False):
        log.debug('LinkChecker: BAD URL FOUND {0}'.format(url.url))
        if action:
            action.run()
        if want_to_cache:
            self.cache_url(url.url, False)
        if want_to_blacklist:
            self.blacklist_url(url.url, url.parsed)
            return True

    def blacklist_url(self, url, parsed_url=None, level=0):
        if not (url.lower().startswith('http://')
                or url.lower().startswith('https://')):
            url = 'http://' + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        if self.is_blacklisted(url, parsed_url):
            return False

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = BlacklistedLink(domain, path, level)
        self.db_session.add(link)
        self.blacklisted_links.append(link)
        self.db_session.commit()

    def whitelist_url(self, url, parsed_url=None):
        if not (url.lower().startswith('http://')
                or url.lower().startswith('https://')):
            url = 'http://' + url
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        if self.is_whitelisted(url, parsed_url):
            return

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = WhitelistedLink(domain, path)
        self.db_session.add(link)
        self.whitelisted_links.append(link)
        self.db_session.commit()

    def is_blacklisted(self, url, parsed_url=None, sublink=False):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.blacklisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    if not sublink:
                        return True
                    elif link.level >= 1:  # if it's a sublink, but the blacklisting level is 0, we don't consider it blacklisted
                        return True

        return False

    def is_whitelisted(self, url, parsed_url=None):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.whitelisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    return True

        return False

    RET_BAD_LINK = -1
    RET_FURTHER_ANALYSIS = 0
    RET_GOOD_LINK = 1

    def basic_check(self, url, action, sublink=False):
        """
        Check whether the url is in the cache, blacklisted, or whitelisted.
        Return values:
        1 = Link is OK
        -1 = Link is bad
        0 = Link needs further analysis
        """
        if url.url in self.cache:
            log.debug('LinkChecker: Url {0} found in cache'.format(url.url))
            if not self.cache[url.url]:  # link is bad
                self.counteract_bad_url(url, action, False, False)
                return self.RET_BAD_LINK
            return self.RET_GOOD_LINK

        log.info('Checking if link is blacklisted...')
        if self.is_blacklisted(url.url, url.parsed, sublink):
            log.debug('LinkChecker: Url {0} is blacklisted'.format(url.url))
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            return self.RET_BAD_LINK

        log.info('Checking if link is whitelisted...')
        if self.is_whitelisted(url.url, url.parsed):
            log.debug('LinkChecker: Url {0} allowed by the whitelist'.format(
                url.url))
            self.cache_url(url.url, True)
            return self.RET_GOOD_LINK

        return self.RET_FURTHER_ANALYSIS

    def simple_check(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return self.RET_FURTHER_ANALYSIS

        return self.basic_check(url, action)

    def check_url(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return

        try:
            self._check_url(url, action)
        except:
            log.exception('LinkChecker: unhandled exception in _check_url')

    def _check_url(self, url, action):
        log.debug('LinkChecker: Checking url {0}'.format(url.url))

        # XXX: The basic check is currently performed twice on links found in messages. Solve
        res = self.basic_check(url, action)
        if res == self.RET_GOOD_LINK:
            return
        elif res == self.RET_BAD_LINK:
            return

        connection_timeout = 2
        read_timeout = 1
        try:
            r = requests.head(url.url,
                              allow_redirects=True,
                              timeout=connection_timeout)
        except:
            self.cache_url(url.url, True)
            return

        checkcontenttype = ('content-type' in r.headers and r.headers['content-type'] == 'application/octet-stream')
        # the standard header is 'content-disposition'; a value starting with
        # 'attachment' means following the link would trigger a file download
        checkdispotype = ('content-disposition' in r.headers and r.headers['content-disposition'].startswith('attachment'))

        if checkcontenttype or checkdispotype:  # triggering a download is not allowed
            self.counteract_bad_url(url, action)
            return

        redirected_url = Url(r.url)
        if is_same_url(url, redirected_url) is False:
            res = self.basic_check(redirected_url, action)
            if res == self.RET_GOOD_LINK:
                return
            elif res == self.RET_BAD_LINK:
                return

        if self.safeBrowsingAPI:
            if self.safeBrowsingAPI.check_url(
                    redirected_url.url):  # harmful url detected
                log.debug('Bad url according to the Google Safe Browsing API')
                self.counteract_bad_url(url, action, want_to_blacklist=False)
                self.counteract_bad_url(redirected_url,
                                        want_to_blacklist=False)
                return

        if 'content-type' not in r.headers or not r.headers['content-type'].startswith('text/html'):
            return  # can't analyze non-html content
        maximum_size = 1024 * 1024 * 10  # 10 MB
        receive_timeout = 3

        html = ''
        try:
            response = requests.get(url=url.url,
                                    stream=True,
                                    timeout=(connection_timeout, read_timeout))

            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > maximum_size:
                log.error('This file is too big!')
                return

            size = 0
            start = time.time()

            for chunk in response.iter_content(1024):
                if time.time() - start > receive_timeout:
                    log.error('The site took too long to load')
                    return

                size += len(chunk)
                if size > maximum_size:
                    log.error('This file is too big! (fake header)')
                    return
                html += chunk.decode('utf-8', errors='ignore')  # chunks arrive as bytes; decode before appending

        except requests.exceptions.ConnectTimeout:
            log.warning('Connection timed out while checking {0}'.format(
                url.url))
            self.cache_url(url.url, True)
            return
        except requests.exceptions.ReadTimeout:
            log.warning('Reading timed out while checking {0}'.format(url.url))
            self.cache_url(url.url, True)
            return
        except:
            log.exception('Unhandled exception')
            return

        try:
            soup = BeautifulSoup(html, 'html.parser')
        except:
            return

        original_url = url
        original_redirected_url = redirected_url
        urls = []
        for link in soup.find_all('a'):  # get a list of links to external sites
            url = link.get('href')
            if url is None:
                continue
            if url.startswith('//'):
                urls.append('http:' + url)
            elif url.startswith('http://') or url.startswith('https://'):
                urls.append(url)

        for url in urls:  # check if the site links to anything dangerous
            url = Url(url)

            if is_subdomain(url.parsed.netloc, original_url.parsed.netloc):
                # log.debug('Skipping because internal link')
                continue

            log.debug('Checking sublink {0}'.format(url.url))
            res = self.basic_check(url, action, sublink=True)
            if res == self.RET_BAD_LINK:
                self.counteract_bad_url(url)
                self.counteract_bad_url(original_url, want_to_blacklist=False)
                self.counteract_bad_url(original_redirected_url,
                                        want_to_blacklist=False)
                return
            elif res == self.RET_GOOD_LINK:
                continue

            try:
                r = requests.head(url.url,
                                  allow_redirects=True,
                                  timeout=connection_timeout)
            except:
                continue

            redirected_url = Url(r.url)
            if not is_same_url(url, redirected_url):
                res = self.basic_check(redirected_url, action, sublink=True)
                if res == self.RET_BAD_LINK:
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(original_url,
                                            want_to_blacklist=False)
                    self.counteract_bad_url(original_redirected_url,
                                            want_to_blacklist=False)
                    return
                elif res == self.RET_GOOD_LINK:
                    continue

            if self.safeBrowsingAPI:
                if self.safeBrowsingAPI.check_url(
                        redirected_url.url):  # harmful url detected
                    log.debug('Evil sublink {0} flagged by the Google Safe Browsing API'.format(url.url))
                    self.counteract_bad_url(original_url, action)
                    self.counteract_bad_url(original_redirected_url)
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(redirected_url)
                    return

        # if we got here, the site is clean for our standards
        self.cache_url(original_url.url, True)
        self.cache_url(original_redirected_url.url, True)
        return

    def load_commands(self, **options):
        self.commands['add'] = pajbot.models.command.Command.multiaction_command(
            level=100,
            delay_all=0,
            delay_user=0,
            default=None,
            command='add',
            commands={
                'link': pajbot.models.command.Command.multiaction_command(
                    level=500,
                    delay_all=0,
                    delay_user=0,
                    default=None,
                    commands={
                        'blacklist': pajbot.models.command.Command.raw_command(
                            self.add_link_blacklist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description='Blacklist a link',
                            examples=[
                                pajbot.models.command.CommandExample(
                                    None,
                                    'Add a link to the blacklist for a shallow search',
                                    chat='user:!add link blacklist --shallow scamlink.lonk/\n'
                                    'bot>user:Successfully added your links',
                                    description='Added the link scamlink.lonk/ to the blacklist for a shallow search').parse(),
                                pajbot.models.command.CommandExample(
                                    None,
                                    'Add a link to the blacklist for a deep search',
                                    chat='user:!add link blacklist --deep scamlink.lonk/\n'
                                    'bot>user:Successfully added your links',
                                    description='Added the link scamlink.lonk/ to the blacklist for a deep search').parse(),
                            ]),
                        'whitelist': pajbot.models.command.Command.raw_command(
                            self.add_link_whitelist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description='Whitelist a link',
                            examples=[
                                pajbot.models.command.CommandExample(
                                    None,
                                    'Add a link to the whitelist',
                                    chat='user:!add link whitelist safelink.lonk/\n'
                                    'bot>user:Successfully added your links',
                                    description='Added the link safelink.lonk/ to the whitelist').parse(),
                            ]),
                    }),
            })

        self.commands['remove'] = pajbot.models.command.Command.multiaction_command(
            level=100,
            delay_all=0,
            delay_user=0,
            default=None,
            command='remove',
            commands={
                'link': pajbot.models.command.Command.multiaction_command(
                    level=500,
                    delay_all=0,
                    delay_user=0,
                    default=None,
                    commands={
                        'blacklist': pajbot.models.command.Command.raw_command(
                            self.remove_link_blacklist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description='Remove a link from the blacklist.',
                            examples=[
                                pajbot.models.command.CommandExample(
                                    None,
                                    'Remove a link from the blacklist.',
                                    chat='user:!remove link blacklist 20\n'
                                    'bot>user:Successfully removed blacklisted link with id 20',
                                    description='Remove a link from the blacklist with an ID').parse(),
                            ]),
                        'whitelist': pajbot.models.command.Command.raw_command(
                            self.remove_link_whitelist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description='Remove a link from the whitelist.',
                            examples=[
                                pajbot.models.command.CommandExample(
                                    None,
                                    'Remove a link from the whitelist.',
                                    chat='user:!remove link whitelist 12\n'
                                    'bot>user:Successfully removed whitelisted link with id 12',
                                    description='Remove a link from the whitelist with an ID').parse(),
                            ]),
                    }),
            })

    def add_link_blacklist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        options, new_links = self.parse_link_blacklist_arguments(message)

        if new_links:
            parts = new_links.split(' ')
            try:
                for link in parts:
                    if len(link) > 1:
                        self.blacklist_url(link, **options)
                        AdminLogManager.post('Blacklist link added', source,
                                             link)
                bot.whisper(source.username, 'Successfully added your links')
                return True
            except:
                log.exception('Unhandled exception in add_link_blacklist')
                bot.whisper(source.username,
                            'Some error occurred while adding your links')
                return False
        else:
            bot.whisper(source.username, 'Usage: !add link blacklist LINK')
            return False

    def add_link_whitelist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.whitelist_url(link)
                AdminLogManager.post('Whitelist link added', source, link)
        except:
            log.exception('Unhandled exception in add_link_whitelist')
            bot.whisper(source.username,
                        'Some error occurred while adding your links')
            return False

        bot.whisper(source.username, 'Successfully added your links')

    def remove_link_blacklist(self, **options):
        message = options['message']
        bot = options['bot']
        source = options['source']

        if message:
            id = None
            try:
                id = int(message)
            except ValueError:
                pass

            link = self.db_session.query(BlacklistedLink).filter_by(
                id=id).one_or_none()

            if link:
                self.blacklisted_links.remove(link)
                self.db_session.delete(link)
                self.db_session.commit()
            else:
                bot.whisper(source.username, 'No link with the given id found')
                return False

            AdminLogManager.post('Blacklist link removed', source, link.domain)
            bot.whisper(
                source.username,
                'Successfully removed blacklisted link with id {0}'.format(
                    link.id))
        else:
            bot.whisper(source.username, 'Usage: !remove link blacklist ID')
            return False

    def remove_link_whitelist(self, **options):
        message = options['message']
        bot = options['bot']
        source = options['source']

        if message:
            id = None
            try:
                id = int(message)
            except ValueError:
                pass

            link = self.db_session.query(WhitelistedLink).filter_by(
                id=id).one_or_none()

            if link:
                self.whitelisted_links.remove(link)
                self.db_session.delete(link)
                self.db_session.commit()
            else:
                bot.whisper(source.username, 'No link with the given id found')
                return False

            AdminLogManager.post('Whitelist link removed', source, link.domain)
            bot.whisper(
                source.username,
                'Successfully removed whitelisted link with id {0}'.format(
                    link.id))
        else:
            bot.whisper(source.username, 'Usage: !remove link whitelist ID')
            return False

    def parse_link_blacklist_arguments(self, message):
        parser = argparse.ArgumentParser()
        parser.add_argument('--deep', dest='level', action='store_true')
        parser.add_argument('--shallow', dest='level', action='store_false')
        parser.set_defaults(level=False)
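        # e.g. '--deep scamlink.lonk/' parses to options={'level': True} and
        # response='scamlink.lonk/'; unrecognized tokens are the links themselves.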

        try:
            args, unknown = parser.parse_known_args(message.split())
        except SystemExit:
            return False, False
        except:
            log.exception('Unhandled exception in add_link_blacklist')
            return False, False

        # Strip options of any values that are set as None
        options = {k: v for k, v in vars(args).items() if v is not None}
        response = ' '.join(unknown)

        return options, response
Example #5
0
class LinkCheckerModule(BaseModule):

    ID = __name__.split('.')[-1]
    NAME = 'Link Checker'
    DESCRIPTION = 'Checks if posted links are bad'
    ENABLED_DEFAULT = True
    SETTINGS = []

    def __init__(self):
        super().__init__()
        self.db_session = None
        self.links = {}

        self.blacklisted_links = []
        self.whitelisted_links = []

        self.cache = LinkCheckerCache()  # cache[url] = True means url is safe, False means the link is bad

        self.action_queue = ActionQueue()
        self.action_queue.start()

    def enable(self, bot):
        self.bot = bot
        if bot:
            bot.add_handler('on_message', self.on_message, priority=100)
            bot.add_handler('on_commit', self.on_commit)
            self.run_later = bot.execute_delayed

            if 'safebrowsingapi' in bot.config['main']:
                # XXX: This should be loaded as a setting instead.
                # Settings need a "password" type so values like this
                # aren't displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(
                    bot.config['main']['safebrowsingapi'], bot.nickname,
                    bot.version)
            else:
                self.safeBrowsingAPI = None

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)

    def disable(self, bot):
        if bot:
            bot.remove_handler('on_message', self.on_message)
            bot.remove_handler('on_commit', self.on_commit)

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
            self.blacklisted_links = []
            self.whitelisted_links = []

    def reload(self):

        log.info('Loaded {0} bad links and {1} good links'.format(
            len(self.blacklisted_links), len(self.whitelisted_links)))
        return self

    def on_message(self, source, message, emotes, whisper, urls):
        if not whisper and source.level < 500 and source.moderator is False:
            for url in urls:
                # Action which will be taken when a bad link is found
                action = Action(self.bot.timeout, args=[source.username, 20])
                # First we perform a basic check
                if self.simple_check(url, action) == self.RET_FURTHER_ANALYSIS:
                    # If the basic check returns no relevant data, we queue up a proper check on the URL
                    self.action_queue.add(self.check_url, args=[url, action])

    def on_commit(self):
        if self.db_session is not None:
            self.db_session.commit()

    def delete_from_cache(self, url):
        if url in self.cache:
            log.debug("LinkChecker: Removing url {0} from cache".format(url))
            del self.cache[url]

    def cache_url(self, url, safe):
        if url in self.cache and self.cache[url] == safe:
            return

        log.debug("LinkChecker: Caching url {0} as {1}".format(
            url, 'SAFE' if safe is True else 'UNSAFE'))
        self.cache[url] = safe
        self.run_later(20, self.delete_from_cache, (url, ))

    def counteract_bad_url(self,
                           url,
                           action=None,
                           want_to_cache=True,
                           want_to_blacklist=True):
        log.debug("LinkChecker: BAD URL FOUND {0}".format(url.url))
        if action:
            action.run()
        if want_to_cache:
            self.cache_url(url.url, False)
        if want_to_blacklist:
            self.blacklist_url(url.url, url.parsed)

    def unlist_url(self, url, list_type, parsed_url=None):
        """ list_type is either 'blacklist' or 'whitelist' """
        if not (url.startswith('http://') or url.startswith('https://')):
            url = 'http://' + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        domain = parsed_url.netloc
        path = parsed_url.path

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        if list_type == 'blacklist':
            link = self.db_session.query(BlacklistedLink).filter_by(
                domain=domain, path=path).one_or_none()
            if link:
                self.blacklisted_links.remove(link)
                self.db_session.delete(link)
            else:
                log.warning('Unable to unlist {0}{1}'.format(domain, path))
        elif list_type == 'whitelist':
            link = self.db_session.query(WhitelistedLink).filter_by(
                domain=domain, path=path).one_or_none()
            if link:
                self.whitelisted_links.remove(link)
                self.db_session.delete(link)
            else:
                log.warning('Unable to unlist {0}{1}'.format(domain, path))

    def blacklist_url(self, url, parsed_url=None, level=1):
        if not (url.lower().startswith('http://')
                or url.lower().startswith('https://')):
            url = 'http://' + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        if self.is_blacklisted(url, parsed_url):
            return False

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = BlacklistedLink(domain, path, level)
        self.db_session.add(link)
        self.blacklisted_links.append(link)
        return True

    def whitelist_url(self, url, parsed_url=None):
        if not (url.lower().startswith('http://')
                or url.lower().startswith('https://')):
            url = 'http://' + url
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        if self.is_whitelisted(url, parsed_url):
            return

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = WhitelistedLink(domain, path)
        self.db_session.add(link)
        self.whitelisted_links.append(link)

    def is_blacklisted(self, url, parsed_url=None, sublink=False):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.blacklisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    if not sublink:
                        return True
                    elif link.level >= 1:  # if it's a sublink, but the blacklisting level is 0, we don't consider it blacklisted
                        return True

        return False

    def is_whitelisted(self, url, parsed_url=None):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.whitelisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    return True

        return False

    RET_BAD_LINK = -1
    RET_FURTHER_ANALYSIS = 0
    RET_GOOD_LINK = 1

    def basic_check(self, url, action, sublink=False):
        """
        Check if the url is in the cache, or if it's blacklisted or whitelisted.
        Return values:
        1 = Link is OK
        -1 = Link is bad
        0 = Link needs further analysis
        """
        if url.url in self.cache:
            log.debug("LinkChecker: Url {0} found in cache".format(url.url))
            if not self.cache[url.url]:  # link is bad
                self.counteract_bad_url(url, action, False, False)
                return self.RET_BAD_LINK
            return self.RET_GOOD_LINK

        log.info('Checking if link is blacklisted...')
        if self.is_blacklisted(url.url, url.parsed, sublink):
            log.debug("LinkChecker: Url {0} is blacklisted".format(url.url))
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            return self.RET_BAD_LINK

        log.info('Checking if link is whitelisted...')
        if self.is_whitelisted(url.url, url.parsed):
            log.debug("LinkChecker: Url {0} allowed by the whitelist".format(
                url.url))
            self.cache_url(url.url, True)
            return self.RET_GOOD_LINK

        return self.RET_FURTHER_ANALYSIS

    def simple_check(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return self.RET_FURTHER_ANALYSIS

        return self.basic_check(url, action)

    def check_url(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return

        try:
            self._check_url(url, action)
        except:
            log.exception("LinkChecker unhanled exception while _check_url")

    def _check_url(self, url, action):
        log.debug("LinkChecker: Checking url {0}".format(url.url))

        # XXX: The basic check is currently performed twice on links found in messages; this should be deduplicated.
        res = self.basic_check(url, action)
        if res == self.RET_GOOD_LINK:
            return
        elif res == self.RET_BAD_LINK:
            return

        connection_timeout = 2
        read_timeout = 1
        try:
            r = requests.head(url.url,
                              allow_redirects=True,
                              timeout=connection_timeout)
        except:
            self.cache_url(url.url, True)
            return

        checkcontenttype = ('content-type' in r.headers
                            and r.headers['content-type']
                            == 'application/octet-stream')
        # the download header is Content-Disposition; its value can carry
        # parameters, e.g. 'attachment; filename="file.bin"'
        checkdispotype = ('content-disposition' in r.headers
                          and r.headers['content-disposition'].startswith('attachment'))

        if checkcontenttype or checkdispotype:  # triggering a download not allowed
            self.counteract_bad_url(url, action)
            return

        redirected_url = Url(r.url)
        if is_same_url(url, redirected_url) is False:
            res = self.basic_check(redirected_url, action)
            if res == self.RET_GOOD_LINK:
                return
            elif res == self.RET_BAD_LINK:
                return

        if self.safeBrowsingAPI:
            if self.safeBrowsingAPI.check_url(
                    redirected_url.url):  # harmful url detected
                log.debug("Bad url because google api")
                self.counteract_bad_url(url, action)
                self.counteract_bad_url(redirected_url)
                return

        if 'content-type' not in r.headers or not r.headers[
                'content-type'].startswith('text/html'):
            return  # can't analyze non-html content
        maximum_size = 1024 * 1024 * 10  # 10 MB
        receive_timeout = 3

        html = ''
        try:
            response = requests.get(url=url.url,
                                    stream=True,
                                    timeout=(connection_timeout, read_timeout))

            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > maximum_size:
                log.error('This file is too big!')
                return

            size = 0
            start = time.time()

            for chunk in response.iter_content(1024):
                if time.time() - start > receive_timeout:
                    log.error('The site took too long to load')
                    return

                size += len(chunk)
                if size > maximum_size:
                    log.error('This file is too big! (fake header)')
                    return
                # chunk is bytes; str(chunk) would produce literal "b'...'" text
                html += chunk.decode('utf-8', errors='ignore')

        except requests.exceptions.ConnectTimeout:
            log.warning('Connection timed out while checking {0}'.format(
                url.url))
            self.cache_url(url.url, True)
            return
        except requests.exceptions.ReadTimeout:
            log.warning('Reading timed out while checking {0}'.format(url.url))
            self.cache_url(url.url, True)
            return
        except:
            log.exception('Unhandled exception')
            return

        try:
            soup = BeautifulSoup(html, 'html.parser')
        except:
            return

        original_url = url
        original_redirected_url = redirected_url
        urls = []
        for link in soup.find_all(
                'a'):  # get a list of links to external sites
            url = link.get('href')
            if url is None:
                continue
            if url.startswith('//'):
                urls.append('http:' + url)
            elif url.startswith('http://') or url.startswith('https://'):
                urls.append(url)

        for url in urls:  # check if the site links to anything dangerous
            url = Url(url)

            if is_subdomain(url.parsed.netloc, original_url.parsed.netloc):
                # log.debug("Skipping because internal link")
                continue

            log.debug("Checking sublink {0}".format(url.url))
            res = self.basic_check(url, action, sublink=True)
            if res == self.RET_BAD_LINK:
                self.counteract_bad_url(url)
                self.counteract_bad_url(original_url, want_to_blacklist=False)
                self.counteract_bad_url(original_redirected_url,
                                        want_to_blacklist=False)
                return
            elif res == self.RET_GOOD_LINK:
                continue

            try:
                r = requests.head(url.url,
                                  allow_redirects=True,
                                  timeout=connection_timeout)
            except:
                continue

            redirected_url = Url(r.url)
            if not is_same_url(url, redirected_url):
                res = self.basic_check(redirected_url, action, sublink=True)
                if res == self.RET_BAD_LINK:
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(original_url,
                                            want_to_blacklist=False)
                    self.counteract_bad_url(original_redirected_url,
                                            want_to_blacklist=False)
                    return
                elif res == self.RET_GOOD_LINK:
                    continue

            if self.safeBrowsingAPI:
                if self.safeBrowsingAPI.check_url(
                        redirected_url.url):  # harmful url detected
                    log.debug("Evil sublink {0} by google API".format(url))
                    self.counteract_bad_url(original_url, action)
                    self.counteract_bad_url(original_redirected_url)
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(redirected_url)
                    return

        # if we got here, the site is clean for our standards
        self.cache_url(original_url.url, True)
        self.cache_url(original_redirected_url.url, True)
        return
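
    # Taken together, _check_url screens a link in stages: basic_check
    # consults the cache and the black/white lists; a HEAD request follows
    # redirects and rejects responses that would trigger a download; the
    # redirect target is re-checked; the Safe Browsing API (when configured)
    # is queried; finally the page body is fetched under size/time limits and
    # every external <a href> it contains is screened as a sublink before the
    # url is cached as safe.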

    def load_commands(self, **options):
        self.commands['add'] = Command.multiaction_command(
            level=100,
            delay_all=0,
            delay_user=0,
            default=None,
            command='add',
            commands={
                'link':
                Command.multiaction_command(
                    level=500,
                    delay_all=0,
                    delay_user=0,
                    default=None,
                    commands={
                        'blacklist':
                        Command.raw_command(
                            self.add_link_blacklist,
                            level=500,
                            description='Blacklist a link',
                            examples=[
                                CommandExample(
                                    None,
                                    'Add a link to the blacklist for shallow search',
                                    chat=
                                    'user:!add link blacklist 0 scamlink.lonk/\n'
                                    'bot>user:Successfully added your links',
                                    description=
                                    'Added the link scamlink.lonk/ to the blacklist for a shallow search'
                                ).parse(),
                                CommandExample(
                                    None,
                                    'Add a link to the blacklist for deep search',
                                    chat=
                                    'user:!add link blacklist 1 scamlink.lonk/\n'
                                    'bot>user:Successfully added your links',
                                    description=
                                    'Added the link scamlink.lonk/ to the blacklist for a deep search'
                                ).parse(),
                            ]),
                        'whitelist':
                        Command.raw_command(
                            self.add_link_whitelist,
                            level=500,
                            description='Whitelist a link',
                            examples=[
                                CommandExample(
                                    None,
                                    'Add a link to the whitelist',
                                    chat=
                                    'user:!add link whitelist safelink.lonk/\n'
                                    'bot>user:Successfully added your links',
                                    description=
                                    'Added the link safelink.lonk/ to the whitelist'
                                ).parse(),
                            ]),
                    })
            })

        self.commands['remove'] = Command.multiaction_command(
            level=100,
            delay_all=0,
            delay_user=0,
            default=None,
            command='remove',
            commands={
                'link':
                Command.multiaction_command(
                    level=500,
                    delay_all=0,
                    delay_user=0,
                    default=None,
                    commands={
                        'blacklist':
                        Command.raw_command(
                            self.remove_link_blacklist,
                            level=500,
                            description='Unblacklist a link',
                            examples=[
                                CommandExample(
                                    None,
                                    'Remove a blacklist link',
                                    chat=
                                    'user:!remove link blacklist scamtwitch.scam\n'
                                    'bot>user:Successfully removed your links',
                                    description=
                                    'Removes scamtwitch.scam as a blacklisted link'
                                ).parse(),
                            ]),
                        'whitelist':
                        Command.raw_command(
                            self.remove_link_whitelist,
                            level=500,
                            description='Unwhitelist a link',
                            examples=[
                                CommandExample(
                                    None,
                                    'Remove a whitelist link',
                                    chat=
                                    'user:!remove link whitelist twitch.safe\n'
                                    'bot>user:Successfully removed your links',
                                    description=
                                    'Removes twitch.safe as a whitelisted link'
                                ).parse(),
                            ]),
                    }),
            })

    def add_link_blacklist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            if not parts[0].isnumeric():
                for link in parts:
                    self.blacklist_url(link)
            else:
                for link in parts[1:]:
                    self.blacklist_url(link, level=int(parts[0]))
        except:
            log.exception('Unhandled exception in add_link_blacklist')
            bot.whisper(source.username,
                        'Some error occurred while adding your links')
            return False

        bot.whisper(source.username, 'Successfully added your links')

    def add_link_whitelist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.whitelist_url(link)
        except:
            log.exception('Unhandled exception in add_link_whitelist')
            bot.whisper(source.username,
                        'Some error occurred while adding your links')
            return False

        bot.whisper(source.username, 'Successfully added your links')

    def remove_link_blacklist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.unlist_url(link, 'blacklist')
        except:
            log.exception('Unhandled exception in remove_link_blacklist')
            bot.whisper(source.username,
                        'Some error occurred while removing your links')
            return False

        bot.whisper(source.username, 'Successfully removed your links')

    def remove_link_whitelist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.unlist_url(link, 'whitelist')
        except:
            log.exception('Unhandled exception in remove_link_whitelist')
            bot.whisper(source.username,
                        'Some error occurred while removing your links')
            return False

        bot.whisper(source.username, 'Successfully removed your links')
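
A note on the blacklist levels used in this example: blacklist_url stores a level on each BlacklistedLink, and is_blacklisted only honours an entry for a sublink (a link found inside a checked page) when its level is at least 1; a directly posted link matches at any level. Below is a minimal, self-contained sketch of that rule, using a hypothetical FakeLink stand-in for BlacklistedLink with guessed is_subdomain/is_subpath semantics:

class FakeLink:
    """Hypothetical stand-in for BlacklistedLink."""

    def __init__(self, domain, path, level):
        self.domain, self.path, self.level = domain, path, level

    def is_subdomain(self, domain):
        # the stored domain itself or any subdomain of it (assumed behaviour)
        return domain == self.domain or domain.endswith('.' + self.domain)

    def is_subpath(self, path):
        return path.startswith(self.path)


def is_blacklisted(links, domain, path, sublink=False):
    for link in links:
        if link.is_subdomain(domain) and link.is_subpath(path):
            # a level-0 entry only blocks direct posts, not sublinks
            if not sublink or link.level >= 1:
                return True
    return False


links = [FakeLink('scamlink.lonk', '/', level=0)]
print(is_blacklisted(links, 'scamlink.lonk', '/'))                # True
print(is_blacklisted(links, 'scamlink.lonk', '/', sublink=True))  # False
links[0].level = 1
print(is_blacklisted(links, 'scamlink.lonk', '/', sublink=True))  # True

In other words, a level-0 ("shallow") entry only punishes posting the link itself, while a level-1 ("deep") entry also condemns pages that merely link to it.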
Example #6
0
class LinkCheckerModule(BaseModule):

    ID = __name__.split('.')[-1]
    NAME = 'Link Checker'
    DESCRIPTION = 'Checks links if they\'re bad'
    ENABLED_DEFAULT = True
    CATEGORY = 'Filter'
    SETTINGS = [
            ModuleSetting(
                key='ban_pleb_links',
                label='Disallow links from non-subscribers',
                type='boolean',
                required=True,
                default=False)
            ]

    def __init__(self):
        super().__init__()
        self.db_session = None
        self.links = {}

        self.blacklisted_links = []
        self.whitelisted_links = []

        self.cache = LinkCheckerCache()  # cache[url] = True means url is safe, False means the link is bad

        self.action_queue = ActionQueue()
        self.action_queue.start()

    def enable(self, bot):
        self.bot = bot
        pajbot.managers.handler.HandlerManager.add_handler('on_message', self.on_message, priority=100)
        pajbot.managers.handler.HandlerManager.add_handler('on_commit', self.on_commit)
        if bot:
            self.run_later = bot.execute_delayed

            if 'safebrowsingapi' in bot.config['main']:
                # XXX: This should be loaded as a setting instead.
                # There needs to be a setting for settings to have them as "passwords"
                # so they're not displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(bot.config['main']['safebrowsingapi'], bot.nickname, bot.version)
            else:
                self.safeBrowsingAPI = None

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)

    def disable(self, bot):
        pajbot.managers.handler.HandlerManager.remove_handler('on_message', self.on_message)
        pajbot.managers.handler.HandlerManager.remove_handler('on_commit', self.on_commit)

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
            self.blacklisted_links = []
            self.whitelisted_links = []

    def reload(self):
        log.info('Loaded {0} bad links and {1} good links'.format(len(self.blacklisted_links), len(self.whitelisted_links)))
        return self

    super_whitelist = ['pajlada.se', 'pajlada.com', 'forsen.tv', 'pajbot.com']

    def on_message(self, source, message, emotes, whisper, urls, event):
        if not whisper and source.level < 500 and source.moderator is False:
            if self.settings['ban_pleb_links'] is True and source.subscriber is False and len(urls) > 0:
                # Check if the links are in our super-whitelist, e.g. on the pajlada.se domain or forsen.tv
                for url in urls:
                    parsed_url = Url(url)
                    if len(parsed_url.parsed.netloc.split('.')) < 2:
                        continue
                    whitelisted = False
                    for whitelist in self.super_whitelist:
                        if is_subdomain(parsed_url.parsed.netloc, whitelist):
                            whitelisted = True
                            break
                    if whitelisted is False:
                        self.bot.timeout(source.username, 30, reason='Non-subs cannot post links')
                        if source.minutes_in_chat_online > 60:
                            self.bot.whisper(source.username, 'You cannot post non-verified links in chat if you\'re not a subscriber.')
                        return False

            for url in urls:
                # Action which will be taken when a bad link is found
                action = Action(self.bot.timeout, args=[source.username, 20], kwargs={'reason': 'Banned link'})
                # First we perform a basic check
                if self.simple_check(url, action) == self.RET_FURTHER_ANALYSIS:
                    # If the basic check returns no relevant data, we queue up a proper check on the URL
                    self.action_queue.add(self.check_url, args=[url, action])
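
        # Action here appears to be a deferred-call wrapper: it captures
        # self.bot.timeout plus its args/kwargs at construction time and only
        # invokes them when a later check calls action.run() (see
        # counteract_bad_url below), so the timeout fires only once the link
        # has actually been judged bad.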

    def on_commit(self):
        if self.db_session is not None:
            self.db_session.commit()

    def delete_from_cache(self, url):
        if url in self.cache:
            log.debug('LinkChecker: Removing url {0} from cache'.format(url))
            del self.cache[url]

    def cache_url(self, url, safe):
        if url in self.cache and self.cache[url] == safe:
            return

        log.debug('LinkChecker: Caching url {0} as {1}'.format(url, 'SAFE' if safe is True else 'UNSAFE'))
        self.cache[url] = safe
        self.run_later(20, self.delete_from_cache, (url, ))

    def counteract_bad_url(self, url, action=None, want_to_cache=True, want_to_blacklist=False):
        log.debug('LinkChecker: BAD URL FOUND {0}'.format(url.url))
        if action:
            action.run()
        if want_to_cache:
            self.cache_url(url.url, False)
        if want_to_blacklist:
            self.blacklist_url(url.url, url.parsed)
            return True

    def blacklist_url(self, url, parsed_url=None, level=0):
        if not (url.lower().startswith('http://') or url.lower().startswith('https://')):
            url = 'http://' + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        if self.is_blacklisted(url, parsed_url):
            return False

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = BlacklistedLink(domain, path, level)
        self.db_session.add(link)
        self.blacklisted_links.append(link)
        self.db_session.commit()

    def whitelist_url(self, url, parsed_url=None):
        if not (url.lower().startswith('http://') or url.lower().startswith('https://')):
            url = 'http://' + url
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        if self.is_whitelisted(url, parsed_url):
            return

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith('www.'):
            domain = domain[4:]
        if path.endswith('/'):
            path = path[:-1]
        if path == '':
            path = '/'

        link = WhitelistedLink(domain, path)
        self.db_session.add(link)
        self.whitelisted_links.append(link)
        self.db_session.commit()

    def is_blacklisted(self, url, parsed_url=None, sublink=False):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.blacklisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    if not sublink:
                        return True
                    elif link.level >= 1:  # if it's a sublink, but the blacklisting level is 0, we don't consider it blacklisted
                        return True

        return False

    def is_whitelisted(self, url, parsed_url=None):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == '':
            path = '/'

        domain_split = domain.split('.')
        if len(domain_split) < 2:
            return False

        for link in self.whitelisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    return True

        return False

    RET_BAD_LINK = -1
    RET_FURTHER_ANALYSIS = 0
    RET_GOOD_LINK = 1

    def basic_check(self, url, action, sublink=False):
        """
        Check if the url is in the cache, or if it's blacklisted or whitelisted.
        Return values:
        1 = Link is OK
        -1 = Link is bad
        0 = Link needs further analysis
        """
        if url.url in self.cache:
            log.debug('LinkChecker: Url {0} found in cache'.format(url.url))
            if not self.cache[url.url]:  # link is bad
                self.counteract_bad_url(url, action, False, False)
                return self.RET_BAD_LINK
            return self.RET_GOOD_LINK

        log.info('Checking if link is blacklisted...')
        if self.is_blacklisted(url.url, url.parsed, sublink):
            log.debug('LinkChecker: Url {0} is blacklisted'.format(url.url))
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            return self.RET_BAD_LINK

        log.info('Checking if link is whitelisted...')
        if self.is_whitelisted(url.url, url.parsed):
            log.debug('LinkChecker: Url {0} allowed by the whitelist'.format(url.url))
            self.cache_url(url.url, True)
            return self.RET_GOOD_LINK

        return self.RET_FURTHER_ANALYSIS

    def simple_check(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return self.RET_FURTHER_ANALYSIS

        return self.basic_check(url, action)

    def check_url(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split('.')) < 2:
            # The URL is broken, ignore it
            return

        try:
            self._check_url(url, action)
        except:
            log.exception('LinkChecker: unhandled exception in _check_url')

    def _check_url(self, url, action):
        log.debug('LinkChecker: Checking url {0}'.format(url.url))

        # XXX: The basic check is currently performed twice on links found in messages; this should be deduplicated.
        res = self.basic_check(url, action)
        if res == self.RET_GOOD_LINK:
            return
        elif res == self.RET_BAD_LINK:
            return

        connection_timeout = 2
        read_timeout = 1
        try:
            r = requests.head(url.url, allow_redirects=True, timeout=connection_timeout)
        except:
            self.cache_url(url.url, True)
            return

        checkcontenttype = ('content-type' in r.headers and r.headers['content-type'] == 'application/octet-stream')
        # the download header is Content-Disposition; its value can carry parameters, e.g. 'attachment; filename="file.bin"'
        checkdispotype = ('content-disposition' in r.headers and r.headers['content-disposition'].startswith('attachment'))

        if checkcontenttype or checkdispotype:  # triggering a download not allowed
            self.counteract_bad_url(url, action)
            return

        redirected_url = Url(r.url)
        if is_same_url(url, redirected_url) is False:
            res = self.basic_check(redirected_url, action)
            if res == self.RET_GOOD_LINK:
                return
            elif res == self.RET_BAD_LINK:
                return

        if self.safeBrowsingAPI:
            if self.safeBrowsingAPI.check_url(redirected_url.url):  # harmful url detected
                log.debug('Bad url according to the Google Safe Browsing API')
                self.counteract_bad_url(url, action, want_to_blacklist=False)
                self.counteract_bad_url(redirected_url, want_to_blacklist=False)
                return

        if 'content-type' not in r.headers or not r.headers['content-type'].startswith('text/html'):
            return  # can't analyze non-html content
        maximum_size = 1024 * 1024 * 10  # 10 MB
        receive_timeout = 3

        html = ''
        try:
            response = requests.get(url=url.url, stream=True, timeout=(connection_timeout, read_timeout))

            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > maximum_size:
                log.error('This file is too big!')
                return

            size = 0
            start = time.time()

            for chunk in response.iter_content(1024):
                if time.time() - start > receive_timeout:
                    log.error('The site took too long to load')
                    return

                size += len(chunk)
                if size > maximum_size:
                    log.error('This file is too big! (fake header)')
                    return
                # chunk is bytes; str(chunk) would produce literal "b'...'" text
                html += chunk.decode('utf-8', errors='ignore')

        except requests.exceptions.ConnectTimeout:
            log.warning('Connection timed out while checking {0}'.format(url.url))
            self.cache_url(url.url, True)
            return
        except requests.exceptions.ReadTimeout:
            log.warning('Reading timed out while checking {0}'.format(url.url))
            self.cache_url(url.url, True)
            return
        except:
            log.exception('Unhandled exception')
            return

        try:
            soup = BeautifulSoup(html, 'html.parser')
        except:
            return

        original_url = url
        original_redirected_url = redirected_url
        urls = []
        for link in soup.find_all('a'):  # get a list of links to external sites
            url = link.get('href')
            if url is None:
                continue
            if url.startswith('//'):
                urls.append('http:' + url)
            elif url.startswith('http://') or url.startswith('https://'):
                urls.append(url)

        for url in urls:  # check if the site links to anything dangerous
            url = Url(url)

            if is_subdomain(url.parsed.netloc, original_url.parsed.netloc):
                # log.debug('Skipping because internal link')
                continue

            log.debug('Checking sublink {0}'.format(url.url))
            res = self.basic_check(url, action, sublink=True)
            if res == self.RET_BAD_LINK:
                self.counteract_bad_url(url)
                self.counteract_bad_url(original_url, want_to_blacklist=False)
                self.counteract_bad_url(original_redirected_url, want_to_blacklist=False)
                return
            elif res == self.RET_GOOD_LINK:
                continue

            try:
                r = requests.head(url.url, allow_redirects=True, timeout=connection_timeout)
            except:
                continue

            redirected_url = Url(r.url)
            if not is_same_url(url, redirected_url):
                res = self.basic_check(redirected_url, action, sublink=True)
                if res == self.RET_BAD_LINK:
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(original_url, want_to_blacklist=False)
                    self.counteract_bad_url(original_redirected_url, want_to_blacklist=False)
                    return
                elif res == self.RET_GOOD_LINK:
                    continue

            if self.safeBrowsingAPI:
                if self.safeBrowsingAPI.check_url(redirected_url.url):  # harmful url detected
                    log.debug('Evil sublink {0} by google API'.format(url))
                    self.counteract_bad_url(original_url, action)
                    self.counteract_bad_url(original_redirected_url)
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(redirected_url)
                    return

        # if we got here, the site is clean for our standards
        self.cache_url(original_url.url, True)
        self.cache_url(original_redirected_url.url, True)
        return

    def load_commands(self, **options):
        self.commands['add'] = pajbot.models.command.Command.multiaction_command(
                level=100,
                delay_all=0,
                delay_user=0,
                default=None,
                command='add',
                commands={
                    'link': pajbot.models.command.Command.multiaction_command(
                        level=500,
                        delay_all=0,
                        delay_user=0,
                        default=None,
                        commands={
                            'blacklist': pajbot.models.command.Command.raw_command(self.add_link_blacklist,
                                level=500,
                                delay_all=0,
                                delay_user=0,
                                description='Blacklist a link',
                                examples=[
                                    pajbot.models.command.CommandExample(None, 'Add a link to the blacklist for a shallow search',
                                        chat='user:!add link blacklist --shallow scamlink.lonk/\n'
                                        'bot>user:Successfully added your links',
                                        description='Added the link scamlink.lonk/ to the blacklist for a shallow search').parse(),
                                    pajbot.models.command.CommandExample(None, 'Add a link to the blacklist for a deep search',
                                        chat='user:!add link blacklist --deep scamlink.lonk/\n'
                                        'bot>user:Successfully added your links',
                                        description='Added the link scamlink.lonk/ to the blacklist for a deep search').parse(),
                                    ]),
                            'whitelist': pajbot.models.command.Command.raw_command(self.add_link_whitelist,
                                level=500,
                                delay_all=0,
                                delay_user=0,
                                description='Whitelist a link',
                                examples=[
                                    pajbot.models.command.CommandExample(None, 'Add a link to the whitelist',
                                        chat='user:!add link whitelist safelink.lonk/\n'
                                        'bot>user:Successfully added your links',
                                        description='Added the link safelink.lonk/ to the whitelist').parse(),
                                    ]),
                            }
                        )
                    }
                )

        self.commands['remove'] = pajbot.models.command.Command.multiaction_command(
                level=100,
                delay_all=0,
                delay_user=0,
                default=None,
                command='remove',
                commands={
                    'link': pajbot.models.command.Command.multiaction_command(
                        level=500,
                        delay_all=0,
                        delay_user=0,
                        default=None,
                        commands={
                            'blacklist': pajbot.models.command.Command.raw_command(self.remove_link_blacklist,
                                level=500,
                                delay_all=0,
                                delay_user=0,
                                description='Remove a link from the blacklist.',
                                examples=[
                                    pajbot.models.command.CommandExample(None, 'Remove a link from the blacklist.',
                                        chat='user:!remove link blacklist 20\n'
                                        'bot>user:Successfully removed blacklisted link with id 20',
                                        description='Remove a link from the blacklist with an ID').parse(),
                                    ]),
                            'whitelist': pajbot.models.command.Command.raw_command(self.remove_link_whitelist,
                                level=500,
                                delay_all=0,
                                delay_user=0,
                                description='Remove a link from the whitelist.',
                                examples=[
                                    pajbot.models.command.CommandExample(None, 'Remove a link from the whitelist.',
                                        chat='user:!remove link whitelist 12\n'
                                        'bot>user:Successfully removed whitelisted link with id 12',
                                        description='Remove a link from the whitelist with an ID').parse(),
                                    ]),
                            }
                        ),
                    }
                )

    def add_link_blacklist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        options, new_links = self.parse_link_blacklist_arguments(message)

        if new_links:
            parts = new_links.split(' ')
            try:
                for link in parts:
                    if len(link) > 1:
                        self.blacklist_url(link, **options)
                        AdminLogManager.post('Blacklist link added', source, link)
                bot.whisper(source.username, 'Successfully added your links')
                return True
            except:
                log.exception('Unhandled exception in add_link_blacklist')
                bot.whisper(source.username, 'Some error occurred while adding your links')
                return False
        else:
            bot.whisper(source.username, 'Usage: !add link blacklist LINK')
            return False

    def add_link_whitelist(self, **options):
        bot = options['bot']
        message = options['message']
        source = options['source']

        parts = message.split(' ')
        try:
            for link in parts:
                self.whitelist_url(link)
                AdminLogManager.post('Whitelist link added', source, link)
        except:
            log.exception('Unhandled exception in add_link_whitelist')
            bot.whisper(source.username, 'Some error occurred while adding your links')
            return False

        bot.whisper(source.username, 'Successfully added your links')

    def remove_link_blacklist(self, **options):
        message = options['message']
        bot = options['bot']
        source = options['source']

        if message:
            id = None
            try:
                id = int(message)
            except ValueError:
                pass

            link = self.db_session.query(BlacklistedLink).filter_by(id=id).one_or_none()

            if link:
                self.blacklisted_links.remove(link)
                self.db_session.delete(link)
                self.db_session.commit()
            else:
                bot.whisper(source.username, 'No link with the given id found')
                return False

            AdminLogManager.post('Blacklist link removed', source, link.domain)
            bot.whisper(source.username, 'Successfully removed blacklisted link with id {0}'.format(link.id))
        else:
            bot.whisper(source.username, 'Usage: !remove link blacklist ID')
            return False

    def remove_link_whitelist(self, **options):
        message = options['message']
        bot = options['bot']
        source = options['source']

        if message:
            id = None
            try:
                id = int(message)
            except ValueError:
                pass

            link = self.db_session.query(WhitelistedLink).filter_by(id=id).one_or_none()

            if link:
                self.whitelisted_links.remove(link)
                self.db_session.delete(link)
                self.db_session.commit()
            else:
                bot.whisper(source.username, 'No link with the given id found')
                return False

            AdminLogManager.post('Whitelist link removed', source, link.domain)
            bot.whisper(source.username, 'Successfully removed whitelisted link with id {0}'.format(link.id))
        else:
            bot.whisper(source.username, 'Usage: !remove link whitelist ID')
            return False

    def parse_link_blacklist_arguments(self, message):
        parser = argparse.ArgumentParser()
        parser.add_argument('--deep', dest='level', action='store_true')
        parser.add_argument('--shallow', dest='level', action='store_false')
        parser.set_defaults(level=False)

        try:
            args, unknown = parser.parse_known_args(message.split())
        except SystemExit:
            return False, False
        except:
            log.exception('Unhandled exception in add_link_blacklist')
            return False, False

        # Strip options of any values that are set as None
        options = {k: v for k, v in vars(args).items() if v is not None}
        response = ' '.join(unknown)

        return options, response
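
One subtlety in parse_link_blacklist_arguments above: --deep and --shallow both write to the same level destination via store_true/store_false, so options['level'] always comes out as a boolean (default False) that add_link_blacklist forwards to blacklist_url(link, **options); since True >= 1 in Python, --deep effectively creates a level-1 ("deep") entry. A self-contained sketch of just the parsing step, using only the standard-library argparse the module already relies on:

import argparse


def parse_link_blacklist_arguments(message):
    parser = argparse.ArgumentParser()
    parser.add_argument('--deep', dest='level', action='store_true')
    parser.add_argument('--shallow', dest='level', action='store_false')
    parser.set_defaults(level=False)

    # any token that is not a recognised flag is treated as a link
    args, unknown = parser.parse_known_args(message.split())
    return vars(args), ' '.join(unknown)


print(parse_link_blacklist_arguments('--deep scamlink.lonk/'))
# ({'level': True}, 'scamlink.lonk/')
print(parse_link_blacklist_arguments('scamlink.lonk/ other.lonk/'))
# ({'level': False}, 'scamlink.lonk/ other.lonk/')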
Example #7
0
class LinkCheckerModule(BaseModule):

    ID = __name__.split(".")[-1]
    NAME = "Link Checker"
    DESCRIPTION = "Checks links if they're bad"
    ENABLED_DEFAULT = True
    CATEGORY = "Filter"
    SETTINGS = [
        ModuleSetting(
            key="ban_pleb_links",
            label="Disallow links from non-subscribers",
            type="boolean",
            required=True,
            default=False,
        ),
        ModuleSetting(key="ban_sub_links",
                      label="Disallow links from subscribers",
                      type="boolean",
                      required=True,
                      default=False),
        ModuleSetting(
            key="timeout_length",
            label="Timeout length",
            type="number",
            required=True,
            placeholder="Timeout length in seconds",
            default=60,
            constraints={
                "min_value": 1,
                "max_value": 3600
            },
        ),
    ]

    def __init__(self, bot):
        super().__init__(bot)
        self.db_session = None
        self.links = {}

        self.blacklisted_links = []
        self.whitelisted_links = []

        self.cache = LinkCheckerCache(
        )  # cache[url] = True means url is safe, False means the link is bad

        self.action_queue = ActionQueue()
        self.action_queue.start()

    def enable(self, bot):
        HandlerManager.add_handler("on_message", self.on_message, priority=100)
        HandlerManager.add_handler("on_commit", self.on_commit)
        if bot:
            self.run_later = bot.execute_delayed

            if "safebrowsingapi" in bot.config["main"]:
                # XXX: This should be loaded as a setting instead.
                # There needs to be a setting for settings to have them as "passwords"
                # so they're not displayed openly
                self.safeBrowsingAPI = SafeBrowsingAPI(
                    bot.config["main"]["safebrowsingapi"], bot.nickname,
                    bot.version)
            else:
                self.safeBrowsingAPI = None

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
        self.db_session = DBManager.create_session()
        self.blacklisted_links = []
        for link in self.db_session.query(BlacklistedLink):
            self.blacklisted_links.append(link)

        self.whitelisted_links = []
        for link in self.db_session.query(WhitelistedLink):
            self.whitelisted_links.append(link)

    def disable(self, bot):
        pajbot.managers.handler.HandlerManager.remove_handler(
            "on_message", self.on_message)
        pajbot.managers.handler.HandlerManager.remove_handler(
            "on_commit", self.on_commit)

        if self.db_session is not None:
            self.db_session.commit()
            self.db_session.close()
            self.db_session = None
            self.blacklisted_links = []
            self.whitelisted_links = []

    def reload(self):
        log.info("Loaded {0} bad links and {1} good links".format(
            len(self.blacklisted_links), len(self.whitelisted_links)))
        return self

    super_whitelist = ["pajlada.se", "pajlada.com", "forsen.tv", "pajbot.com"]

    def on_message(self, source, whisper, urls, **rest):
        if whisper:
            return

        if source.level >= 500 or source.moderator is True:
            return

        if len(urls) > 0:
            do_timeout = False
            ban_reason = "You are not allowed to post links in chat"
            whisper_reason = "??? KKona"

            if self.settings[
                    "ban_pleb_links"] is True and source.subscriber is False:
                do_timeout = True
                whisper_reason = "You cannot post non-verified links in chat if you're not a subscriber."
            elif self.settings[
                    "ban_sub_links"] is True and source.subscriber is True:
                do_timeout = True
                whisper_reason = "You cannot post non-verified links in chat."

            if do_timeout is True:
                # Check if the links are in our super-whitelist, e.g. on the pajlada.se domain or forsen.tv
                for url in urls:
                    parsed_url = Url(url)
                    if len(parsed_url.parsed.netloc.split(".")) < 2:
                        continue
                    whitelisted = False
                    for whitelist in self.super_whitelist:
                        if is_subdomain(parsed_url.parsed.netloc, whitelist):
                            whitelisted = True
                            break
                    if whitelisted is False:
                        self.bot.timeout(source.username,
                                         30,
                                         reason=ban_reason)
                        if source.minutes_in_chat_online > 60:
                            self.bot.whisper(source.username, whisper_reason)
                        return False

        for url in urls:
            # Action which will be taken when a bad link is found
            action = Action(
                self.bot.timeout,
                args=[source.username, self.settings["timeout_length"]],
                kwargs={"reason": "Banned link"},
            )
            # First we perform a basic check
            if self.simple_check(url, action) == self.RET_FURTHER_ANALYSIS:
                # If the basic check returns no relevant data, we queue up a proper check on the URL
                self.action_queue.add(self.check_url, args=[url, action])

    def on_commit(self, **rest):
        if self.db_session is not None:
            self.db_session.commit()

    def delete_from_cache(self, url):
        if url in self.cache:
            log.debug("LinkChecker: Removing url {0} from cache".format(url))
            del self.cache[url]

    def cache_url(self, url, safe):
        if url in self.cache and self.cache[url] == safe:
            return

        log.debug("LinkChecker: Caching url {0} as {1}".format(
            url, "SAFE" if safe is True else "UNSAFE"))
        self.cache[url] = safe
        self.run_later(20, self.delete_from_cache, (url, ))

    def counteract_bad_url(self,
                           url,
                           action=None,
                           want_to_cache=True,
                           want_to_blacklist=False):
        log.debug("LinkChecker: BAD URL FOUND {0}".format(url.url))
        if action:
            action.run()
        if want_to_cache:
            self.cache_url(url.url, False)
        if want_to_blacklist:
            self.blacklist_url(url.url, url.parsed)
            return True

    def blacklist_url(self, url, parsed_url=None, level=0):
        if not (url.lower().startswith("http://")
                or url.lower().startswith("https://")):
            url = "http://" + url

        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)

        if self.is_blacklisted(url, parsed_url):
            return False

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith("www."):
            domain = domain[4:]
        if path.endswith("/"):
            path = path[:-1]
        if path == "":
            path = "/"

        link = BlacklistedLink(domain, path, level)
        self.db_session.add(link)
        self.blacklisted_links.append(link)
        self.db_session.commit()

    def whitelist_url(self, url, parsed_url=None):
        if not (url.lower().startswith("http://")
                or url.lower().startswith("https://")):
            url = "http://" + url
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        if self.is_whitelisted(url, parsed_url):
            return

        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()

        if domain.startswith("www."):
            domain = domain[4:]
        if path.endswith("/"):
            path = path[:-1]
        if path == "":
            path = "/"

        link = WhitelistedLink(domain, path)
        self.db_session.add(link)
        self.whitelisted_links.append(link)
        self.db_session.commit()

    def is_blacklisted(self, url, parsed_url=None, sublink=False):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == "":
            path = "/"

        domain_split = domain.split(".")
        if len(domain_split) < 2:
            return False

        for link in self.blacklisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    if not sublink:
                        return True
                    elif (
                            link.level >= 1
                    ):  # if it's a sublink, but the blacklisting level is 0, we don't consider it blacklisted
                        return True

        return False

    def is_whitelisted(self, url, parsed_url=None):
        if parsed_url is None:
            parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.netloc.lower()
        path = parsed_url.path.lower()
        if path == "":
            path = "/"

        domain_split = domain.split(".")
        if len(domain_split) < 2:
            return False

        for link in self.whitelisted_links:
            if link.is_subdomain(domain):
                if link.is_subpath(path):
                    return True

        return False

    RET_BAD_LINK = -1
    RET_FURTHER_ANALYSIS = 0
    RET_GOOD_LINK = 1

    def basic_check(self, url, action, sublink=False):
        """
        Check if the url is in the cache, or if it's blacklisted or whitelisted.
        Return values:
        1 = Link is OK
        -1 = Link is bad
        0 = Link needs further analysis
        """
        if url.url in self.cache:
            log.debug("LinkChecker: Url {0} found in cache".format(url.url))
            if not self.cache[url.url]:  # link is bad
                self.counteract_bad_url(url, action, False, False)
                return self.RET_BAD_LINK
            return self.RET_GOOD_LINK

        log.info("Checking if link is blacklisted...")
        if self.is_blacklisted(url.url, url.parsed, sublink):
            log.debug("LinkChecker: Url {0} is blacklisted".format(url.url))
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            return self.RET_BAD_LINK

        log.info("Checking if link is whitelisted...")
        if self.is_whitelisted(url.url, url.parsed):
            log.debug("LinkChecker: Url {0} allowed by the whitelist".format(
                url.url))
            self.cache_url(url.url, True)
            return self.RET_GOOD_LINK

        return self.RET_FURTHER_ANALYSIS

    def simple_check(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split(".")) < 2:
            # The URL is broken, ignore it
            return self.RET_FURTHER_ANALYSIS

        return self.basic_check(url, action)

    def check_url(self, url, action):
        url = Url(url)
        if len(url.parsed.netloc.split(".")) < 2:
            # The URL is broken, ignore it
            return

        try:
            self._check_url(url, action)
        except:
            log.exception("LinkChecker unhanled exception while _check_url")

    def _check_url(self, url, action):
        log.debug("LinkChecker: Checking url {0}".format(url.url))

        # XXX: The basic check is currently performed twice on links found in messages; this should be deduplicated.
        res = self.basic_check(url, action)
        if res == self.RET_GOOD_LINK:
            return
        elif res == self.RET_BAD_LINK:
            return

        connection_timeout = 2
        read_timeout = 1
        try:
            r = requests.head(url.url,
                              allow_redirects=True,
                              timeout=connection_timeout)
        except:
            self.cache_url(url.url, True)
            return

        checkcontenttype = "content-type" in r.headers and r.headers[
            "content-type"] == "application/octet-stream"
        checkdispotype = "disposition-type" in r.headers and r.headers[
            "disposition-type"] == "attachment"

        if checkcontenttype or checkdispotype:  # triggering a download not allowed
            self.counteract_bad_url(url, action)
            return

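        # Re-run the cache/blacklist/whitelist checks against the final URL in
        # case a redirect landed on a known-bad domain.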
        redirected_url = Url(r.url)
        if not is_same_url(url, redirected_url):
            res = self.basic_check(redirected_url, action)
            if res != self.RET_FURTHER_ANALYSIS:
                return

        if self.safeBrowsingAPI and self.safeBrowsingAPI.check_url(
                redirected_url.url):  # harmful url detected
            log.debug("Bad URL flagged by the Google Safe Browsing API")
            self.counteract_bad_url(url, action, want_to_blacklist=False)
            self.counteract_bad_url(redirected_url,
                                    want_to_blacklist=False)
            return

        if "content-type" not in r.headers or not r.headers[
                "content-type"].startswith("text/html"):
            return  # can't analyze non-html content
        maximum_size = 1024 * 1024 * 10  # 10 MB
        receive_timeout = 3
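        # Stream the body so we can abort early when the page is too large or
        # loads too slowly, even if the Content-Length header lies.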

        html = ""
        try:
            response = requests.get(url=url.url,
                                    stream=True,
                                    timeout=(connection_timeout, read_timeout))

            content_length = response.headers.get("Content-Length")
            if content_length and int(
                    response.headers.get("Content-Length")) > maximum_size:
                log.error("This file is too big!")
                return

            size = 0
            start = pajbot.utils.now().timestamp()

            for chunk in response.iter_content(1024):
                if pajbot.utils.now().timestamp() - start > receive_timeout:
                    log.error("The site took too long to load")
                    return

                size += len(chunk)
                if size > maximum_size:
                    log.error("This file is too big! (fake header)")
                    return
                # chunks arrive as bytes; decode before appending
                html += chunk.decode("utf-8", errors="ignore")

        except requests.exceptions.ConnectTimeout:
            log.warning("Connection timed out while checking {0}".format(
                url.url))
            self.cache_url(url.url, True)
            return
        except requests.exceptions.ReadTimeout:
            log.warning("Reading timed out while checking {0}".format(url.url))
            self.cache_url(url.url, True)
            return
        except Exception:
            log.exception("Unhandled exception")
            return

        try:
            soup = BeautifulSoup(html, "html.parser")
        except Exception:
            return

        original_url = url
        original_redirected_url = redirected_url
        urls = []
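        # Scheme-relative ("//host/path") links are normalized to http;
        # everything else (relative paths, anchors, other schemes) is skipped.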
        for link in soup.find_all(
                "a"):  # get a list of links to external sites
            url = link.get("href")
            if url is None:
                continue
            if url.startswith("//"):
                urls.append("http:" + url)
            elif url.startswith("http://") or url.startswith("https://"):
                urls.append(url)

        for url in urls:  # check if the site links to anything dangerous
            url = Url(url)

            if is_subdomain(url.parsed.netloc, original_url.parsed.netloc):
                # log.debug('Skipping because internal link')
                continue

            log.debug("Checking sublink {0}".format(url.url))
            res = self.basic_check(url, action, sublink=True)
            if res == self.RET_BAD_LINK:
                self.counteract_bad_url(url)
                self.counteract_bad_url(original_url, want_to_blacklist=False)
                self.counteract_bad_url(original_redirected_url,
                                        want_to_blacklist=False)
                return
            elif res == self.RET_GOOD_LINK:
                continue

            try:
                r = requests.head(url.url,
                                  allow_redirects=True,
                                  timeout=connection_timeout)
            except requests.exceptions.RequestException:
                continue

            redirected_url = Url(r.url)
            if not is_same_url(url, redirected_url):
                res = self.basic_check(redirected_url, action, sublink=True)
                if res == self.RET_BAD_LINK:
                    self.counteract_bad_url(url)
                    self.counteract_bad_url(original_url,
                                            want_to_blacklist=False)
                    self.counteract_bad_url(original_redirected_url,
                                            want_to_blacklist=False)
                    return
                elif res == self.RET_GOOD_LINK:
                    continue

            if self.safeBrowsingAPI and self.safeBrowsingAPI.check_url(
                    redirected_url.url):  # harmful url detected
                log.debug(
                    "Evil sublink {0} flagged by the Google Safe Browsing API".
                    format(url.url))
                self.counteract_bad_url(original_url, action)
                self.counteract_bad_url(original_redirected_url)
                self.counteract_bad_url(url)
                self.counteract_bad_url(redirected_url)
                return

        # if we got here, the site is clean for our standards
        self.cache_url(original_url.url, True)
        self.cache_url(original_redirected_url.url, True)
        return

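    # The command tree below wires the chat commands to the handlers above,
    # e.g. "!add link blacklist --deep scamlink.lonk/" -> add_link_blacklist().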
    def load_commands(self, **options):
        self.commands["add"] = Command.multiaction_command(
            level=100,
            delay_all=0,
            delay_user=0,
            default=None,
            command="add",
            commands={
                "link":
                Command.multiaction_command(
                    level=500,
                    delay_all=0,
                    delay_user=0,
                    default=None,
                    commands={
                        "blacklist":
                        Command.raw_command(
                            self.add_link_blacklist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description="Blacklist a link",
                            examples=[
                                CommandExample(
                                    None,
                                    "Add a link to the blacklist for a shallow search",
                                    chat=
                                    "user:!add link blacklist --shallow scamlink.lonk/\n"
                                    "bot>user:Successfully added your links",
                                    description=
                                    "Added the link scamlink.lonk/ to the blacklist for a shallow search",
                                ).parse(),
                                CommandExample(
                                    None,
                                    "Add a link to the blacklist for a deep search",
                                    chat=
                                    "user:!add link blacklist --deep scamlink.lonk/\n"
                                    "bot>user:Successfully added your links",
                                    description=
                                    "Added the link scamlink.lonk/ to the blacklist for a deep search",
                                ).parse(),
                            ],
                        ),
                        "whitelist":
                        Command.raw_command(
                            self.add_link_whitelist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description="Whitelist a link",
                            examples=[
                                CommandExample(
                                    None,
                                    "Add a link to the whitelist",
                                    chat=
                                    "user:!add link whitelink safelink.lonk/\n"
                                    "bot>user:Successfully added your links",
                                    description=
                                    "Added the link safelink.lonk/ to the whitelist",
                                ).parse()
                            ],
                        ),
                    },
                )
            },
        )

        self.commands["remove"] = Command.multiaction_command(
            level=100,
            delay_all=0,
            delay_user=0,
            default=None,
            command="remove",
            commands={
                "link":
                Command.multiaction_command(
                    level=500,
                    delay_all=0,
                    delay_user=0,
                    default=None,
                    commands={
                        "blacklist":
                        Command.raw_command(
                            self.remove_link_blacklist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description="Remove a link from the blacklist.",
                            examples=[
                                CommandExample(
                                    None,
                                    "Remove a link from the blacklist.",
                                    chat="user:!remove link blacklist 20\n"
                                    "bot>user:Successfully removed blacklisted link with id 20",
                                    description=
                                    "Remove a link from the blacklist with an ID",
                                ).parse()
                            ],
                        ),
                        "whitelist":
                        Command.raw_command(
                            self.remove_link_whitelist,
                            level=500,
                            delay_all=0,
                            delay_user=0,
                            description="Remove a link from the whitelist.",
                            examples=[
                                CommandExample(
                                    None,
                                    "Remove a link from the whitelist.",
                                    chat="user:!remove link whitelist 12\n"
                                    "bot>user:Successfully removed blacklisted link with id 12",
                                    description=
                                    "Remove a link from the whitelist with an ID",
                                ).parse()
                            ],
                        ),
                    },
                )
            },
        )

    def add_link_blacklist(self, **options):
        bot = options["bot"]
        message = options["message"]
        source = options["source"]

        options, new_links = self.parse_link_blacklist_arguments(message)

        if new_links:
            parts = new_links.split(" ")
            try:
                for link in parts:
                    if len(link) > 1:
                        self.blacklist_url(link, **options)
                        AdminLogManager.post("Blacklist link added", source,
                                             link)
                bot.whisper(source.username, "Successfully added your links")
                return True
            except Exception:
                log.exception("Unhandled exception in add_link_blacklist")
                bot.whisper(source.username,
                            "Some error occurred while adding your links")
                return False
        else:
            bot.whisper(source.username, "Usage: !add link blacklist LINK")
            return False

    def add_link_whitelist(self, **options):
        bot = options["bot"]
        message = options["message"]
        source = options["source"]

        parts = message.split(" ")
        try:
            for link in parts:
                self.whitelist_url(link)
                AdminLogManager.post("Whitelist link added", source, link)
        except Exception:
            log.exception("Unhandled exception in add_link_whitelist")
            bot.whisper(source.username,
                        "Some error occurred while adding your links")
            return False

        bot.whisper(source.username, "Successfully added your links")

    def remove_link_blacklist(self, **options):
        message = options["message"]
        bot = options["bot"]
        source = options["source"]

        if message:
            link_id = None
            try:
                link_id = int(message)
            except ValueError:
                pass

            link = self.db_session.query(BlacklistedLink).filter_by(
                id=link_id).one_or_none()

            if link:
                # capture fields now; the instance may be expired after commit
                domain = link.domain
                self.blacklisted_links.remove(link)
                self.db_session.delete(link)
                self.db_session.commit()
            else:
                bot.whisper(source.username, "No link with the given id found")
                return False

            AdminLogManager.post("Blacklist link removed", source, domain)
            bot.whisper(
                source.username,
                "Successfully removed blacklisted link with id {0}".format(
                    link_id))
            return True
        else:
            bot.whisper(source.username, "Usage: !remove link blacklist ID")
            return False

    def remove_link_whitelist(self, **options):
        message = options["message"]
        bot = options["bot"]
        source = options["source"]

        if message:
            link_id = None
            try:
                link_id = int(message)
            except ValueError:
                pass

            link = self.db_session.query(WhitelistedLink).filter_by(
                id=link_id).one_or_none()

            if link:
                # capture fields now; the instance may be expired after commit
                domain = link.domain
                self.whitelisted_links.remove(link)
                self.db_session.delete(link)
                self.db_session.commit()
            else:
                bot.whisper(source.username, "No link with the given id found")
                return False

            AdminLogManager.post("Whitelist link removed", source, domain)
            bot.whisper(
                source.username,
                "Successfully removed whitelisted link with id {0}".format(
                    link_id))
            return True
        else:
            bot.whisper(source.username, "Usage: !remove link whitelist ID")
            return False

    @staticmethod
    def parse_link_blacklist_arguments(message):
        parser = argparse.ArgumentParser()
        parser.add_argument("--deep", dest="level", action="store_true")
        parser.add_argument("--shallow", dest="level", action="store_false")
        parser.set_defaults(level=False)

        try:
            args, unknown = parser.parse_known_args(message.split())
        except SystemExit:
            return False, False
        except Exception:
            log.exception(
                "Unhandled exception in parse_link_blacklist_arguments")
            return False, False

        # Strip options of any values that are set as None
        options = {k: v for k, v in vars(args).items() if v is not None}
        response = " ".join(unknown)

        return options, response
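    # A hypothetical call, for illustration:
    #   parse_link_blacklist_arguments("--deep scamlink.lonk/ other.lonk/")
    #   -> ({"level": True}, "scamlink.lonk/ other.lonk/")
    # where level=True corresponds to the --deep flag.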