Example no. 1
    def recv_info(self, info):

        m_parsed_url = info.parsed_url
        m_results = []

        #------------------------------------------------------------------
        # Find suspicious URLs by matching against known substrings.

        # Load wordlists
        m_wordlist_middle     = WordListLoader.get_wordlist(Config.plugin_config['middle'])
        m_wordlist_extensions = WordListLoader.get_wordlist(Config.plugin_config['extensions'])

        # Add keywords that match anywhere in the URL path.
        m_results.extend([SuspiciousURLPath(info, x)
                          for x in m_wordlist_middle
                          if x in m_parsed_url.directory.split("/") or
                          x == m_parsed_url.filebase or
                          x == m_parsed_url.extension])

        # Add keywords that match the URL file extension.
        m_results.extend([SuspiciousURLPath(info, x)
                          for x in m_wordlist_extensions
                          if m_parsed_url.extension == x])

        #------------------------------------------------------------------
        # Find suspicious URLs by calculating the Shannon entropy of the hostname.
        # Idea from: https://github.com/stricaud/urlweirdos/blob/master/src/urlw/plugins/shannon/__init__.py
        # TODO: test with unicode enabled hostnames!

        # Check the Shannon entropy for the hostname.
        hostname = info.parsed_url.hostname
        entropy = calculate_shannon_entropy(hostname)
        if entropy > 4.0:
            m_results.append( SuspiciousURLPath(info, hostname) )

        # Check the Shannon entropy for the subdomains.
        for subdomain in info.parsed_url.hostname.split('.'):
            if len(subdomain) > 3:
                entropy = calculate_shannon_entropy(subdomain)
                if entropy > 4.0:
                    m_results.append( SuspiciousURLPath(info, subdomain) )

        #------------------------------------------------------------------
        # Get suspicious malware links.
        #------------------------------------------------------------------
        p     = None
        m_url = info.url
        Logger.log_more_verbose("Looking for output links to malware sites")
        try:
            allow_redirects = Config.audit_config.follow_redirects or \
                             (info.depth == 0 and Config.audit_config.follow_first_redirect)
            p = download(m_url, self.check_download, allow_redirects=allow_redirects)
        except NetworkException as e:
            Logger.log_more_verbose("Error while processing %r: %s" % (m_url, str(e)))
Example no. 2
    def recv_info(self, info):

        m_parsed_url = info.parsed_url
        m_results = []

        #------------------------------------------------------------------
        # Find suspicious URLs by matching against known substrings.

        # Load wordlists
        m_wordlist_middle = WordListLoader.get_wordlist(
            Config.plugin_config['middle'])
        m_wordlist_extensions = WordListLoader.get_wordlist(
            Config.plugin_config['extensions'])

        # Add keywords that match anywhere in the URL path.
        m_results.extend([
            SuspiciousURL(info, x) for x in m_wordlist_middle
            if x in m_parsed_url.directory.split("/")
            or x == m_parsed_url.filebase or x == m_parsed_url.extension
        ])

        # Add keywords that match the URL file extension.
        m_results.extend([
            SuspiciousURL(info, x) for x in m_wordlist_extensions
            if m_parsed_url.extension == x
        ])

        #------------------------------------------------------------------
        # Find suspicious URLs by calculating the Shannon entropy of the hostname.
        # Idea from: https://github.com/stricaud/urlweirdos/blob/master/src/urlw/plugins/shannon/__init__.py
        # TODO: test with unicode enabled hostnames!

        # Check the Shannon entropy for the hostname.
        hostname = info.parsed_url.hostname
        entropy = calculate_shannon_entropy(hostname)
        if entropy > 4.0:
            m_results.append(SuspiciousURL(info, hostname))

        # Check the Shannon entropy for the subdomains.
        for subdomain in info.parsed_url.hostname.split('.'):
            if len(subdomain) > 3:
                entropy = calculate_shannon_entropy(subdomain)
                if entropy > 4.0:
                    m_results.append(SuspiciousURL(info, subdomain))

        #------------------------------------------------------------------

        return m_results
Example no. 3
def load_wordlists(wordlists):
    """
    Load the wordlists with the names passed as parameters.

    This function receives a list of wordlist names, defined in the plugin
    configuration file, and returns a dict with the loaded wordlist instances.

    :param wordlists: list of wordlist names
    :type wordlists: list

    :returns: A dict mapping each name to its loaded wordlists
    :rtype: dict
    """

    m_tmp_wordlist = {}

    # Get the wordlists to load
    for l_w in wordlists:
        for wordlist_family, l_wordlists in Config.plugin_extra_config.iteritems():
            if wordlist_family.lower() in l_w.lower():
                m_tmp_wordlist[l_w] = l_wordlists

    # Load the wordlists
    m_return = {}
    for k, w_paths in m_tmp_wordlist.iteritems():
        m_return[k] = [WordListLoader.get_wordlist(w) for w in w_paths]

    return m_return
Example no. 4
def load_wordlists(wordlists):
    """
    Load the wordlists with the names passed as parameters.

    This function receives a list of wordlist names, defined in the plugin
    configuration file, and returns a dict with the loaded wordlist instances.

    :param wordlists: list of wordlist names
    :type wordlists: list

    :returns: A dict mapping each name to its loaded wordlists
    :rtype: dict
    """

    m_tmp_wordlist = {}

    # Get the wordlists to load
    for l_w in wordlists:
        for wordlist_family, l_wordlists in Config.plugin_extra_config.iteritems():
            if wordlist_family.lower() in l_w.lower():
                m_tmp_wordlist[l_w] = l_wordlists

    # Load the wordlists
    m_return = {}
    for k, w_paths in m_tmp_wordlist.iteritems():
        m_return[k] = [WordListLoader.get_wordlist(w) for w in w_paths]

    return m_return
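The matching step above keys off substring containment between configuration section names and the requested wordlist names. A self-contained sketch of that matching rule, using plain dicts in place of GoLismero's Config and WordListLoader (the section names and paths below are made up for illustration):

# Hypothetical stand-in for Config.plugin_extra_config.
plugin_extra_config = {
    "middle":     ["wordlists/dirs.txt"],
    "extensions": ["wordlists/extensions.txt"],
}

def match_wordlist_families(requested, extra_config):
    # For each requested name, keep every config section whose family
    # name appears (case-insensitively) inside the requested name.
    matched = {}
    for name in requested:
        for family, paths in extra_config.items():
            if family.lower() in name.lower():
                matched[name] = paths
    return matched

print(match_wordlist_families(["wordlist_middle"], plugin_extra_config))
# -> {'wordlist_middle': ['wordlists/dirs.txt']}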
Example no. 5
    def analyze_url(self, info):

        m_parsed_url = info.parsed_url
        m_results = []

        Logger.log_more_verbose("Processing URL: %s" % m_parsed_url)


        #----------------------------------------------------------------------
        # Find suspicious URLs by matching against known substrings.

        # Load wordlists
        m_wordlist_middle     = WordListLoader.get_wordlist(Config.plugin_config['middle'])
        m_wordlist_extensions = WordListLoader.get_wordlist(Config.plugin_config['extensions'])

        # Add keywords that match anywhere in the URL path.
        m_results.extend([SuspiciousURLPath(info, x)
                          for x in m_wordlist_middle
                          if x in m_parsed_url.directory.split("/") or
                          x == m_parsed_url.filebase or
                          x == m_parsed_url.extension])

        # Add keywords that match the URL file extension.
        m_results.extend([SuspiciousURLPath(info, x)
                          for x in m_wordlist_extensions
                          if m_parsed_url.extension == x])


        #----------------------------------------------------------------------
        # Find suspicious URLs by calculating the Shannon entropy of the hostname.
        # Idea from: https://github.com/stricaud/urlweirdos/blob/master/src/urlw/plugins/shannon/__init__.py
        # TODO: test with unicode enabled hostnames!

        # Check the Shannon entropy for the hostname.
        hostname = info.parsed_url.hostname
        entropy = calculate_shannon_entropy(hostname)
        if entropy > 4.0:
            m_results.append( SuspiciousURLPath(info, hostname) )

        # Check the Shannon entropy for the subdomains.
        for subdomain in info.parsed_url.hostname.split('.'):
            if len(subdomain) > 3:
                entropy = calculate_shannon_entropy(subdomain)
                if entropy > 4.0:
                    m_results.append( SuspiciousURLPath(info, subdomain) )

        return m_results
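These examples call a calculate_shannon_entropy helper that is not shown here. A minimal sketch of such a function, assuming it returns the entropy of the character distribution in bits per character (so the 4.0 threshold above flags hostnames with unusually random-looking characters):

import math
from collections import Counter

def calculate_shannon_entropy(text):
    # Shannon entropy of the character distribution, in bits per character.
    # An empty string is treated as having zero entropy.
    if not text:
        return 0.0
    length = float(len(text))
    return -sum((count / length) * math.log(count / length, 2)
                for count in Counter(text).values())

# calculate_shannon_entropy("www")      -> 0.0
# calculate_shannon_entropy("qx7zk2vp") -> 3.0  (8 distinct characters)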
Example no. 6
    def __detect_wordpress_installation(self, url, wordpress_urls):
        """
        Try to detect a WordPress installation in the current path.

        :param url: URL where to look for the WordPress installation.
        :type url: str

        :param wordpress_urls: name of the wordlist containing WordPress URLs.
        :type wordpress_urls: str

        :return: True if a WordPress installation was found, False otherwise.
        :rtype: bool
        """
        Logger.log_more_verbose(
            "Detecting Wordpress instalation in URI: '%s'." % url)
        total_urls = 0
        urls_found = 0

        error_page = get_error_page(url).raw_data

        for u in WordListLoader.get_wordlist(wordpress_urls):
            total_urls += 1
            tmp_url = urljoin(url, u)

            r = HTTP.get_url(tmp_url, use_cache=False)
            if r.status == "200":

                # Try to detect non-default error pages
                ratio = get_diff_ratio(r.raw_response, error_page)
                if ratio < 0.35:
                    urls_found += 1

            discard_data(r)

        # Avoid a division by zero if the wordlist was empty.
        if not total_urls:
            return False

        # If fewer than 85% of the known URLs responded, run one last test.
        if (urls_found / float(total_urls)) < 0.85:

            # As a last resort, check for the wp-admin login redirect.
            url_wp_admin = urljoin(url, "wp-admin/")

            try:
                p = HTTP.get_url(url_wp_admin,
                                 use_cache=False,
                                 allow_redirects=False)
                if p:
                    discard_data(p)
            except Exception:
                return False

            if p.status == "302" and "wp-login.php?redirect_to=" in p.headers.get(
                    "Location", ""):
                return True
            else:
                return False

        # More than 85% of the known URLs responded: assume WordPress is installed.
        return True
Example no. 7
    def __detect_wordpress_installation(self, url, wordpress_urls):
        """
        Try to detect a WordPress installation in the current path.

        :param url: URL where to look for the WordPress installation.
        :type url: str

        :param wordpress_urls: name of the wordlist containing WordPress URLs.
        :type wordpress_urls: str

        :return: True if a WordPress installation was found, False otherwise.
        :rtype: bool
        """
        Logger.log_more_verbose("Detecting Wordpress instalation in URI: '%s'." % url)
        total_urls = 0
        urls_found = 0

        error_page = get_error_page(url).raw_data

        for u in WordListLoader.get_wordlist(wordpress_urls):
            total_urls += 1
            tmp_url = urljoin(url, u)

            r = HTTP.get_url(tmp_url, use_cache=False)
            if r.status == "200":

                # Try to detect non-default error pages
                ratio = get_diff_ratio(r.raw_response, error_page)
                if ratio < 0.35:
                    urls_found += 1

            discard_data(r)

        # Avoid a division by zero if the wordlist was empty.
        if not total_urls:
            return False

        # If fewer than 85% of the known URLs responded, run one last test.
        if (urls_found / float(total_urls)) < 0.85:

            # As a last resort, check for the wp-admin login redirect.
            url_wp_admin = urljoin(url, "wp-admin/")

            try:
                p = HTTP.get_url(url_wp_admin, use_cache=False, allow_redirects=False)
                if p:
                    discard_data(p)
            except Exception:
                return False

            if p.status == "302" and "wp-login.php?redirect_to=" in p.headers.get("Location", ""):
                return True
            else:
                return False

        # More than 85% of the known URLs responded: assume WordPress is installed.
        return True
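The detection relies on a get_diff_ratio helper to compare each response body against the site's error page, counting a hit only when the two differ enough. A minimal sketch, assuming it returns a similarity ratio between 0.0 (completely different) and 1.0 (identical), for example via difflib:

import difflib

def get_diff_ratio(text_a, text_b):
    # Similarity ratio between two response bodies: 0.0 means completely
    # different, 1.0 means identical. In the examples above, ratio < 0.35
    # is taken to mean the page is not just another copy of the error page.
    return difflib.SequenceMatcher(None, text_a or "", text_b or "").ratio()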
Example no. 8
    def analyze_html(self, info):


        #----------------------------------------------------------------------
        # Get suspicious malware links.

        Logger.log_more_verbose("Processing HTML: %s" % info.identity)

        # Load the malware wordlist.
        wordlist_filename = Config.plugin_config["malware_sites"]
        try:
            wordlist = WordListLoader.get_advanced_wordlist_as_list(
                wordlist_filename)
        except WordlistNotFound:
            Logger.log_error("Wordlist '%s' not found.." % wordlist_filename)
            return
        except TypeError:
            Logger.log_error(
                "Wordlist '%s' is not a file." % wordlist_filename)
            return
        if not wordlist:
            Logger.log_error("Wordlist '%s' is empty." % wordlist_filename)

        Logger.log("1")

        # Get links
        base_urls = set()
        m_links = set()   # accumulate links found for every base URL
        for url in info.find_linked_data(Data.TYPE_RESOURCE,
                                         Resource.RESOURCE_URL):
            m_url = url.url
            base_urls.add(m_url)
            if info.information_type == Information.INFORMATION_HTML:
                m_links.update(extract_from_html(info.raw_data, m_url))
                m_links.update(extract_from_text(info.raw_data, m_url))
            elif info.information_type == Information.INFORMATION_PLAIN_TEXT:
                m_links.update(extract_from_text(info.raw_data, m_url))
            else:
                raise Exception("Internal error!")
        m_links.difference_update(base_urls)

        Logger.log("2")

        # If we have no links, abort now
        if not m_links:
            Logger.log_verbose("No output links found.")
            return

        # Do not follow URLs that contain certain keywords
        m_forbidden = WordListLoader.get_wordlist(
            Config.plugin_config["wordlist_no_spider"])
        m_urls_allowed = {
            url for url in m_links
            if url and not any(x in url for x in m_forbidden)
        }

        Logger.log("3")

        # Get only output links
        m_output_links = []
        for url in m_urls_allowed:
            try:
                if url not in Config.audit_scope:
                    m_output_links.append(url)
            except Exception:
                Logger.log_error_more_verbose(format_exc())
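analyze_html delegates link collection to extract_from_html and extract_from_text, which are not shown here. A very small, regex-based stand-in for the plain-text case, assuming only absolute http(s) URLs are of interest (the real helpers also resolve relative links against the base URL):

import re

_URL_RE = re.compile(r"https?://[^\s\"'<>]+", re.IGNORECASE)

def extract_links_from_text(text, base_url=None):
    # Hypothetical stand-in: collect absolute http(s) URLs from free text.
    # base_url is accepted for signature compatibility but ignored here.
    return set(_URL_RE.findall(text or ""))

# extract_links_from_text("See http://example.com/a and https://example.org/b")
# -> {'http://example.com/a', 'https://example.org/b'}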
Example no. 9
    def check_download(self, url, name, content_length, content_type):

        # Only accept content when the content type header is present.
        if not content_type:
            Logger.log_more_verbose(
                "Skipping URL, missing content type: %s" % url)
            return False

        # Is the content length present?
        if content_length is not None:

            # Check the file doesn't have 0 bytes.
            if content_length <= 0:
                Logger.log_more_verbose(
                    "Skipping URL, empty content: %s" % url)
                return False

            # Check the file is not too big.
            if content_type.strip().lower().startswith("text/"):
                if content_length > 100000:
                    Logger.log_more_verbose(
                        "Skipping URL, content too large (%d bytes): %s"
                        % (content_length, url))
                    return False
            else:
                if content_length > 5000000:
                    Logger.log_more_verbose(
                        "Skipping URL, content too large (%d bytes): %s"
                        % (content_length, url))
                    return False

            # Approved!
            return True

        # Content length is absent; the URL may point to a directory index.
        parsed_url = parse_url(url)
        if not parsed_url.filename:

            # Approved!
            return True

        # Extension absent.
        if not parsed_url.extension:

            # Approved!
            return True

        # Match against a known list of valid HTML extensions.
        # See: http://en.wikipedia.org/wiki/List_of_file_formats#Webpage
        if parsed_url.extension in (
                ".xml", ".html", ".htm", ".xhtml", ".xht",
                ".mht", ".mhtml", ".maff", ".asp", ".aspx", ".bml",
                ".cfm", ".cgi", ".ihtml", ".jsp", ".las", ".lasso",
                ".lassoapp", ".pl", ".php", ".php3", ".phtml",
                ".rna", ".r", ".rnx", ".shtml", ".stm", ".atom",
                ".eml", ".jsonld", ".metalink", ".met",
                ".rss", ".markdown"):

            # Approved!
            return True

        # Is the URL path in the blacklist?
        m_forbidden = list(WordListLoader.get_wordlist(
            Config.plugin_config["wordlist_no_spider"]))
        if any(x in url for x in m_forbidden):
            return False

        # Success!
        return True
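check_download consults parse_url(url).filename and .extension to decide whether a URL without a Content-Length header is still worth fetching. A minimal sketch of how those two fields can be derived with the standard library, assuming the real GoLismero parse_url behaves roughly like this for the path component:

import posixpath
try:
    from urllib.parse import urlsplit   # Python 3
except ImportError:
    from urlparse import urlsplit        # Python 2

def split_url_filename(url):
    # Return (filename, extension) for the URL path; both may be empty.
    path = urlsplit(url).path
    filename = posixpath.basename(path)
    extension = posixpath.splitext(filename)[1]
    return filename, extension

# split_url_filename("http://example.com/docs/")         -> ('', '')
# split_url_filename("http://example.com/index.php")     -> ('index.php', '.php')
# split_url_filename("http://example.com/download/file") -> ('file', '')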
Example no. 10
class Spider(TestingPlugin):
    """
    This plugin is a web spider.
    """

    #----------------------------------------------------------------------
    def get_accepted_info(self):
        return [Url]

    #----------------------------------------------------------------------
    def recv_info(self, info):

        m_return = []

        m_url = info.url
        m_depth = info.depth

        # Check depth
        if Config.audit_config.depth is not None and m_depth > Config.audit_config.depth:
            Logger.log_more_verbose("Spider depth level exceeded for URL: %s" %
                                    m_url)
            return m_return

        Logger.log_verbose("Spidering URL: %r" % m_url)

        # Check if we need to follow the first redirect.
        p = None
        try:
            allow_redirects = Config.audit_config.follow_redirects or \
                             (m_depth == 0 and Config.audit_config.follow_first_redirect)
            p = download(m_url,
                         self.check_download,
                         allow_redirects=allow_redirects)
        except NetworkException as e:
            Logger.log_more_verbose("Error while processing %r: %s" %
                                    (m_url, str(e)))
        if not p:
            return m_return

        # Send back the data
        m_return.append(p)

        # TODO: If it's a 301 response, get the Location header

        # Get links
        if p.information_type == Information.INFORMATION_HTML:
            m_links = extract_from_html(p.raw_data, m_url)
        else:
            m_links = extract_from_text(p.raw_data, m_url)
        try:
            m_links.remove(m_url)
        except Exception:
            pass

        # Do not follow URLs that contain certain keywords
        m_forbidden = WordListLoader.get_wordlist(
            Config.plugin_config["wordlist_no_spider"])
        m_urls_allowed = [
            url for url in m_links if not any(x in url for x in m_forbidden)
        ]
        m_urls_not_allowed = m_links.difference(m_urls_allowed)
        if m_urls_not_allowed:
            Logger.log_more_verbose("Skipped forbidden URLs:\n    %s" %
                                    "\n    ".join(sorted(m_urls_not_allowed)))

        # Do not follow URLs out of scope
        m_out_of_scope_count = len(m_urls_allowed)
        m_urls_allowed = [
            url for url in m_urls_allowed if url in Config.audit_scope
        ]
        m_out_of_scope_count -= len(m_urls_allowed)
        if m_out_of_scope_count:
            Logger.log_more_verbose("Skipped %d links out of scope." %
                                    m_out_of_scope_count)

        if m_urls_allowed:
            Logger.log_verbose("Found %d links in URL: %s" %
                               (len(m_urls_allowed), m_url))
        else:
            Logger.log_verbose("No links found in URL: %s" % m_url)

        # Convert to Url data type
        for u in m_urls_allowed:
            m_resource = Url(url=u, depth=m_depth + 1, referer=m_url)
            m_resource.add_resource(info)
            m_return.append(m_resource)

        # Send the results
        return m_return
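The allow_redirects expression used in recv_info (and in the other download calls in these examples) encodes a small policy: always follow redirects when the audit is configured to, otherwise follow them only for the very first request. Restated as a standalone helper for clarity (a sketch, not a GoLismero API):

def should_follow_redirects(follow_redirects, depth, follow_first_redirect):
    # Follow redirects when globally enabled, or for the initial request
    # (depth == 0) when only the first redirect may be followed.
    return follow_redirects or (depth == 0 and follow_first_redirect)

# should_follow_redirects(False, depth=0, follow_first_redirect=True) -> True
# should_follow_redirects(False, depth=2, follow_first_redirect=True) -> False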
Example no. 11
    def analyze_html(self, info):

        #----------------------------------------------------------------------
        # Get suspicious malware links.

        Logger.log_more_verbose("Processing HTML: %s" % info.identity)

        # Load the malware wordlist.
        wordlist_filename = Config.plugin_config["malware_sites"]
        try:
            wordlist = WordListLoader.get_advanced_wordlist_as_list(
                wordlist_filename)
        except WordlistNotFound:
            Logger.log_error("Wordlist '%s' not found.." % wordlist_filename)
            return
        except TypeError:
            Logger.log_error("Wordlist '%s' is not a file." %
                             wordlist_filename)
            return
        if not wordlist:
            Logger.log_error("Wordlist '%s' is empty." % wordlist_filename)

        Logger.log("1")

        # Get links
        base_urls = set()
        m_links = set()   # accumulate links found for every base URL
        for url in info.find_linked_data(Data.TYPE_RESOURCE,
                                         Resource.RESOURCE_URL):
            m_url = url.url
            base_urls.add(m_url)
            if info.information_type == Information.INFORMATION_HTML:
                m_links.update(extract_from_html(info.raw_data, m_url))
                m_links.update(extract_from_text(info.raw_data, m_url))
            elif info.information_type == Information.INFORMATION_PLAIN_TEXT:
                m_links.update(extract_from_text(info.raw_data, m_url))
            else:
                raise Exception("Internal error!")
        m_links.difference_update(base_urls)

        Logger.log("2")

        # If we have no links, abort now
        if not m_links:
            Logger.log_verbose("No output links found.")
            return

        # Do not follow URLs that contain certain keywords
        m_forbidden = WordListLoader.get_wordlist(
            Config.plugin_config["wordlist_no_spider"])
        m_urls_allowed = {
            url
            for url in m_links
            if url and not any(x in url for x in m_forbidden)
        }

        Logger.log("3")

        # Get only output links
        m_output_links = []
        for url in m_urls_allowed:
            try:
                if url not in Config.audit_scope:
                    m_output_links.append(url)
            except Exception:
                Logger.log_error_more_verbose(format_exc())
Example no. 12
class Spider(TestingPlugin):
    """
    This plugin is a web spider.
    """

    #--------------------------------------------------------------------------
    def get_accepted_types(self):
        return [URL]

    #--------------------------------------------------------------------------
    def run(self, info):

        m_return = []

        m_url = info.url

        Logger.log_verbose("Spidering URL: %s" % m_url)

        # Check if we need to follow the first redirect, then download the link.
        p = None
        try:
            allow_redirects = Config.audit_config.follow_redirects or \
                (info.depth == 0 and Config.audit_config.follow_first_redirect)
            p = download(m_url,
                         self.check_download,
                         allow_redirects=allow_redirects)
        except NetworkException as e:
            Logger.log_error_verbose("Error while processing %r: %s" %
                                     (m_url, str(e)))

        if not p:
            return m_return

        # Send back the data
        m_return.append(p)

        # TODO: If it's a 301 response, get the Location header

        # Get links
        m_forms = None
        if p.information_type == HTML.data_subtype:
            m_links = extract_from_html(p.raw_data, m_url)
            m_forms = extract_forms_from_html(p.raw_data, m_url)
            #m_links.update( extract_from_text(p.raw_data, m_url) )
        elif p.information_type == Text.data_subtype:
            m_links = extract_from_text(p.raw_data, m_url)
        else:
            return m_return
        try:
            m_links.remove(m_url)
        except Exception:
            pass

        # Do not follow URLs that contain certain keywords
        m_forbidden = list(WordListLoader.get_wordlist(
            Config.plugin_config["wordlist_no_spider"]))
        m_urls_allowed = [
            url for url in m_links if not any(x in url for x in m_forbidden)
        ]
        m_urls_not_allowed = m_links.difference(m_urls_allowed)
        if m_urls_not_allowed:
            Logger.log_more_verbose("Skipped forbidden URLs:\n    %s" %
                                    "\n    ".join(sorted(m_urls_not_allowed)))

        # Do not follow URLs out of scope
        m_urls_in_scope = []
        m_broken = []
        for url in m_urls_allowed:
            try:
                if url in Config.audit_scope:
                    m_urls_in_scope.append(url)
            except Exception:
                m_broken.append(url)
        if m_broken:
            if len(m_broken) == 1:
                Logger.log_more_verbose("Skipped uncrawlable URL: %s" %
                                        m_broken[0])
            else:
                Logger.log_more_verbose("Skipped uncrawlable URLs:\n    %s" %
                                        "\n    ".join(sorted(m_broken)))
        m_out_of_scope_count = len(m_urls_allowed) - len(
            m_urls_in_scope) - len(m_broken)
        if m_out_of_scope_count:
            Logger.log_more_verbose("Skipped %d links out of scope." %
                                    m_out_of_scope_count)

        if m_urls_in_scope:
            Logger.log_verbose("Found %d links in URL: %s" %
                               (len(m_urls_allowed), m_url))
        else:
            Logger.log_more_verbose("No links found in URL: %s" % m_url)

        # Convert to URL data type
        for u in m_urls_in_scope:
            try:
                p = parse_url(u)
                if p.scheme == "mailto":
                    m_resource = Email(p.netloc)
                elif p.scheme in ("http", "https"):
                    m_resource = URL(url=u, referer=m_url)
                else:
                    continue  # skip unsupported schemes
            except Exception:
                warn(format_exc(), RuntimeWarning)
                continue  # don't reuse a stale resource from a previous iteration
            m_resource.add_resource(info)
            m_return.append(m_resource)

        # Get forms info
        if m_forms:
            m_forms_allowed = [
                url for url in m_forms
                if not any(x in url[0] for x in m_forbidden)
            ]
            m_forms_not_allowed = {x[0]
                                   for x in m_forms
                                   }.difference(x[0] for x in m_forms_allowed)
        else:
            m_forms_allowed = []
            m_forms_not_allowed = set()

        if m_forms_not_allowed:
            Logger.log_more_verbose("Skipped forbidden forms:\n    %s" %
                                    "\n    ".join(sorted(m_forms_not_allowed)))

        # Do not follow forms out of scope
        m_forms_in_scope = []
        m_broken = []
        for url in m_forms_allowed:
            try:
                if url[0] in Config.audit_scope:
                    m_forms_in_scope.append(url)
            except Exception:
                m_broken.append(url[0])

        if m_broken:
            if len(m_broken) == 1:
                Logger.log_more_verbose("Skipped uncrawlable forms: %s" %
                                        m_broken[0])
            else:
                Logger.log_more_verbose("Skipped uncrawlable forms:\n    %s" %
                                        "\n    ".join(sorted(m_broken)))
        m_out_of_scope_count = len(m_forms_allowed) - len(
            m_forms_in_scope) - len(m_broken)
        if m_out_of_scope_count:
            Logger.log_more_verbose("Skipped %d forms out of scope." %
                                    m_out_of_scope_count)

        if m_forms_in_scope:
            Logger.log_verbose("Found %d forms in URL: %s" %
                               (len(m_forms_in_scope), m_url))
        else:
            Logger.log_more_verbose("No forms found in URL: %s" % m_url)

        # Convert to URL data type
        for u in m_forms_in_scope:
            try:
                url = u[0]
                method = u[1]
                params = {x["name"]: x["value"] for x in u[2]}

                m_resource = URL(url=url,
                                 referer=m_url,
                                 method=method,
                                 post_params=params)
            except Exception:
                warn(format_exc(), RuntimeWarning)
                continue  # don't reuse a stale resource from a previous iteration
            m_resource.add_resource(info)
            m_return.append(m_resource)

        # Send the results
        return m_return
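The form-handling code above assumes each entry returned by extract_forms_from_html is a (url, method, fields) tuple, where fields is a list of dicts with "name" and "value" keys. A hypothetical example of that shape, and of the conversion the loop performs:

# Hypothetical form entry, matching the shape the loop above expects.
form = (
    "http://www.example.com/login.php",    # u[0]: action URL
    "POST",                                # u[1]: HTTP method
    [                                      # u[2]: form fields
        {"name": "user", "value": ""},
        {"name": "pass", "value": ""},
    ],
)

url, method, fields = form
params = {x["name"]: x["value"] for x in fields}
# params -> {'user': '', 'pass': ''}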
Example no. 13
    def check_download(self, url, name, content_length, content_type):

        # Only accept content when the content type header is present.
        if not content_type:
            Logger.log_more_verbose("Skipping URL, missing content type: %s" %
                                    url)
            return False

        # Is the content length present?
        if content_length is not None:

            # Check the file doesn't have 0 bytes.
            if content_length <= 0:
                Logger.log_more_verbose("Skipping URL, empty content: %s" %
                                        url)
                return False

            # Check the file is not too big.
            if content_type.strip().lower().startswith("text/"):
                if content_length > 100000:
                    Logger.log_more_verbose(
                        "Skipping URL, content too large (%d bytes): %s" %
                        (content_length, url))
                    return False
            else:
                if content_length > 5000000:
                    Logger.log_more_verbose(
                        "Skipping URL, content too large (%d bytes): %s" %
                        (content_length, url))
                    return False

            # Approved!
            return True

        # Content length is absent; the URL may point to a directory index.
        parsed_url = parse_url(url)
        if not parsed_url.filename:

            # Approved!
            return True

        # Extension absent.
        if not parsed_url.extension:

            # Approved!
            return True

        # Match against a known list of valid HTML extensions.
        # See: http://en.wikipedia.org/wiki/List_of_file_formats#Webpage
        if parsed_url.extension in (".xml", ".html", ".htm", ".xhtml", ".xht",
                                    ".mht", ".mhtml", ".maff", ".asp", ".aspx",
                                    ".bml", ".cfm", ".cgi", ".ihtml", ".jsp",
                                    ".las", ".lasso", ".lassoapp", ".pl",
                                    ".php", ".php3", ".phtml", ".rna", ".r",
                                    ".rnx", ".shtml", ".stm", ".atom",
                                    ".eml", ".jsonld", ".metalink", ".met",
                                    ".rss", ".markdown"):

            # Approved!
            return True

        # Is the URL path in the blacklist?
        m_forbidden = list(WordListLoader.get_wordlist(
            Config.plugin_config["wordlist_no_spider"]))
        if any(x in url for x in m_forbidden):
            return False

        # Success!
        return True