def analyze_html(self, info):

    #----------------------------------------------------------------------
    # Get malware suspicious links.

    Logger.log_more_verbose("Processing HTML: %s" % info.identity)

    # Load the malware wordlist.
    wordlist_filename = Config.plugin_config["malware_sites"]
    try:
        wordlist = WordListLoader.get_advanced_wordlist_as_list(
            wordlist_filename)
    except WordlistNotFound:
        Logger.log_error("Wordlist '%s' not found." % wordlist_filename)
        return
    except TypeError:
        Logger.log_error("Wordlist '%s' is not a file." % wordlist_filename)
        return
    if not wordlist:
        Logger.log_error("Wordlist '%s' is empty." % wordlist_filename)
        return

    # Get the base URLs and extract the links from the document.
    base_urls = set()
    m_links = set()
    for url in info.find_linked_data(Data.TYPE_RESOURCE,
                                     Resource.RESOURCE_URL):
        m_url = url.url
        base_urls.add(m_url)
        if info.information_type == Information.INFORMATION_HTML:
            m_links.update(extract_from_html(info.raw_data, m_url))
            m_links.update(extract_from_text(info.raw_data, m_url))
        elif info.information_type == Information.INFORMATION_PLAIN_TEXT:
            m_links.update(extract_from_text(info.raw_data, m_url))
        else:
            raise Exception("Internal error!")
    m_links.difference_update(base_urls)

    # If we have no links, abort now.
    if not m_links:
        Logger.log_verbose("No output links found.")
        return

    # Do not follow URLs that contain certain keywords.
    m_forbidden = WordListLoader.get_wordlist(
        Config.plugin_config["wordlist_no_spider"])
    m_urls_allowed = {
        url for url in m_links
        if url and not any(x in url for x in m_forbidden)
    }

    # Keep only the output links (those out of the audit scope).
    m_output_links = []
    for url in m_urls_allowed:
        try:
            if url not in Config.audit_scope:
                m_output_links.append(url)
        except Exception:
            Logger.log_error_more_verbose(format_exc())
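#----------------------------------------------------------------------
# The out-of-scope links collected above are meant to be checked
# against the malware wordlist. Below is a minimal sketch of that
# matching step, assuming 'wordlist' is a plain list of known-bad
# hostname or URL substrings; 'find_suspicious_links' is a
# hypothetical helper, not part of the plugin.

def find_suspicious_links(output_links, wordlist):
    """Return the links that match any known malware site entry."""
    suspicious = []
    for url in output_links:
        for bad_site in wordlist:
            if bad_site and bad_site in url:
                suspicious.append((url, bad_site))
                break  # One match per URL is enough to flag it.
    return suspicious

# Example:
#   find_suspicious_links(["http://evil.example/x"], ["evil.example"])
#   -> [("http://evil.example/x", "evil.example")]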
def is_URL_in_windows(self, main_url):
    """
    Detect if the remote platform is Windows or \*NIX. To do this, take
    the first in-scope link and make two requests: one with the original
    URL and one with the URL converted to uppercase. If both responses
    are the same, the filesystem is case insensitive and the platform is
    most likely Windows; otherwise it is most likely \*NIX.

    :returns: True if the remote host is a Windows system, False if it
        is \*NIX, or None if unknown.
    :rtype: bool | None
    """
    m_forbidden = (
        "logout",
        "logoff",
        "exit",
        "sigout",
        "signout",
    )

    # Get the main web page.
    m_r = download(main_url, callback=self.check_download)
    if not m_r or not m_r.raw_data:
        return None
    discard_data(m_r)

    # Extract the links from the page.
    try:
        if m_r.information_type == Information.INFORMATION_HTML:
            m_links = extract_from_html(m_r.raw_data, main_url)
        else:
            m_links = extract_from_text(m_r.raw_data, main_url)
    except TypeError:
        Logger.log_error_more_verbose("Plugin error: %s" % format_exc())
        return None
    if not m_links:
        return None

    # Get the first link of the page that's in the scope of the audit.
    m_first_link = None
    for u in m_links:
        if u in Config.audit_scope and not any(x in u for x in m_forbidden):
            m_first_link = u
            break
    if not m_first_link:
        return None

    # Make two requests to that link: one with the original URL and
    # one with the uppercased URL.
    m_response_orig = HTTP.get_url(
        m_first_link, callback=self.check_response)  # FIXME handle exceptions!
    discard_data(m_response_orig)
    m_response_upper = HTTP.get_url(
        m_first_link.upper(), callback=self.check_response)  # FIXME handle exceptions!
    discard_data(m_response_upper)

    # Compare the responses.
    m_orig_data = m_response_orig.raw_response if m_response_orig else ""
    m_upper_data = m_response_upper.raw_response if m_response_upper else ""
    m_match_level = get_diff_ratio(m_orig_data, m_upper_data)

    # If the responses match by more than 95%, both URLs point to the
    # same resource => Windows; otherwise => *NIX.
    return m_match_level > 0.95
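#----------------------------------------------------------------------
# The comparison above relies on get_diff_ratio(). As a rough
# illustration, a similar helper could be built on difflib; this is a
# minimal sketch assuming the function returns a similarity ratio
# between 0.0 and 1.0, not the actual GoLismero implementation.

from difflib import SequenceMatcher

def diff_ratio_sketch(text_a, text_b):
    """Return a similarity ratio between 0.0 and 1.0 for two strings."""
    if not text_a and not text_b:
        return 1.0  # Two empty responses are trivially identical.
    return SequenceMatcher(None, text_a, text_b).ratio()

# Example: near-identical responses score close to 1.0, so they would
# be classified as Windows by the 0.95 threshold used above.
#   diff_ratio_sketch("<html>Hello</html>", "<html>Hello!</html>")  # ~0.97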
class Spider(TestingPlugin):
    """
    This plugin is a web spider.
    """

    #--------------------------------------------------------------------------
    def get_accepted_types(self):
        return [URL]

    #--------------------------------------------------------------------------
    def run(self, info):

        m_return = []

        m_url = info.url
        Logger.log_verbose("Spidering URL: %s" % m_url)

        # Check if we need to follow the first redirect, then download.
        p = None
        try:
            allow_redirects = Config.audit_config.follow_redirects or \
                (info.depth == 0 and Config.audit_config.follow_first_redirect)
            p = download(m_url, self.check_download,
                         allow_redirects=allow_redirects)
        except NetworkException, e:
            Logger.log_error_verbose("Error while processing %r: %s" %
                                     (m_url, str(e)))
        if not p:
            return m_return

        # Send back the data.
        m_return.append(p)

        # TODO: If it's a 301 response, get the Location header.

        # Get the links.
        m_forms = None
        if p.information_type == HTML.data_subtype:
            m_links = extract_from_html(p.raw_data, m_url)
            m_forms = extract_forms_from_html(p.raw_data, m_url)
            #m_links.update( extract_from_text(p.raw_data, m_url) )
        elif p.information_type == Text.data_subtype:
            m_links = extract_from_text(p.raw_data, m_url)
        else:
            return m_return
        try:
            m_links.remove(m_url)
        except Exception:
            pass

        # Do not follow URLs that contain certain keywords.
        m_forbidden = [
            x for x in WordListLoader.get_wordlist_as_list(
                Config.plugin_config["wordlist_no_spider"])
        ]
        m_urls_allowed = [
            url for url in m_links
            if not any(x in url for x in m_forbidden)
        ]
        m_urls_not_allowed = m_links.difference(m_urls_allowed)
        if m_urls_not_allowed:
            Logger.log_more_verbose("Skipped forbidden URLs:\n    %s" %
                                    "\n    ".join(sorted(m_urls_not_allowed)))

        # Do not follow URLs out of scope.
        m_urls_in_scope = []
        m_broken = []
        for url in m_urls_allowed:
            try:
                if url in Config.audit_scope:
                    m_urls_in_scope.append(url)
            except Exception:
                m_broken.append(url)
        if m_broken:
            if len(m_broken) == 1:
                Logger.log_more_verbose("Skipped uncrawlable URL: %s" %
                                        m_broken[0])
            else:
                Logger.log_more_verbose("Skipped uncrawlable URLs:\n    %s" %
                                        "\n    ".join(sorted(m_broken)))
        m_out_of_scope_count = len(m_urls_allowed) - len(m_urls_in_scope) \
                             - len(m_broken)
        if m_out_of_scope_count:
            Logger.log_more_verbose("Skipped %d links out of scope." %
                                    m_out_of_scope_count)

        if m_urls_in_scope:
            Logger.log_verbose("Found %d links in URL: %s" %
                               (len(m_urls_in_scope), m_url))
        else:
            Logger.log_more_verbose("No links found in URL: %s" % m_url)

        # Convert to the URL data type. Links with a "mailto" scheme
        # become Email data instead; other schemes are skipped.
        for u in m_urls_in_scope:
            try:
                parsed = parse_url(u)
                if parsed.scheme == "mailto":
                    m_resource = Email(parsed.netloc)
                elif parsed.scheme in ("http", "https"):
                    m_resource = URL(url=u, referer=m_url)
                else:
                    continue
            except Exception:
                warn(format_exc(), RuntimeWarning)
                continue
            m_resource.add_resource(info)
            m_return.append(m_resource)

        # Get the forms.
        if m_forms:
            m_forms_allowed = [
                form for form in m_forms
                if not any(x in form[0] for x in m_forbidden)
            ]
            m_forms_not_allowed = {x[0] for x in m_forms}.difference(
                x[0] for x in m_forms_allowed)
        else:
            m_forms_allowed = []
            m_forms_not_allowed = set()
        if m_forms_not_allowed:
            Logger.log_more_verbose("Skipped forbidden forms:\n    %s" %
                                    "\n    ".join(sorted(m_forms_not_allowed)))

        # Do not follow forms out of scope.
        m_forms_in_scope = []
        m_broken = []
        for form in m_forms_allowed:
            try:
                if form[0] in Config.audit_scope:
                    m_forms_in_scope.append(form)
            except Exception:
                m_broken.append(form[0])
        if m_broken:
            if len(m_broken) == 1:
                Logger.log_more_verbose("Skipped uncrawlable form: %s" %
                                        m_broken[0])
            else:
                Logger.log_more_verbose("Skipped uncrawlable forms:\n    %s" %
                                        "\n    ".join(sorted(m_broken)))
        m_out_of_scope_count = len(m_forms_allowed) - len(m_forms_in_scope) \
                             - len(m_broken)
        if m_out_of_scope_count:
            Logger.log_more_verbose("Skipped %d forms out of scope." %
                                    m_out_of_scope_count)

        if m_forms_in_scope:
            Logger.log_verbose("Found %d forms in URL: %s" %
                               (len(m_forms_in_scope), m_url))
        else:
            Logger.log_more_verbose("No forms found in URL: %s" % m_url)

        # Convert each form to the URL data type.
        for form in m_forms_in_scope:
            try:
                url = form[0]
                method = form[1]
                params = {x["name"]: x["value"] for x in form[2]}
                m_resource = URL(url=url, referer=m_url, method=method,
                                 post_params=params)
            except Exception:
                warn(format_exc(), RuntimeWarning)
                continue
            m_resource.add_resource(info)
            m_return.append(m_resource)

        # Send the results.
        return m_return
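#----------------------------------------------------------------------
# The form handling above implies that each item returned by
# extract_forms_from_html() is a (url, method, params) triple, where
# params is a list of {"name": ..., "value": ...} dicts. That shape is
# inferred from the code above, not from a documented API; the sketch
# below just shows the conversion step in isolation.

def form_to_post_params(form):
    """Flatten an extracted form triple into (url, method, params dict)."""
    url, method, fields = form
    params = {field["name"]: field["value"] for field in fields}
    return url, method, params

# Example:
#   form = ("http://www.example.com/login", "POST",
#           [{"name": "user", "value": ""}, {"name": "pass", "value": ""}])
#   form_to_post_params(form)
#   -> ("http://www.example.com/login", "POST", {"user": "", "pass": ""})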
class Spider(TestingPlugin):
    """
    This plugin is a web spider.
    """

    #----------------------------------------------------------------------
    def get_accepted_info(self):
        return [Url]

    #----------------------------------------------------------------------
    def recv_info(self, info):

        m_return = []

        m_url = info.url
        m_depth = info.depth

        # Check the depth.
        if Config.audit_config.depth is not None and \
                m_depth > Config.audit_config.depth:
            Logger.log_more_verbose(
                "Spider depth level exceeded for URL: %s" % m_url)
            return m_return

        Logger.log_verbose("Spidering URL: %r" % m_url)

        # Check if we need to follow the first redirect, then download.
        p = None
        try:
            allow_redirects = Config.audit_config.follow_redirects or \
                (m_depth == 0 and Config.audit_config.follow_first_redirect)
            p = download(m_url, self.check_download,
                         allow_redirects=allow_redirects)
        except NetworkException, e:
            Logger.log_more_verbose("Error while processing %r: %s" %
                                    (m_url, str(e)))
        if not p:
            return m_return

        # Send back the data.
        m_return.append(p)

        # TODO: If it's a 301 response, get the Location header.

        # Get the links.
        if p.information_type == Information.INFORMATION_HTML:
            m_links = extract_from_html(p.raw_data, m_url)
        else:
            m_links = extract_from_text(p.raw_data, m_url)
        try:
            m_links.remove(m_url)
        except Exception:
            pass

        # Do not follow URLs that contain certain keywords.
        m_forbidden = WordListLoader.get_wordlist(
            Config.plugin_config["wordlist_no_spider"])
        m_urls_allowed = [
            url for url in m_links
            if not any(x in url for x in m_forbidden)
        ]
        m_urls_not_allowed = m_links.difference(m_urls_allowed)
        if m_urls_not_allowed:
            Logger.log_more_verbose("Skipped forbidden URLs:\n    %s" %
                                    "\n    ".join(sorted(m_urls_not_allowed)))

        # Do not follow URLs out of scope.
        m_out_of_scope_count = len(m_urls_allowed)
        m_urls_allowed = [
            url for url in m_urls_allowed
            if url in Config.audit_scope
        ]
        m_out_of_scope_count -= len(m_urls_allowed)
        if m_out_of_scope_count:
            Logger.log_more_verbose("Skipped %d links out of scope." %
                                    m_out_of_scope_count)

        if m_urls_allowed:
            Logger.log_verbose("Found %d links in URL: %s" %
                               (len(m_urls_allowed), m_url))
        else:
            Logger.log_verbose("No links found in URL: %s" % m_url)

        # Convert to the Url data type, bumping the depth by one.
        for u in m_urls_allowed:
            m_resource = Url(url=u, depth=m_depth + 1, referer=m_url)
            m_resource.add_resource(info)
            m_return.append(m_resource)

        # Send the results.
        return m_return
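#----------------------------------------------------------------------
# The depth check above bounds the crawl: each new Url is created with
# depth = m_depth + 1, so spidering stops once the configured limit is
# exceeded. Below is a self-contained sketch of the same technique,
# with a hypothetical fetch_links() standing in for the download and
# link-extraction steps.

def crawl(url, fetch_links, max_depth, depth=0, seen=None):
    """Depth-limited crawl: follow links at most max_depth levels deep."""
    if seen is None:
        seen = set()
    if max_depth is not None and depth > max_depth:
        return seen  # Depth level exceeded; do not spider further.
    if url in seen:
        return seen  # Already visited, avoid loops.
    seen.add(url)
    for link in fetch_links(url):
        crawl(link, fetch_links, max_depth, depth + 1, seen)
    return seen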
class Spider(TestingPlugin):
    """
    This plugin is a web spider.
    """

    #----------------------------------------------------------------------
    def get_accepted_info(self):
        return [Url]

    #----------------------------------------------------------------------
    def recv_info(self, info):

        m_return = []

        m_url = info.url
        Logger.log_verbose("Spidering URL: %r" % m_url)

        # Check if we need to follow the first redirect, then download.
        p = None
        try:
            allow_redirects = Config.audit_config.follow_redirects or \
                (info.depth == 0 and Config.audit_config.follow_first_redirect)
            p = download(m_url, self.check_download,
                         allow_redirects=allow_redirects)
        except NetworkException, e:
            Logger.log_more_verbose("Error while processing %r: %s" %
                                    (m_url, str(e)))
        if not p:
            return m_return

        # Send back the data.
        m_return.append(p)

        # TODO: If it's a 301 response, get the Location header.

        # Get the links.
        if p.information_type == Information.INFORMATION_HTML:
            m_links = extract_from_html(p.raw_data, m_url)
        else:
            m_links = extract_from_text(p.raw_data, m_url)
        try:
            m_links.remove(m_url)
        except Exception:
            pass

        # Do not follow URLs that contain certain keywords.
        m_forbidden = WordListLoader.get_wordlist(
            Config.plugin_config["wordlist_no_spider"])
        m_urls_allowed = [
            url for url in m_links
            if not any(x in url for x in m_forbidden)
        ]
        m_urls_not_allowed = m_links.difference(m_urls_allowed)
        if m_urls_not_allowed:
            Logger.log_more_verbose("Skipped forbidden URLs:\n    %s" %
                                    "\n    ".join(sorted(m_urls_not_allowed)))

        # Do not follow URLs out of scope.
        m_urls_in_scope = []
        m_broken = []
        for url in m_urls_allowed:
            try:
                if url in Config.audit_scope:
                    m_urls_in_scope.append(url)
            except Exception:
                m_broken.append(url)
        if m_broken:
            if len(m_broken) == 1:
                Logger.log_more_verbose("Skipped uncrawlable URL: %s" %
                                        m_broken[0])
            else:
                Logger.log_more_verbose("Skipped uncrawlable URLs:\n    %s" %
                                        "\n    ".join(sorted(m_broken)))
        m_out_of_scope_count = len(m_urls_allowed) - len(m_urls_in_scope) \
                             - len(m_broken)
        if m_out_of_scope_count:
            Logger.log_more_verbose("Skipped %d links out of scope." %
                                    m_out_of_scope_count)

        if m_urls_in_scope:
            Logger.log_verbose("Found %d links in URL: %s" %
                               (len(m_urls_in_scope), m_url))
        else:
            Logger.log_verbose("No links found in URL: %s" % m_url)

        # Convert to the Url data type. Links with a "mailto" scheme
        # become Email data instead; other schemes are skipped.
        for u in m_urls_in_scope:
            try:
                parsed = parse_url(u)
                if parsed.scheme == "mailto":
                    m_resource = Email(parsed.netloc)
                elif parsed.scheme in ("http", "https"):
                    m_resource = Url(url=u, referer=m_url)
                else:
                    continue
            except Exception:
                warn(format_exc(), RuntimeWarning)
                continue
            m_resource.add_resource(info)
            m_return.append(m_resource)

        # Send the results.
        return m_return
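#----------------------------------------------------------------------
# The loop above dispatches each link by scheme: "mailto" links become
# Email data, "http"/"https" links become Url data, and anything else
# is dropped. The same dispatch can be shown with the standard
# library's urlparse (Python 2), using plain tuples instead of
# GoLismero data types.

from urlparse import urlparse

def classify_link(link):
    """Return ("email", address), ("url", link) or None for other schemes."""
    parts = urlparse(link)
    if parts.scheme == "mailto":
        # With the stdlib urlparse, a mailto: address lands in the path
        # component, unlike the netloc used by GoLismero's parse_url.
        return ("email", parts.path or parts.netloc)
    if parts.scheme in ("http", "https"):
        return ("url", link)
    return None  # Unsupported scheme, skip it.

# Example:
#   classify_link("mailto:admin@example.com") -> ("email", "admin@example.com")
#   classify_link("ftp://example.com/file")   -> None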