Esempio n. 1
0
 def get_page(self, url, timeout=TIMEOUT):
     """Open ``url`` and hand back the resulting page object.

     :param url: address to open
     :param timeout: forwarded unchanged to ``tools.urlopen``
     :return: whatever ``tools.urlopen`` returns, or ``None`` when opening
         fails with a ``URLError`` or an ``HTTPException``
     """
     try:
         return tools.urlopen(url, timeout=timeout)
     except (six.moves.urllib.error.URLError, six.moves.http_client.HTTPException) as err:
         # Possible causes: network trouble, an unavailable page, a bad URL.
         logging.warning("Error opening %s, terminating: %s", url, tools.error_to_str(err))
     return None
Esempio n. 2
0
 def get_page(self, url, timeout=TIMEOUT):
     """Try to open ``url``; return the opened page or ``None`` on failure.

     :param url: address to open
     :param timeout: passed straight through to ``tools.urlopen``
     :return: the object produced by ``tools.urlopen``; ``None`` if a
         ``URLError`` or ``HTTPException`` was raised (a warning is logged)
     """
     try:
         response = tools.urlopen(url, timeout=timeout)
     except (six.moves.urllib.error.URLError, six.moves.http_client.HTTPException) as exc:
         # Network failure, page not available, or a malformed address.
         reason = tools.error_to_str(exc)
         logging.warning("Error opening %s, terminating: %s", url, reason)
         return None
     else:
         return response
Esempio n. 3
0
    def detect(self, url, limit=None, exclude=None, timeout=TIMEOUT):
        """Fetch ``url`` and collect the findings of every check for it.

        The URL is first validated with ``self.expected_url``. The page is
        then opened; if the address reported by ``page.geturl()`` differs from
        the requested one, it is validated again. Finally the URL, the
        response headers and the body are passed to the ``check_*`` methods
        and the combined findings are post-processed.

        :param url: address to analyse
        :param limit: forwarded to ``self.expected_url``
        :param exclude: forwarded to ``self.expected_url``
        :param timeout: forwarded to ``tools.urlopen``
        :return: ``{final_url: findings}`` on success; ``{}`` when the URL
            (before or after a redirect) is rejected by ``expected_url`` or
            the page cannot be opened or read
        """
        logging.info("- %s", url)

        findings = []
        original_url = url

        if not self.expected_url(url, limit, exclude):
            return {}

        try:
            page = tools.urlopen(url, timeout=timeout)
            url = page.geturl()
        except (six.moves.urllib.error.URLError, six.moves.http_client.HTTPException) as e:
            # a network problem? page unavailable? wrong URL?
            logging.warning("Error opening %s, terminating: %s", url, tools.error_to_str(e))
            return {}

        if url != original_url:
            # The address changed while opening the page: log the new one and
            # make sure it is still one we are expected to analyse.
            logging.info("` %s", url)

            if not self.expected_url(url, limit, exclude):
                return {}

        try:
            content = page.read()
        except (socket.timeout, six.moves.http_client.HTTPException, SSLError) as e:
            logging.info("Exception while reading %s, terminating: %s", url, tools.error_to_str(e))
            return {}

        if six.PY3:
            # Bodies are not guaranteed to be valid UTF-8. Replace undecodable
            # bytes instead of letting UnicodeDecodeError abort the detection;
            # the checks below can still work on the decodable part.
            content = content.decode("utf-8", errors="replace")

        findings += self.check_url(url)  # 'url'
        if page:
            findings += self.check_headers(page.info())  # 'headers'
        if content:
            findings += self.check_meta(content)  # 'meta'
            findings += self.check_script(content)  # 'script'
            findings += self.check_html(content)  # 'html'

        self.follow_implies(findings)  # 'implies'
        self.remove_duplicates(findings)
        self.remove_exclusions(findings)  # 'excludes'
        self.add_categories(findings)

        return {url: findings}