def get_page(self, url, timeout=TIMEOUT):
    try:
        page = tools.urlopen(url, timeout=timeout)
    except (six.moves.urllib.error.URLError, six.moves.http_client.HTTPException) as e:
        # a network problem? page unavailable? wrong URL?
        logging.warning("Error opening %s, terminating: %s", url, tools.error_to_str(e))
        return None
    return page
def detect(self, url, limit=None, exclude=None, timeout=TIMEOUT):
    logging.info("- %s", url)

    findings = []
    original_url = url

    if not self.expected_url(url, limit, exclude):
        return {}

    try:
        page = tools.urlopen(url, timeout=timeout)
        url = page.geturl()
    except (six.moves.urllib.error.URLError, six.moves.http_client.HTTPException) as e:
        # a network problem? page unavailable? wrong URL?
        logging.warning("Error opening %s, terminating: %s", url, tools.error_to_str(e))
        return {}

    if url != original_url:
        # the request was redirected; log the final URL and re-check it
        logging.info("` %s", url)
        if not self.expected_url(url, limit, exclude):
            return {}

    try:
        content = page.read()
    except (socket.timeout, six.moves.http_client.HTTPException, SSLError) as e:
        logging.info("Exception while reading %s, terminating: %s", url, tools.error_to_str(e))
        return {}

    if six.PY3:
        content = content.decode()

    findings += self.check_url(url)  # 'url'
    if page:
        findings += self.check_headers(page.info())  # 'headers'
    if content:
        findings += self.check_meta(content)  # 'meta'
        findings += self.check_script(content)  # 'script'
        findings += self.check_html(content)  # 'html'

    self.follow_implies(findings)  # 'implies'
    self.remove_duplicates(findings)
    self.remove_exclusions(findings)  # 'excludes'
    self.add_categories(findings)

    return {url: findings}
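
# Usage sketch (not part of the original source): assumes these methods live
# on a detector class, here hypothetically named Detection, and that TIMEOUT
# and the fingerprint data are already loaded at import time.
if __name__ == "__main__":
    import json

    detector = Detection()  # hypothetical class name for this sketch
    # detect() returns {final_url: [findings]} on success, or {} when the
    # page cannot be fetched or the URL fails the limit/exclude filters.
    results = detector.detect("https://example.com", timeout=5)
    print(json.dumps(results, indent=2))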