def _check_redirect_to_other_host(self, url):
    """Report an error when ``url`` redirects to a different host.

    Does nothing when ``self._redirect_url()`` yields a falsy value or
    when the redirect target has the same hostname as ``url``.

    Args:
        url: The URL whose redirect target is being checked.
    """
    target = self._redirect_url()
    if not target:
        return

    source_host = url_utils.hostname_from_url(url)
    target_host = url_utils.hostname_from_url(target)
    if source_host != target_host:
        # NOTE(review): exit_with_message presumably terminates the
        # program; confirm against error_utils.
        message = "%s redirects to other host %s" % (url, target)
        error_utils.exit_with_message(
            message + "\nPlease provide non-redirecting URL")
def _analyze(self, link):
    """Classify ``link`` as internal or external relative to ``self.base_url``.

    When ``self.base_href`` is set, the link is first resolved against it.
    A link with no hostname, or with the same hostname as ``self.base_url``,
    is internal; anything else is external.

    Args:
        link: The link to classify.

    Returns:
        A ``(kind, url)`` tuple where ``kind`` is ``"internal"`` or
        ``"external"``. Internal links are made absolute against
        ``self.base_url`` and then passed through
        ``url_utils.relative_url``; external links are returned as resolved.
    """
    own_host = url_utils.hostname_from_url(self.base_url)

    resolved = link
    if self.base_href:
        resolved = url_utils.make_absolute_url(self.base_href, resolved)

    host = url_utils.hostname_from_url(resolved)
    if host and host != own_host:
        return "external", resolved

    absolute = url_utils.make_absolute_url(self.base_url, resolved)
    return "internal", url_utils.relative_url(absolute)
def _analyze(self, link):
    """Classify ``link`` as internal or external relative to ``self.base_url``.

    The link is passed through ``url_utils.prepend_missing_scheme`` and,
    when ``self.base_href_tag`` is set, resolved against it. A link with
    no hostname, or with the same hostname as ``self.base_url``, is
    internal; anything else is external.

    Args:
        link: The link to classify.

    Returns:
        A ``(kind, url)`` tuple where ``kind`` is ``"internal"`` or
        ``"external"``. Internal links are made absolute against
        ``self.base_url`` and then passed through
        ``url_utils.relative_url``; external links are returned as resolved.
    """
    # NOTE(review): presumably adds the base URL's scheme to scheme-less
    # links; confirm against url_utils.
    resolved = url_utils.prepend_missing_scheme(link, self.base_url)
    if self.base_href_tag:
        resolved = url_utils.make_absolute_url(self.base_href_tag, resolved)

    host = url_utils.hostname_from_url(resolved)
    own_host = url_utils.hostname_from_url(self.base_url)
    if host and host != own_host:
        return "external", resolved

    absolute = url_utils.make_absolute_url(self.base_url, resolved)
    return "internal", url_utils.relative_url(absolute)
def __init__(self, page_start, sqlite_file, pages_list_file, config_to_save):
    """Store the settings and derive the start page's hostname.

    Args:
        page_start: URL of the start page; its hostname is stored as
            ``self.page_host``.
        sqlite_file: Stored as ``self.sqlite_file``.
        pages_list_file: When truthy, parsed with
            ``self._parse_pages_list_file``; otherwise ``self.pages_list``
            is ``False``.
        config_to_save: Stored as ``self.config_to_save``.
    """
    self.page_start = page_start
    self.page_host = url_utils.hostname_from_url(page_start)
    self.sqlite_file = sqlite_file

    # Parse only after the attributes above exist, in case the parser
    # reads them.
    if pages_list_file:
        self.pages_list = self._parse_pages_list_file(pages_list_file)
    else:
        self.pages_list = False

    self.config_to_save = config_to_save
    # No connection is opened here.
    self.conn = None
def _get_resource_id(self, url, is_truncated):
    """Return the ``devtools_resource`` row id for ``url``, inserting if absent.

    The URL is stored relative when ``url_utils.internal_relative_url``
    returns a value for it (``is_external = 0``). When that helper returns
    ``False`` the URL is stored unchanged with ``is_external = 1``.

    Args:
        url: Resource URL to look up.
        is_truncated: Value matched against / stored in ``is_truncated``.

    Returns:
        The id of the existing matching row, or ``lastrowid`` of the
        newly inserted row.
    """
    host = url_utils.hostname_from_url(self.url)
    internal_path = url_utils.internal_relative_url(url, host)

    # The same (url, is_truncated, is_external) triple drives both the
    # lookup and the insert.
    if internal_path is False:
        row_key = (url, is_truncated, 1)
    else:
        row_key = (internal_path, is_truncated, 0)

    cursor = self.conn.cursor()
    cursor.execute(
        "SELECT id FROM devtools_resource WHERE url = ? AND is_truncated = ? AND is_external = ?",
        row_key)
    existing = cursor.fetchone()
    if existing:
        return existing[0]

    # NOTE(review): nothing is committed here; presumably the caller
    # commits.
    cursor.execute(
        "INSERT INTO devtools_resource (url, is_truncated, is_external) VALUES (?, ?, ?)",
        row_key)
    return cursor.lastrowid
def __init__(self):
    """Initialise from ``config``: start-page host and pages list.

    ``self.page_host`` is the hostname of ``config.start_url``,
    ``self.pages_list`` is whatever ``self._parse_pages_list_file()``
    returns, and ``self.conn`` starts as ``None``.
    """
    start_host = url_utils.hostname_from_url(config.start_url)
    self.page_host = start_host
    # Parse after page_host is set, in case the parser reads it.
    self.pages_list = self._parse_pages_list_file()
    # No connection is opened here.
    self.conn = None