def _build_multi_hosts(self, options, start_urls):
    hosts = {}

    # Hosts listed in options.accepted_hosts are shared by every start URL.
    extra_hosts = set()
    if options.accepted_hosts:
        for url in options.accepted_hosts.split(','):
            split_result = get_clean_url_split(url)
            extra_hosts.add(split_result.netloc)

    # Each start URL maps its host to the set of accepted hosts: the shared
    # extra hosts plus the start host itself.
    for start_url in start_urls:
        split_result = get_clean_url_split(start_url)
        host = split_result.netloc
        # Wrap host in a list so union adds the host itself rather than its
        # individual characters.
        hosts[host] = extra_hosts.union([host])

    return hosts
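# Illustrative sketch (not part of the original code): the shape of the
# mapping returned by _build_multi_hosts. The options object and crawler
# instance below are hypothetical stand-ins used only for illustration.
#
#     options.accepted_hosts = "cdn.example.com"
#     start_urls = ["http://www.a.com/", "http://www.b.com/"]
#     hosts = crawler._build_multi_hosts(options, start_urls)
#     # hosts == {
#     #     "www.a.com": {"cdn.example.com", "www.a.com"},
#     #     "www.b.com": {"cdn.example.com", "www.b.com"},
#     # }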
def get_links(self, html_soup, original_url_split):
    """Gets links for desired types (e.g., a, link, img, script)

    :param html_soup: The page parsed by BeautifulSoup
    :param original_url_split: The URL of the page used to resolve relative
        links.
    :rtype: A sequence of Link objects
    """

    # The optional <base> tag overrides the URL against which relative
    # links on the page are resolved.
    base_url_split = original_url_split
    bases = html_soup.find_all('base')
    if bases:
        base = bases[0]
        if 'href' in base.attrs:
            base_url_split = get_clean_url_split(base['href'])

    links = []
    for element_type in self.worker_config.types:
        if element_type not in TYPE_ATTRIBUTES:
            raise Exception(
                "Unknown element type: {0}".format(element_type))
        attribute = TYPE_ATTRIBUTES[element_type]
        element_links = html_soup.find_all(element_type)
        links.extend(self._get_links(
            element_links, attribute, base_url_split, original_url_split))

    return links
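# Illustrative, standalone sketch (not part of the crawler): shows how a
# <base href="..."> tag changes the URL against which relative links are
# resolved, using BeautifulSoup together with the get_clean_url_split /
# get_absolute_url_split helpers exercised in the tests below. The import
# path pylinkvalidator.urlutil is an assumption.
from bs4 import BeautifulSoup

from pylinkvalidator.urlutil import (
    get_absolute_url_split, get_clean_url_split)

html = (
    '<html><head><base href="http://cdn.example.com/assets/"></head>'
    '<body><img src="logo.png"></body></html>')
soup = BeautifulSoup(html, "html.parser")

page_url_split = get_clean_url_split("http://www.example.com/index.html")
base_url_split = page_url_split
bases = soup.find_all("base")
if bases and "href" in bases[0].attrs:
    base_url_split = get_clean_url_split(bases[0]["href"])

# The relative src resolves against the <base> URL, not the page URL.
img_src = soup.find_all("img")[0]["src"]
print(get_absolute_url_split(img_src, base_url_split).geturl())
# expected: http://cdn.example.com/assets/logo.png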
def _get_prefix_content(self, content, prefix=None):
    if not prefix:
        # The value is of the form "url,text": everything before the first
        # comma is the URL prefix, the rest is the content to check.
        index = content.find(",")
        prefix = get_clean_url_split(content[:index])
        content = content[index + 1:]

    return (prefix, content)
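# Illustrative sketch (hypothetical call, assuming `checker` is an instance
# of the class defining _get_prefix_content): a "url,text" entry is split on
# the first comma into a cleaned URL prefix and the text to look for.
#
#     prefix, text = checker._get_prefix_content(
#         "http://www.example.com/blog,hello world")
#     # prefix.geturl() == "http://www.example.com/blog"
#     # text == "hello world"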
def _build_single_hosts(self, options, start_urls):
    hosts = set()
    urls = []

    if options.accepted_hosts:
        urls = options.accepted_hosts.split(',')
    urls = urls + start_urls

    for url in urls:
        split_result = get_clean_url_split(url)
        hosts.add(split_result.netloc)

    return hosts
def test_get_absolute_url(self):
    base_url_split = get_clean_url_split(
        "https://www.example.com/hello/index.html")

    self.assertEqual(
        "https://www.example2.com/test.js",
        get_absolute_url_split(
            "//www.example2.com/test.js", base_url_split).geturl())

    self.assertEqual(
        "https://www.example.com/hello2/test.html",
        get_absolute_url_split(
            "/hello2/test.html", base_url_split).geturl())

    self.assertEqual(
        "https://www.example.com/hello/test.html",
        get_absolute_url_split("test.html", base_url_split).geturl())

    self.assertEqual(
        "https://www.example.com/test.html",
        get_absolute_url_split("../test.html", base_url_split).geturl())
def get_page_crawler(self, url):
    url = self.get_url(url)
    url_split = get_clean_url_split(url)
    input_queue = compat.Queue.Queue()
    output_queue = compat.Queue.Queue()

    worker_config = WorkerConfig(
        username=None, password=None,
        types=['a', 'img', 'link', 'script'], timeout=5,
        parser=PARSER_STDLIB, strict_mode=False,
        prefer_server_encoding=False, extra_headers=[])

    worker_init = WorkerInit(
        worker_config=worker_config, input_queue=input_queue,
        output_queue=output_queue, logger=get_logger())

    page_crawler = PageCrawler(worker_init)

    return page_crawler, url_split
def test_clean_url_split(self):
    self.assertEqual(
        "http://www.example.com",
        get_clean_url_split("www.example.com").geturl())
    self.assertEqual(
        "http://www.example.com",
        get_clean_url_split("//www.example.com").geturl())
    self.assertEqual(
        "http://www.example.com",
        get_clean_url_split("http://www.example.com").geturl())
    self.assertEqual(
        "http://www.example.com/",
        get_clean_url_split("www.example.com/").geturl())
    self.assertEqual(
        "http://www.example.com/",
        get_clean_url_split("//www.example.com/").geturl())
    self.assertEqual(
        "http://www.example.com/",
        get_clean_url_split("http://www.example.com/").geturl())
    self.assertEqual(
        "http://www.example.com/media%20gallery",
        get_clean_url_split(
            "http://www.example.com/media gallery").geturl())
def _crawl_page(self, worker_input):
    page_crawl = None
    erroneous_content = []
    missing_content = []
    url_split_to_crawl = worker_input.url_split

    try:
        response = open_url(
            self.urlopen, self.request_class,
            url_split_to_crawl.geturl(), self.worker_config.timeout,
            self.timeout_exception, self.auth_header,
            extra_headers=self.worker_config.extra_headers,
            logger=self.logger)

        if response.exception:
            if response.status:
                # This is a http error. Good.
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=response.status,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=None, is_html=False,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=None,
                    site_origin=worker_input.site_origin)
            elif response.is_timeout:
                # This is a timeout. No need to wrap the exception
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=True, is_redirect=False, links=[],
                    exception=None, is_html=False,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=0,
                    site_origin=worker_input.site_origin)
            else:
                # Something bad happened when opening the url
                exception = ExceptionStr(
                    unicode(type(response.exception)),
                    unicode(response.exception))
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=exception, is_html=False,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=0,
                    site_origin=worker_input.site_origin)
        else:
            final_url_split = get_clean_url_split(response.final_url)

            message = response.content.info()
            mime_type = get_content_type(message)
            if self.worker_config.prefer_server_encoding:
                charset = get_charset(message)
            else:
                charset = None
            links = []

            is_html = mime_type == HTML_MIME_TYPE
            process_time = None

            if is_html and worker_input.should_crawl:
                start = time.time()
                html_soup = BeautifulSoup(
                    response.content, self.worker_config.parser,
                    from_encoding=charset)
                links = self.get_links(html_soup, final_url_split)
                if self._has_content_to_check(worker_input):
                    (missing_content, erroneous_content) =\
                        self.check_content(
                            unicode(html_soup), html_soup,
                            url_split_to_crawl, final_url_split,
                            worker_input.content_check)
                process_time = time.time() - start
            else:
                self.logger.debug(
                    "Won't crawl %s. MIME Type: %s. Should crawl: %s",
                    final_url_split, mime_type,
                    worker_input.should_crawl)
                if self._has_content_to_check(worker_input):
                    text_content = self.get_text_content(
                        response.content.read(), charset)
                    (missing_content, erroneous_content) =\
                        self.check_content(
                            text_content, None, url_split_to_crawl,
                            final_url_split, worker_input.content_check)

            page_crawl = PageCrawl(
                original_url_split=url_split_to_crawl,
                final_url_split=final_url_split,
                status=response.status, is_timeout=False,
                is_redirect=response.is_redirect, links=links,
                exception=None, is_html=is_html,
                depth=worker_input.depth,
                response_time=response.response_time,
                process_time=process_time,
                site_origin=worker_input.site_origin,
                missing_content=missing_content,
                erroneous_content=erroneous_content)
    except Exception as exc:
        exception = ExceptionStr(unicode(type(exc)), unicode(exc))
        page_crawl = PageCrawl(
            original_url_split=url_split_to_crawl,
            final_url_split=None, status=None,
            is_timeout=False, is_redirect=False, links=[],
            exception=exception, is_html=False,
            depth=worker_input.depth, response_time=None,
            process_time=None,
            site_origin=worker_input.site_origin)
        self.logger.exception("Exception occurred while crawling a page.")

    return page_crawl
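# Illustrative helper (not part of the original worker): a minimal sketch of
# how a consumer of the output queue might classify a PageCrawl result using
# only the fields populated by _crawl_page above. The function name is
# hypothetical.
def describe_page_crawl(page_crawl):
    """Return a short human-readable summary of a PageCrawl result."""
    url = page_crawl.original_url_split.geturl()
    if page_crawl.is_timeout:
        return "{0}: timeout".format(url)
    if page_crawl.exception:
        return "{0}: error ({1})".format(url, page_crawl.exception)
    if page_crawl.status and page_crawl.status >= 400:
        return "{0}: broken (HTTP {1})".format(url, page_crawl.status)
    return "{0}: ok, {1} links found".format(url, len(page_crawl.links))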
def _process_start_urls(self):
    for start_url in self.start_urls:
        self.start_url_splits.append(get_clean_url_split(start_url))