def get_links(self, html_soup, original_url_split):
    """Get Link for desired types (e.g., a, link, img, script)

    :param html_soup: The page parsed by BeautifulSoup
    :param original_url_split: The URL of the page used to resolve
        relative links.
    :rtype: A sequence of Link objects
    """
    # The <base> tag, if present, overrides the URL against which
    # relative links on the page are resolved.
    base_url_split = original_url_split
    base_elements = html_soup.find_all('base')
    if base_elements and 'href' in base_elements[0].attrs:
        base_url_split = get_clean_url_split(base_elements[0]['href'])

    collected_links = []
    for tag_name in self.worker_config.types:
        try:
            attribute = TYPE_ATTRIBUTES[tag_name]
        except KeyError:
            raise Exception("Unknown element type: {0}".format(tag_name))
        elements = html_soup.find_all(tag_name)
        collected_links.extend(self._get_links(
            elements, attribute, base_url_split, original_url_split))

    return collected_links
def __init__(self, config, logger):
    """Set up the crawler: clean the start URLs, create the work queues
    and the Site that tracks crawl state.

    :param config: parsed configuration; ``config.start_urls`` is the
        list of URLs to begin crawling from.
    :param logger: logger shared with the Site.
    """
    self.config = config
    # Normalize every start URL up front so the rest of the crawler
    # only ever deals with clean URL splits.
    self.start_url_splits = [
        get_clean_url_split(url) for url in config.start_urls]
    self.workers = []
    self.input_queue = self.build_queue(config)
    self.output_queue = self.build_queue(config)
    self.logger = logger
    self.site = Site(self.start_url_splits, config, self.logger)
def _build_accepted_hosts(self, options, start_urls):
    """Return the set of hosts (netlocs) considered part of the site.

    The hosts of all start URLs are always accepted; additional hosts
    can be supplied as a comma-separated string in
    ``options.accepted_hosts``.

    :param options: parsed options object; ``options.accepted_hosts``
        may be a comma-separated string of extra hosts (or falsy).
    :param start_urls: sequence of start URL strings.
    :rtype: set of netloc strings
    """
    urls = []
    # BUG FIX: the original read self.options here, silently ignoring
    # the options parameter it was given. Use the parameter instead.
    if options.accepted_hosts:
        urls = options.accepted_hosts.split(',')
    urls = urls + start_urls
    # Normalize each URL so that bare hosts ("example.com") and full
    # URLs both yield a comparable netloc.
    return {get_clean_url_split(url).netloc for url in urls}
def test_get_absolute_url(self):
    """Protocol-relative, absolute-path, relative and parent-relative
    URLs all resolve correctly against the base URL."""
    base_url_split = get_clean_url_split(
        "https://www.example.com/hello/index.html")
    cases = [
        ("//www.example2.com/test.js",
         "https://www.example2.com/test.js"),
        ("/hello2/test.html",
         "https://www.example.com/hello2/test.html"),
        ("test.html",
         "https://www.example.com/hello/test.html"),
        ("../test.html",
         "https://www.example.com/test.html"),
    ]
    for url, expected in cases:
        self.assertEqual(
            expected,
            get_absolute_url_split(url, base_url_split).geturl())
def get_page_crawler(self, url):
    """Build a PageCrawler wired to fresh in-memory queues for a test URL.

    :param url: path or URL fragment resolved by self.get_url.
    :rtype: (PageCrawler, clean url split) tuple
    """
    full_url = self.get_url(url)
    url_split = get_clean_url_split(full_url)

    # Anonymous access, all default link-bearing tag types, stdlib parser.
    worker_config = WorkerConfig(
        username=None, password=None,
        types=['a', 'img', 'link', 'script'],
        timeout=5, parser=PARSER_STDLIB)

    worker_init = WorkerInit(
        worker_config=worker_config,
        input_queue=compat.Queue.Queue(),
        output_queue=compat.Queue.Queue(),
        logger=get_logger())

    return PageCrawler(worker_init), url_split
def get_page_crawler(self, url):
    """Build a non-strict PageCrawler wired to fresh in-memory queues
    for a test URL.

    :param url: path or URL fragment resolved by self.get_url.
    :rtype: (PageCrawler, clean url split) tuple
    """
    full_url = self.get_url(url)
    url_split = get_clean_url_split(full_url)

    # Anonymous access, all default link-bearing tag types, stdlib
    # parser, lenient (non-strict) mode.
    worker_config = WorkerConfig(
        username=None, password=None,
        types=['a', 'img', 'link', 'script'],
        timeout=5, parser=PARSER_STDLIB,
        strict_mode=False)

    worker_init = WorkerInit(
        worker_config=worker_config,
        input_queue=compat.Queue.Queue(),
        output_queue=compat.Queue.Queue(),
        logger=get_logger())

    return PageCrawler(worker_init), url_split
def test_clean_url_split(self):
    """A missing scheme is defaulted to http and trailing slashes are
    preserved, for bare, protocol-relative and full URLs alike."""
    cases = [
        ("www.example.com", "http://www.example.com"),
        ("//www.example.com", "http://www.example.com"),
        ("http://www.example.com", "http://www.example.com"),
        ("www.example.com/", "http://www.example.com/"),
        ("//www.example.com/", "http://www.example.com/"),
        ("http://www.example.com/", "http://www.example.com/"),
    ]
    for url, expected in cases:
        self.assertEqual(expected, get_clean_url_split(url).geturl())
def _crawl_page(self, worker_input):
    """Fetch one URL and classify the outcome as a PageCrawl.

    Never raises: HTTP errors, timeouts and unexpected exceptions are
    all captured inside the returned PageCrawl record.

    :param worker_input: carries ``url_split`` (the URL to fetch) and
        ``should_crawl`` (whether to extract links from an HTML page).
    :rtype: PageCrawl
    """
    page_crawl = None
    url_split_to_crawl = worker_input.url_split

    try:
        response = open_url(
            self.urlopen, self.request_class,
            url_split_to_crawl.geturl(), self.worker_config.timeout,
            self.timeout_exception, self.worker_config.user_agent,
            self.auth_header)

        if response.exception:
            if response.status:
                # This is a http error. Good.
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=response.status,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=None, is_html=False)
            elif response.is_timeout:
                # This is a timeout. No need to wrap the exception
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=True, is_redirect=False, links=[],
                    exception=None, is_html=False)
            else:
                # Something bad happened when opening the url
                # (unicode() here implies a Python 2 codebase).
                exception = ExceptionStr(
                    unicode(type(response.exception)),
                    unicode(response.exception))
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=exception, is_html=False)
        else:
            # Successful fetch: the final URL may differ from the
            # requested one after redirects.
            final_url_split = get_clean_url_split(response.final_url)
            mime_type = get_content_type(response.content.info())
            links = []
            is_html = mime_type == HTML_MIME_TYPE

            # Only parse for links when the content is HTML and the
            # scheduler asked for this page to be crawled.
            if is_html and worker_input.should_crawl:
                html_soup = BeautifulSoup(
                    response.content, self.worker_config.parser)
                links = self.get_links(html_soup, final_url_split)
            else:
                self.logger.debug(
                    "Won't crawl %s. MIME Type: %s. Should crawl: %s",
                    final_url_split, mime_type,
                    worker_input.should_crawl)

            page_crawl = PageCrawl(
                original_url_split=url_split_to_crawl,
                final_url_split=final_url_split,
                status=response.status, is_timeout=False,
                is_redirect=response.is_redirect, links=links,
                exception=None, is_html=is_html)
    except Exception as exc:
        # Catch-all boundary: wrap any unexpected error so the worker
        # loop keeps running; the traceback is logged below.
        exception = ExceptionStr(unicode(type(exc)), unicode(exc))
        page_crawl = PageCrawl(
            original_url_split=url_split_to_crawl,
            final_url_split=None, status=None,
            is_timeout=False, is_redirect=False, links=[],
            exception=exception, is_html=False)
        self.logger.exception(
            "Exception occurred while crawling a page.")

    return page_crawl
def _crawl_page(self, worker_input):
    """Fetch one URL and classify the outcome as a PageCrawl.

    Variant that does not pass a user agent to open_url. Never raises:
    HTTP errors, timeouts and unexpected exceptions are all captured
    inside the returned PageCrawl record.

    :param worker_input: carries ``url_split`` (the URL to fetch) and
        ``should_crawl`` (whether to extract links from an HTML page).
    :rtype: PageCrawl
    """
    page_crawl = None
    url_split_to_crawl = worker_input.url_split

    try:
        response = open_url(
            self.urlopen, self.request_class,
            url_split_to_crawl.geturl(), self.worker_config.timeout,
            self.timeout_exception, self.auth_header)

        if response.exception:
            if response.status:
                # This is a http error. Good.
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=response.status,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=None, is_html=False)
            elif response.is_timeout:
                # This is a timeout. No need to wrap the exception
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=True, is_redirect=False, links=[],
                    exception=None, is_html=False)
            else:
                # Something bad happened when opening the url
                # (unicode() here implies a Python 2 codebase).
                exception = ExceptionStr(
                    unicode(type(response.exception)),
                    unicode(response.exception))
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=exception, is_html=False)
        else:
            # Successful fetch: the final URL may differ from the
            # requested one after redirects.
            final_url_split = get_clean_url_split(response.final_url)
            mime_type = get_content_type(response.content.info())
            links = []
            is_html = mime_type == HTML_MIME_TYPE

            # Only parse for links when the content is HTML and the
            # scheduler asked for this page to be crawled.
            if is_html and worker_input.should_crawl:
                html_soup = BeautifulSoup(
                    response.content, self.worker_config.parser)
                links = self.get_links(html_soup, final_url_split)
            else:
                self.logger.debug(
                    "Won't crawl %s. MIME Type: %s. Should crawl: %s",
                    final_url_split, mime_type,
                    worker_input.should_crawl)

            page_crawl = PageCrawl(
                original_url_split=url_split_to_crawl,
                final_url_split=final_url_split,
                status=response.status, is_timeout=False,
                is_redirect=response.is_redirect, links=links,
                exception=None, is_html=is_html)
    except Exception as exc:
        # Catch-all boundary: wrap any unexpected error so the worker
        # loop keeps running; the traceback is logged below.
        exception = ExceptionStr(unicode(type(exc)), unicode(exc))
        page_crawl = PageCrawl(
            original_url_split=url_split_to_crawl,
            final_url_split=None, status=None,
            is_timeout=False, is_redirect=False, links=[],
            exception=exception, is_html=False)
        self.logger.exception(
            "Exception occurred while crawling a page.")

    return page_crawl