def crawl(self):
    worker_init = WorkerInit(
        self.config.worker_config, self.input_queue,
        self.output_queue, self.build_logger())
    self.workers = self.get_workers(self.config, worker_init)

    queue_size = len(self.start_url_splits)
    for start_url_split in self.start_url_splits:
        self.input_queue.put(
            WorkerInput(
                start_url_split, True, 0, start_url_split.netloc,
                self.config.content_check),
            False)

    self.start_workers(self.workers, self.input_queue, self.output_queue)

    self.start_progress()

    while True:
        page_crawl = self.output_queue.get()
        queue_size -= 1
        new_worker_inputs = self.process_page_crawl(page_crawl)

        # We only process new pages if we did not exceed configured depth
        for worker_input in new_worker_inputs:
            queue_size += 1
            self.input_queue.put(worker_input, False)

        self.progress(page_crawl, len(self.site.pages), queue_size)

        if queue_size <= 0:
            self.stop_workers(
                self.workers, self.input_queue, self.output_queue)
            self.stop_progress()
            return self.site
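# A minimal sketch of the worker-facing structures crawl() assumes, inferred
# only from the call sites above; the real definitions may carry more fields
# or defaults. Field names below mirror how the values are used here.
from collections import namedtuple

# Per-worker initialization bundle: worker configuration, both queues, and a
# logger, exactly as packed together at the top of crawl().
WorkerInit = namedtuple(
    "WorkerInit",
    ["worker_config", "input_queue", "output_queue", "logger"])

# One unit of work: the URL to fetch, whether to parse it for new links, its
# depth from the start URL, the originating host, and the content check.
WorkerInput = namedtuple(
    "WorkerInput",
    ["url_split", "should_crawl", "depth", "site_origin", "content_check"])

# The tests below build WorkerInput without content_check, so in practice the
# last field presumably has a default; this line is an assumption that makes
# the sketch match both call styles.
WorkerInput.__new__.__defaults__ = (None,)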
def test_base_url(self):
    page_crawler, url_split = self.get_page_crawler("/alone.html")
    page_crawl = page_crawler._crawl_page(
        WorkerInput(url_split, True, 0, url_split.netloc))

    self.assertEqual(1, len(page_crawl.links))
    self.assertEqual(
        'http://www.example.com/test.html',
        page_crawl.links[0].url_split.geturl())
def test_crawl_404(self):
    page_crawler, url_split = self.get_page_crawler(
        "/sub/small_image_bad.gif")
    page_crawl = page_crawler._crawl_page(
        WorkerInput(url_split, True, 0, url_split.netloc))

    self.assertEqual(404, page_crawl.status)
    self.assertFalse(page_crawl.links)
    self.assertFalse(page_crawl.is_html)
    self.assertFalse(page_crawl.is_timeout)
    self.assertFalse(page_crawl.is_redirect)
def test_crawl_resource(self):
    page_crawler, url_split = self.get_page_crawler("/sub/small_image.gif")
    page_crawl = page_crawler._crawl_page(
        WorkerInput(url_split, True, 0, url_split.netloc))

    self.assertEqual(200, page_crawl.status)
    self.assertFalse(page_crawl.links)
    self.assertFalse(page_crawl.is_html)
    self.assertFalse(page_crawl.is_timeout)
    self.assertFalse(page_crawl.is_redirect)
    self.assertTrue(page_crawl.exception is None)
def test_page_crawler(self):
    page_crawler, url_split = self.get_page_crawler("/index.html")
    input_queue = page_crawler.input_queue
    output_queue = page_crawler.output_queue

    input_queue.put(WorkerInput(url_split, True, 0, url_split.netloc))
    input_queue.put(WORK_DONE)

    page_crawler.crawl_page_forever()

    page_crawl = output_queue.get()

    self.assertEqual(200, page_crawl.status)
    self.assertTrue(len(page_crawl.links) > 0)
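# WORK_DONE is a sentinel put on the input queue to tell the worker loop to
# stop. The self-contained sketch below illustrates that pattern as the test
# exercises it; work_forever() and handle() are hypothetical stand-ins for
# crawl_page_forever() and _crawl_page(), not the project's actual code.
import queue

WORK_DONE = object()

def work_forever(input_queue, output_queue, handle):
    # Pull work items until the WORK_DONE sentinel arrives, pushing each
    # result onto the output queue for the coordinating crawl() loop.
    while True:
        item = input_queue.get()
        if item is WORK_DONE:
            break
        output_queue.put(handle(item))

# Usage: a trivial handler that just echoes its input.
in_q, out_q = queue.Queue(), queue.Queue()
in_q.put("http://www.example.com/index.html")
in_q.put(WORK_DONE)
work_forever(in_q, out_q, lambda item: item)
assert out_q.get() == "http://www.example.com/index.html"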
def process_links(self, page_crawl):
    links_to_process = []

    source_url_split = page_crawl.original_url_split
    if page_crawl.final_url_split:
        source_url_split = page_crawl.final_url_split

    for link in page_crawl.links:
        url_split = link.url_split
        if not self.config.should_download(url_split):
            self.logger.debug(
                "Won't download %s. Is local? %s",
                url_split,
                LazyLogParam(lambda: self.config.is_local(url_split)))
            continue

        page_status = self.page_statuses.get(url_split, None)
        page_source = PageSource(
            source_url_split, link.source_str, link.target)

        if not page_status:
            # We never encountered this url before
            self.page_statuses[url_split] = PageStatus(
                PAGE_QUEUED, [page_source])
            should_crawl = self.config.should_crawl(
                url_split, page_crawl.depth)
            links_to_process.append(WorkerInput(
                url_split, should_crawl, page_crawl.depth + 1,
                page_crawl.site_origin, self.config.content_check))
        elif page_status.status == PAGE_CRAWLED:
            # Already crawled. Add source
            if url_split in self.pages:
                self.pages[url_split].add_sources([page_source])
            else:
                # TODO the final url is different. need a way to link it...
                pass
        elif page_status.status == PAGE_QUEUED:
            # Already queued for crawling. Add source.
            page_status.sources.append(page_source)

    return links_to_process
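# A minimal sketch, inferred from process_links() above, of the bookkeeping
# types it relies on. Field names and state values are assumptions that only
# mirror how they are accessed in this excerpt.
from collections import namedtuple

# Lifecycle states referenced by process_links(); only the distinct names
# matter here, the concrete values are placeholders.
PAGE_QUEUED = "queued"
PAGE_CRAWLED = "crawled"

# Where a link was found: the page it came from, the raw source string, and
# the target attribute of the link.
PageSource = namedtuple("PageSource", ["origin", "origin_str", "target"])

# Per-URL record: its lifecycle state plus every source that referenced it so
# far (the sources list keeps growing while the page is still queued).
PageStatus = namedtuple("PageStatus", ["status", "sources"])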
def test_crawl_page(self):
    page_crawler, url_split = self.get_page_crawler("/index.html")
    page_crawl = page_crawler._crawl_page(
        WorkerInput(url_split, True, 0, url_split.netloc))

    self.assertEqual(200, page_crawl.status)
    self.assertTrue(page_crawl.is_html)
    self.assertFalse(page_crawl.is_timeout)
    self.assertFalse(page_crawl.is_redirect)
    self.assertTrue(page_crawl.exception is None)

    a_links = [link for link in page_crawl.links if link.type == 'a']
    img_links = [link for link in page_crawl.links if link.type == 'img']
    script_links = [
        link for link in page_crawl.links if link.type == 'script']
    link_links = [link for link in page_crawl.links if link.type == 'link']

    self.assertEqual(5, len(a_links))
    self.assertEqual(1, len(img_links))
    self.assertEqual(1, len(script_links))
    self.assertEqual(1, len(link_links))