Example #1
    def crawl(self):
        worker_init = WorkerInit(self.config.worker_config, self.input_queue,
                                 self.output_queue, self.build_logger())
        self.workers = self.get_workers(self.config, worker_init)

        # queue_size counts the results still expected from the workers.
        queue_size = len(self.start_url_splits)
        for start_url_split in self.start_url_splits:
            self.input_queue.put(
                WorkerInput(start_url_split, True, 0, start_url_split.netloc,
                            self.config.content_check), False)

        self.start_workers(self.workers, self.input_queue, self.output_queue)

        self.start_progress()

        while True:
            page_crawl = self.output_queue.get()
            queue_size -= 1
            new_worker_inputs = self.process_page_crawl(page_crawl)

            # Only process new pages if the configured depth was not exceeded
            for worker_input in new_worker_inputs:
                queue_size += 1
                self.input_queue.put(worker_input, False)

            self.progress(page_crawl, len(self.site.pages), queue_size)

            # No results outstanding and no new work queued: the crawl is done.
            if queue_size <= 0:
                self.stop_workers(self.workers, self.input_queue,
                                  self.output_queue)
                self.stop_progress()
                return self.site
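
Example #1 never inspects the queues for emptiness; it counts outstanding work instead. queue_size starts at the number of seed URLs, drops by one for every result read from the output queue, and grows by one for every new WorkerInput pushed back in, so the loop ends exactly when all queued work has been answered. A minimal, self-contained sketch of that bookkeeping follows; the process callback and echo_worker are illustrative stand-ins, not the library's API:

import queue
import threading

def drain_with_counter(input_queue, output_queue, seeds, process):
    # One unit of pending work per item placed on the input queue.
    pending = len(seeds)
    for seed in seeds:
        input_queue.put(seed, False)

    while True:
        result = output_queue.get()  # blocks until a worker reports back
        pending -= 1
        for new_item in process(result):
            pending += 1
            input_queue.put(new_item, False)
        if pending <= 0:
            # Every queued item produced a result and spawned no new work.
            return

def echo_worker(in_q, out_q):
    # Stand-in worker: immediately reports each item back as its own result.
    while True:
        out_q.put(in_q.get())

in_q, out_q = queue.Queue(), queue.Queue()
threading.Thread(target=echo_worker, args=(in_q, out_q), daemon=True).start()

# Each "page" at depth < 2 yields one child page one level deeper.
drain_with_counter(in_q, out_q, [0], lambda depth: [depth + 1] if depth < 2 else [])
print("crawl finished")
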
Example #2
    def test_base_url(self):
        page_crawler, url_split = self.get_page_crawler("/alone.html")
        page_crawl = page_crawler._crawl_page(
            WorkerInput(url_split, True, 0, url_split.netloc))

        self.assertEqual(1, len(page_crawl.links))
        self.assertEqual('http://www.example.com/test.html',
                         page_crawl.links[0].url_split.geturl())
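
The test above builds WorkerInput from four positional values (the split URL, a should-crawl flag, the crawl depth, and the origin netloc), while Example #1 also passes the configured content check as a fifth value. A minimal sketch of a compatible structure, assuming WorkerInput is a plain namedtuple; the field names and the default are inferred from the call sites, not taken from the library's actual definition:

from collections import namedtuple
from urllib.parse import urlsplit

# Hypothetical shape inferred from the calls in these examples.
WorkerInput = namedtuple(
    "WorkerInput",
    ["url_split", "should_crawl", "depth", "site_origin", "content_check"])
# Default for the trailing field so the four-argument test calls also work.
WorkerInput.__new__.__defaults__ = (None,)

url_split = urlsplit("http://www.example.com/alone.html")
work = WorkerInput(url_split, True, 0, url_split.netloc)
print(work.depth, work.site_origin)  # 0 www.example.com
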
Example #3
    def test_crawl_404(self):
        page_crawler, url_split = self.get_page_crawler(
            "/sub/small_image_bad.gif")
        page_crawl = page_crawler._crawl_page(
            WorkerInput(url_split, True, 0, url_split.netloc))

        self.assertEqual(404, page_crawl.status)
        self.assertFalse(page_crawl.links)
        self.assertFalse(page_crawl.is_html)
        self.assertFalse(page_crawl.is_timeout)
        self.assertFalse(page_crawl.is_redirect)
Example #4
    def test_crawl_resource(self):
        page_crawler, url_split = self.get_page_crawler("/sub/small_image.gif")
        page_crawl = page_crawler._crawl_page(
            WorkerInput(url_split, True, 0, url_split.netloc))

        self.assertEqual(200, page_crawl.status)
        self.assertFalse(page_crawl.links)
        self.assertFalse(page_crawl.is_html)
        self.assertFalse(page_crawl.is_timeout)
        self.assertFalse(page_crawl.is_redirect)
        self.assertIsNone(page_crawl.exception)
Example #5
    def test_page_crawler(self):
        page_crawler, url_split = self.get_page_crawler("/index.html")
        input_queue = page_crawler.input_queue
        output_queue = page_crawler.output_queue

        input_queue.put(WorkerInput(url_split, True, 0, url_split.netloc))
        input_queue.put(WORK_DONE)
        page_crawler.crawl_page_forever()

        page_crawl = output_queue.get()

        self.assertEqual(200, page_crawl.status)
        self.assertTrue(len(page_crawl.links) > 0)
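
Example #5 stops the worker not by checking whether the queue is empty but by feeding it a WORK_DONE sentinel after the real work: crawl_page_forever keeps pulling items until it dequeues the sentinel and then returns. A minimal sketch of that shutdown pattern, using an identity-compared marker object (the names here are illustrative):

import queue

WORK_DONE = object()  # unique sentinel; compared by identity

def crawl_page_forever(input_queue, output_queue, crawl_page):
    # Process work items until the shutdown sentinel is dequeued.
    while True:
        item = input_queue.get()
        if item is WORK_DONE:
            return
        output_queue.put(crawl_page(item))

in_q, out_q = queue.Queue(), queue.Queue()
in_q.put("http://www.example.com/index.html")
in_q.put(WORK_DONE)
crawl_page_forever(in_q, out_q, lambda url: (200, url))
print(out_q.get())  # (200, 'http://www.example.com/index.html')
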
Example #6
    def process_links(self, page_crawl):
        links_to_process = []

        source_url_split = page_crawl.original_url_split
        if page_crawl.final_url_split:
            source_url_split = page_crawl.final_url_split

        for link in page_crawl.links:
            url_split = link.url_split
            if not self.config.should_download(url_split):
                self.logger.debug(
                    "Won't download %s. Is local? %s", url_split,
                    LazyLogParam(lambda: self.config.is_local(url_split)))
                continue

            page_status = self.page_statuses.get(url_split, None)
            page_source = PageSource(source_url_split, link.source_str,
                                     link.target)

            if not page_status:
                # We never encountered this url before
                self.page_statuses[url_split] = PageStatus(
                    PAGE_QUEUED, [page_source])
                should_crawl = self.config.should_crawl(
                    url_split, page_crawl.depth)
                links_to_process.append(
                    WorkerInput(url_split, should_crawl, page_crawl.depth + 1,
                                page_crawl.site_origin,
                                self.config.content_check))
            elif page_status.status == PAGE_CRAWLED:
                # Already crawled. Add source
                if url_split in self.pages:
                    self.pages[url_split].add_sources([page_source])
                else:
                    # TODO the final url is different. need a way to link it...
                    pass
            elif page_status.status == PAGE_QUEUED:
                # Already queued for crawling. Add source.
                page_status.sources.append(page_source)

        return links_to_process
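
process_links is essentially a three-way de-duplication on the page status table: an unseen URL is recorded as queued and returned for crawling, an already-crawled URL only gets the new referring source attached to its page, and an already-queued URL accumulates sources without being queued a second time. A stripped-down sketch of that state machine, with hypothetical QUEUED/CRAWLED markers standing in for PAGE_QUEUED and PAGE_CRAWLED:

QUEUED, CRAWLED = "queued", "crawled"  # illustrative status constants

def dedupe_links(statuses, pages, links):
    """Return the URLs that still need to be crawled.

    statuses maps url -> [status, sources]; pages maps url -> sources for
    pages that were already crawled. Both are updated in place.
    """
    to_process = []
    for url, source in links:
        entry = statuses.get(url)
        if entry is None:
            # First sighting: remember it and schedule a crawl.
            statuses[url] = [QUEUED, [source]]
            to_process.append(url)
        elif entry[0] == CRAWLED:
            # Already crawled: just record the new referring source.
            pages.setdefault(url, []).append(source)
        else:
            # Already queued: accumulate the source, do not queue twice.
            entry[1].append(source)
    return to_process

statuses = {"/done.html": [CRAWLED, []]}
pages = {"/done.html": []}
links = [("/a.html", "index"), ("/a.html", "about"), ("/done.html", "index")]
print(dedupe_links(statuses, pages, links))  # ['/a.html']
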
Example #7
    def test_crawl_page(self):
        page_crawler, url_split = self.get_page_crawler("/index.html")
        page_crawl = page_crawler._crawl_page(
            WorkerInput(url_split, True, 0, url_split.netloc))

        self.assertEqual(200, page_crawl.status)
        self.assertTrue(page_crawl.is_html)
        self.assertFalse(page_crawl.is_timeout)
        self.assertFalse(page_crawl.is_redirect)
        self.assertIsNone(page_crawl.exception)

        a_links = [link for link in page_crawl.links if link.type == 'a']
        img_links = [link for link in page_crawl.links if link.type == 'img']
        script_links = [link for link in page_crawl.links
                        if link.type == 'script']
        link_links = [link for link in page_crawl.links if link.type == 'link']

        self.assertEqual(5, len(a_links))
        self.assertEqual(1, len(img_links))
        self.assertEqual(1, len(script_links))
        self.assertEqual(1, len(link_links))
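
Example #7 checks that the crawler separates extracted links by the tag that produced them (a, img, script, link). The library's own extraction code is not shown here, but a minimal sketch of collecting the same four tag types with the standard html.parser module looks like this; the tag-to-attribute table is an illustrative assumption:

from html.parser import HTMLParser

# Which attribute carries the target URL for each tag Example #7 inspects.
LINK_ATTRS = {"a": "href", "img": "src", "script": "src", "link": "href"}

class LinkExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.links = []  # (tag, url) pairs in document order

    def handle_starttag(self, tag, attrs):
        wanted = LINK_ATTRS.get(tag)
        for name, value in attrs:
            if name == wanted and value:
                self.links.append((tag, value))

extractor = LinkExtractor()
extractor.feed('<a href="/test.html">t</a><img src="/sub/small_image.gif">')
print(extractor.links)  # [('a', '/test.html'), ('img', '/sub/small_image.gif')]
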