Ejemplo n.º 1
0
                def try_again_tomorrow():
                    """Reschedule this URL's crawl for 24 hours from now.

                    Prefers the previously stored request (if the document
                    exists and carries a usable 'url'), otherwise falls back
                    to re-issuing the original crawl task.
                    """
                    doc = CrawlDocument.objects(
                        url_hash=hash_url(crawl_result.url)).first()

                    # try again tomorrow
                    self.log.info("Try again tomorrow: {}".format(
                        crawl_result.url))

                    # Decide which request payload to re-send, then schedule
                    # it a full day out.
                    retry_at = now() + timedelta(days=1)
                    if doc is not None and 'url' in doc.latest_request:
                        request = doc.latest_request
                    else:
                        request = crawl_result.crawl_task.to_dict()
                    self.send_crawl_request(request, timeout=retry_at)
Ejemplo n.º 2
0
    def test_status_codes_gt_200(self, objects, declare_queue, sync,
                                 cdobjects):
        """Responses with non-200 status codes (500, 302) still trigger sync.

        Processes two crawl responses for the same document — first a 500,
        then a 302 — and asserts the sync hook fired once for each.

        Mock wiring (injected by patch decorators on the enclosing class —
        TODO confirm against the class definition, which is outside this
        view):
          * objects    — host lookup; returns ``self.host``
          * declare_queue — side-effected by ``declare_queue_side_effect``
          * sync       — call-counted at the end
          * cdobjects  — CrawlDocument lookup; returns the saved ``cd``
        """
        inst = objects.return_value
        inst.first.return_value = self.host

        declare_queue.side_effect = declare_queue_side_effect

        crawl_response = {
            "url": "http://example.com/home.html",
            "body": "<html></html>",
            "actions": ["index"],
            "status_code": 500,
            "headers": {
                "content-type": "text/html"
            },
            "crawl_time": datetime.now(timezone.utc).isoformat()
        }

        # Persist a document matching the response URL so the manager's
        # lookup (via the mocked cdobjects) finds an existing record.
        cd = CrawlDocument()
        cd.host = "example.com"
        cd.url = crawl_response['url']
        cd.url_hash = hash_url(crawl_response['url'])
        cd.latest_request = {
            "url": crawl_response['url'],
            "cookies": {},
            "method": "GET",
            "actions": ["follow", "index"]
        }
        cd.save()
        cinst = cdobjects.return_value
        cinst.first.return_value = cd

        # First pass: server error (500).
        self.crawl_manager.process_task(
            ujson.dumps(crawl_response).encode("utf8")
        )

        # Second pass: redirect (302).
        crawl_response['status_code'] = 302

        self.crawl_manager.process_task(
            ujson.dumps(crawl_response).encode("utf8")
        )

        # One sync per processed response.
        self.assertEqual(sync.call_count, 2)
Ejemplo n.º 3
0
    def test_crawl_document(self):
        """Saving a CrawlDocument derives url_hash from its url field."""
        document = CrawlDocument(url="http://example.com", host="example.com")
        document.save()
        self.assertEqual(document.url_hash, hash_url(document.url))