Beispiel #1
0
 def test_build_request(self):
     """Check ``Request.of`` with a URL string, a request and an int."""
     address = "http://test"
     built = Request.of(address)
     self.assertEqual(built.url, address)
     # Passing an existing request must hand back that very same object.
     self.assertIs(Request.of(built), built)
     # An integer argument is expected to be rejected with TypeError.
     self.assertRaises(TypeError, Request.of, 1)
Beispiel #2
0
 def test_recover(self):
     """Stash a spider after a crawl and recover it into a new spider.

     The recovered spider's ``crawled_filter`` is expected to contain the
     request for ``http://localhost:5000/a.html``.
     """
     original = RecoverMySpider()
     original.filter = original.filter + CustomFilter(lambda x: True)
     original.set_session(env.session)
     start = Request.of("http://localhost:5000/test_extract")
     # Exhaust the returned iterable; the values themselves are not needed.
     list(run_and_get_result(original.crawl(start)))
     original.stash(EXE_PATH)
     del original

     recovered_spider = RecoverMySpider()
     recovered_spider.recover(EXE_PATH)
     expected = Request.of("http://localhost:5000/a.html")
     self.assertTrue(recovered_spider.crawled_filter.contains(expected))
Beispiel #3
0
    def test_recover_queue(self):
        """Stash a queue of three requests and recover it in insertion order.

        The recovered URLs are collected and compared as a whole list, so
        the test fails both when the order is wrong and when the recovered
        queue holds fewer (or more) requests than were stashed.
        """
        urls = ["1", "2", "3"]
        queue = RecoverableRequestQueue()
        for url in urls:
            queue.put(Request.of(url))
        queue.stash(EXE_PATH)
        del queue

        recovered_queue = RecoverableRequestQueue()
        recovered_queue.recover(EXE_PATH)
        self.assertFalse(recovered_queue.empty())
        recovered_urls = []
        while not recovered_queue.empty():
            recovered_urls.append(recovered_queue.get().url)
        # Comparing the full list also catches requests lost during recovery.
        self.assertEqual(recovered_urls, urls)
Beispiel #4
0
 def test_auto_save(self):
     """Check the registered action has run after ten rounds, not five.

     With ``auto_save_frequency`` set to 10, ``test_data`` must be untouched
     after five task runs and cleared after ten.
     """
     target = "http://localhost:5000/test_extract"
     test_data = {"key": 1}
     spider = MySpider()
     spider.set_session(mock_env.env.session)
     spider.start_targets = [target]
     spider.auto_save_frequency = 10
     task = CountDownRecoverableTask(spider)
     task.add_actions(lambda: test_data.clear())

     def run_five_rounds():
         # Each round runs the task once, then queues the target again.
         for _ in range(5):
             mock_env.env.loop.run_until_complete(task.run())
             task._request_queue.put(Request.of(target))

     run_five_rounds()
     self.assertEqual(test_data, {"key": 1})
     run_five_rounds()
     self.assertEqual(test_data, {})
Beispiel #5
0
 def test_stash(self):
     """Stash a task into a directory and recover it with a new task.

     After recovery, the recovered task's spider is expected to report the
     start target as contained in its ``crawled_filter``.
     """
     stash_dir = ".task_stash"
     spider = MySpider()
     spider.start_targets = ["http://localhost:5000/test_extract"]
     task = RecoverableTask(spider)
     # Create the directory only if it is missing. exist_ok replaces the
     # previous check-then-create expression, which could race with another
     # process creating the directory in between.
     # NOTE(review): assumes makedirs is os.makedirs -- confirm the import.
     makedirs(stash_dir, exist_ok=True)
     task.stash(stash_dir)
     del task

     recovered_task = RecoverableTask(MySpider())
     self.assertTrue(recovered_task.can_recover(stash_dir))
     recovered_task.recover(stash_dir)
     self.assertTrue(
         recovered_task.spider.crawled_filter.contains(
             Request.of("http://localhost:5000/test_extract")))
Beispiel #6
0
 def test_crawled_spider(self):
     """Check ``crawled_filter`` cannot be reassigned and rejects the start.

     Assigning to ``crawled_filter`` must raise TypeError, and the filter
     must not accept the start request of a freshly created spider.
     """
     my_spider = MySpider()
     with self.assertRaises(TypeError):
         my_spider.crawled_filter = my_spider.filter
     first = Request.of("http://localhost:5000/test_extract")
     accepted = my_spider.crawled_filter.accept(first)
     self.assertFalse(accepted)
Beispiel #7
0
 def test_reg_filter(self):
     """Check a ``URLRegFilter`` built from ``http://`` accepts an http URL."""
     url_filter = URLRegFilter(r"http://")
     request = Request.of("http://www.baidu.com")
     self.assertTrue(url_filter.accept(request))
Beispiel #8
0
 def assert_false(self, r, url):
     """Assert that filter *r* does not accept the request built from *url*."""
     accepted = r.accept(Request.of(url))
     self.assertFalse(accepted)
Beispiel #9
0
 def assert_true(self, r, url):
     """Assert that filter *r* accepts the request built from *url*."""
     accepted = r.accept(Request.of(url))
     self.assertTrue(accepted)
Beispiel #10
0
 def from_url_or_request(url: str):
     """Return whatever ``Request.of`` produces for *url*."""
     request = Request.of(url)
     return request