def test_urllib_downloader(self):
    """Crawl the local test server with the plain urllib downloader
    and verify every sitemap URL was requested."""
    request_middleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=lambda x: x,
    )
    collect_middleware = CollectRequestResponseMiddleware()

    pomp = Pomp(
        downloader=UrllibDownloader(),
        middlewares=(
            request_middleware,
            UrllibAdapterMiddleware(),
            collect_middleware,
        ),
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    pomp.pump(Crawler())

    # strip the server prefix so requests compare against sitemap paths
    requested = {
        r.url.replace(self.httpd.location, '')
        for r in collect_middleware.requests
    }
    assert requested == set(self.httpd.sitemap.keys())
def test_concurrent_urllib_downloader(self):
    """The concurrent urllib downloader must fetch the whole sitemap."""
    request_middleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=UrllibHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()

    downloader = ConcurrentUrllibDownloader(
        middlewares=[collect_middleware])
    # request/response mapping must run before collection
    downloader.middlewares.insert(0, request_middleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    pomp.pump(Crawler())

    fetched = {
        r.url.replace(self.httpd.location, '')
        for r in collect_middleware.requests
    }
    assert_set_equal(fetched, set(self.httpd.sitemap.keys()))
def test_concurrent_downloader(self):
    """A pool of mocked download workers must cover the mocked sitemap."""
    request_middleware = RequestResponseMiddleware(
        prefix_url='http://localhost',
        request_factory=UrllibHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()

    pomp = Pomp(
        downloader=ConcurrentDownloader(
            pool_size=5,
            worker_class=MockedDownloadWorker,
            worker_kwargs=None,
        ),
        middlewares=(request_middleware, collect_middleware, ),
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    pomp.pump(Crawler())

    seen = {
        r.url.replace('http://localhost', '')
        for r in collect_middleware.requests
    }
    assert seen == set(MockedDownloadWorker.sitemap.keys())
def test_concurrent_crawler(self):
    """Concurrent crawler workers must visit every sitemap URL."""
    request_middleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=lambda x: x,
    )
    collect_middleware = CollectRequestResponseMiddleware()

    pomp = Pomp(
        downloader=ConcurrentUrllibDownloader(pool_size=2),
        middlewares=(
            request_middleware,
            UrllibAdapterMiddleware(),
            collect_middleware,
        ),
        pipelines=[],
    )

    pomp.pump(ConcurrentCrawler(
        pool_size=2,
        worker_class=MockedCrawlerWorker,
    ))

    visited = {
        r.url.replace(self.httpd.location, '')
        for r in collect_middleware.requests
    }
    assert visited == set(self.httpd.sitemap.keys())
def test_exception_on_crawler_worker(self):
    """A crawler worker that raises must record exactly one exception."""
    request_middleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=lambda x: x,
    )
    collect_middleware = CollectRequestResponseMiddleware()

    pomp = Pomp(
        downloader=ConcurrentUrllibDownloader(pool_size=2),
        middlewares=(
            request_middleware,
            UrllibAdapterMiddleware(),
            collect_middleware,
        ),
        pipelines=[],
    )

    pomp.pump(ConcurrentCrawler(
        pool_size=2,
        worker_class=MockedCrawlerWorkerWithException,
    ))

    # a single request was issued and its processing failed once
    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.exceptions) == 1
def test_exception_on_downloader_worker(self):
    """A download worker that raises must record exactly one exception."""
    request_middleware = RequestResponseMiddleware(
        prefix_url='http://localhost',
        request_factory=UrllibHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()

    pomp = Pomp(
        downloader=ConcurrentDownloader(
            pool_size=5,
            worker_class=MockedDownloadWorkerWithException,
            worker_kwargs=None,
        ),
        middlewares=(request_middleware, collect_middleware, ),
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    pomp.pump(Crawler())

    # a single request was issued and its download failed once
    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.exceptions) == 1
def test_exceptions(self):
    """An unresolvable prefix URL must surface as a BaseDownloadException."""
    request_middleware = RequestResponseMiddleware(
        prefix_url='invalid url',
        request_factory=TwistedHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()

    downloader = TwistedDownloader(
        reactor, middlewares=[collect_middleware])
    downloader.middlewares.insert(0, request_middleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[PrintPipeline()],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    # bridge pomp's deferred into a twisted Deferred for the test runner
    done_defer = defer.Deferred()
    pomp.pump(Crawler()).add_callback(done_defer.callback)

    def check(x):
        assert len(collect_middleware.exceptions) == 1
        assert isinstance(
            collect_middleware.exceptions[0], BaseDownloadException)

    done_defer.addCallback(check)
    return done_defer
def do_simple_test(self, queue=None):
    """Pump the twisted downloader against the test server and verify
    that the whole sitemap was requested.

    :param queue: optional queue implementation handed to ``Pomp``
    """
    request_middleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=TwistedHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()

    downloader = TwistedDownloader(
        reactor, middlewares=[collect_middleware])
    downloader.middlewares.insert(0, request_middleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[PrintPipeline()],
        queue=queue,
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    # bridge pomp's deferred into a twisted Deferred for the test runner
    done_defer = defer.Deferred()
    pomp.pump(Crawler()).add_callback(done_defer.callback)

    def check(x):
        fetched = {
            r.url.replace(self.httpd.location, '')
            for r in collect_middleware.requests
        }
        assert_set_equal(fetched, set(self.httpd.sitemap.keys()))

    done_defer.addCallback(check)
    return done_defer
def test_thread_pooled_downloader(self):
    """The threaded downloader must fetch every page of the sitemap."""
    request_middleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=lambda x: x,
    )
    collect_middleware = CollectRequestResponseMiddleware()

    downloader = ThreadedDownloader(
        middlewares=[UrllibAdapterMiddleware(), collect_middleware])
    # request/response mapping must run first in the chain
    downloader.middlewares.insert(0, request_middleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    pomp.pump(Crawler())

    fetched = {
        r.url.replace(self.httpd.location, '')
        for r in collect_middleware.requests
    }
    assert_set_equal(fetched, set(self.httpd.sitemap.keys()))
def setup(self):
    """Prepare the aiohttp downloader and middleware chain used by the tests."""
    mapper = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=lambda x: x,
    )
    collector = CollectRequestResponseMiddleware()

    # NOTE: attribute names (including the `midlleware` spelling) are part
    # of this fixture's interface — other tests read them; do not rename.
    self.req_resp_midlleware = mapper
    self.collect_middleware = collector
    self.downloader = AiohttpDownloader()
    self.middlewares = (
        mapper,
        AiohttpAdapterMiddleware(),
        collector,
    )
def test_timeout(self):
    """A response slower than the downloader timeout must record an exception."""
    request_middleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=TwistedHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()

    downloader = TwistedDownloader(
        reactor,
        timeout=0.5,
        middlewares=[collect_middleware]
    )
    downloader.middlewares.insert(0, request_middleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[PrintPipeline()],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/sleep'

    # bridge pomp's deferred into a twisted Deferred for the test runner
    done_defer = defer.Deferred()
    pomp.pump(Crawler()).add_callback(done_defer.callback)

    def check(x):
        assert len(collect_middleware.exceptions) == 1
        e = collect_middleware.exceptions[0]
        assert isinstance(e, BaseDownloadException)
        # twisted _newcleint can raise ResponseNeverReceived
        # next assert works only for `oldclient`
        # assert isinstance(e.exception, defer.CancelledError)

    done_defer.addCallback(check)
    return done_defer