コード例 #1
0
ファイル: test_contrib_urllib.py プロジェクト: zanachka/pomp
    def test_urllib_downloader(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=lambda x: x,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = UrllibDownloader()

        pomp = Pomp(
            downloader=downloader,
            middlewares=(
                req_resp_midlleware,
                UrllibAdapterMiddleware(),
                collect_middleware,
            ),
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert \
            set([r.url.replace(self.httpd.location, '')
                for r in collect_middleware.requests]) == \
            set(self.httpd.sitemap.keys())
コード例 #2
0
    def test_concurrent_urllib_downloader(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=UrllibHttpRequest,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentUrllibDownloader(
            middlewares=[collect_middleware])

        downloader.middlewares.insert(0, req_resp_midlleware)

        pomp = Pomp(
            downloader=downloader,
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert_set_equal(
            set([
                r.url.replace(self.httpd.location, '')
                for r in collect_middleware.requests
            ]), set(self.httpd.sitemap.keys()))
コード例 #3
0
    def test_concurrent_downloader(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url='http://localhost',
            request_factory=UrllibHttpRequest,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentDownloader(
            pool_size=5,
            worker_class=MockedDownloadWorker,
            worker_kwargs=None,
        )

        pomp = Pomp(
            downloader=downloader,
            middlewares=(req_resp_midlleware, collect_middleware, ),
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert \
            set([r.url.replace('http://localhost', '')
                for r in collect_middleware.requests]) == \
            set(MockedDownloadWorker.sitemap.keys())
コード例 #4
0
    def test_concurrent_crawler(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=lambda x: x,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentUrllibDownloader(
            pool_size=2,
        )

        pomp = Pomp(
            downloader=downloader,
            middlewares=(
                req_resp_midlleware,
                UrllibAdapterMiddleware(),
                collect_middleware,
            ),
            pipelines=[],
        )

        pomp.pump(ConcurrentCrawler(
            pool_size=2,
            worker_class=MockedCrawlerWorker,
        ))

        assert \
            set([r.url.replace(self.httpd.location, '')
                for r in collect_middleware.requests]) == \
            set(self.httpd.sitemap.keys())
コード例 #5
0
    def test_exception_on_crawler_worker(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=lambda x: x,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentUrllibDownloader(
            pool_size=2,
        )

        pomp = Pomp(
            downloader=downloader,
            middlewares=(
                req_resp_midlleware,
                UrllibAdapterMiddleware(),
                collect_middleware,
            ),
            pipelines=[],
        )

        pomp.pump(ConcurrentCrawler(
            pool_size=2,
            worker_class=MockedCrawlerWorkerWithException,
        ))

        assert len(collect_middleware.requests) == 1
        assert len(collect_middleware.exceptions) == 1
コード例 #6
0
    def test_exception_on_downloader_worker(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url='http://localhost',
            request_factory=UrllibHttpRequest,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentDownloader(
            pool_size=5,
            worker_class=MockedDownloadWorkerWithException,
            worker_kwargs=None,
        )

        pomp = Pomp(
            downloader=downloader,
            middlewares=(req_resp_midlleware, collect_middleware, ),
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert len(collect_middleware.requests) == 1
        assert len(collect_middleware.exceptions) == 1
コード例 #7
0
ファイル: test_contrib_twisted.py プロジェクト: sibelius/pomp
    def test_exceptions(self):

        req_resp_middleware = RequestResponseMiddleware(
            prefix_url='invalid url',
            request_factory=TwistedHttpRequest,
        )
        collect_middleware = CollectRequestResponseMiddleware()

        downloader = TwistedDownloader(
            reactor, middlewares=[collect_middleware])

        downloader.middlewares.insert(0, req_resp_middleware)

        pomp = Pomp(
            downloader=downloader,
            pipelines=[PrintPipeline()],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        done_defer = defer.Deferred()
        d = pomp.pump(Crawler())

        d.add_callback(done_defer.callback)

        def check(x):
            assert len(collect_middleware.exceptions) == 1
            assert isinstance(
                collect_middleware.exceptions[0], BaseDownloadException)

        done_defer.addCallback(check)
        return done_defer
コード例 #8
0
ファイル: test_contrib_twisted.py プロジェクト: sibelius/pomp
    def do_simple_test(self, queue=None):
        req_resp_middleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=TwistedHttpRequest,
        )
        collect_middleware = CollectRequestResponseMiddleware()
        downloader = TwistedDownloader(
            reactor, middlewares=[collect_middleware])

        downloader.middlewares.insert(0, req_resp_middleware)

        pomp = Pomp(
            downloader=downloader,
            pipelines=[PrintPipeline()],
            queue=queue,
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        done_defer = defer.Deferred()
        d = pomp.pump(Crawler())

        d.add_callback(done_defer.callback)

        def check(x):
            assert_set_equal(
                set([r.url.replace(self.httpd.location, '')
                    for r in collect_middleware.requests]),
                set(self.httpd.sitemap.keys())
            )

        done_defer.addCallback(check)
        return done_defer
コード例 #9
0
ファイル: test_contrib_urllib.py プロジェクト: sibelius/pomp
    def test_thread_pooled_downloader(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=lambda x: x,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ThreadedDownloader(
            middlewares=[UrllibAdapterMiddleware(), collect_middleware])

        downloader.middlewares.insert(0, req_resp_midlleware)

        pomp = Pomp(
            downloader=downloader,
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert_set_equal(
            set([
                r.url.replace(self.httpd.location, '')
                for r in collect_middleware.requests
            ]), set(self.httpd.sitemap.keys()))
コード例 #10
0
    def setup(self):
        self.req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=lambda x: x,
        )

        self.collect_middleware = CollectRequestResponseMiddleware()

        self.downloader = AiohttpDownloader()

        self.middlewares = (
            self.req_resp_midlleware,
            AiohttpAdapterMiddleware(),
            self.collect_middleware,
        )
コード例 #11
0
ファイル: test_contrib_twisted.py プロジェクト: sibelius/pomp
    def test_timeout(self):

        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=TwistedHttpRequest,
        )
        collect_middleware = CollectRequestResponseMiddleware()

        downloader = TwistedDownloader(
            reactor,
            timeout=0.5,
            middlewares=[collect_middleware]
        )

        downloader.middlewares.insert(0, req_resp_midlleware)

        pomp = Pomp(
            downloader=downloader,
            pipelines=[PrintPipeline()],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/sleep'

        done_defer = defer.Deferred()
        d = pomp.pump(Crawler())

        d.add_callback(done_defer.callback)

        def check(x):
            assert len(collect_middleware.exceptions) == 1
            e = collect_middleware.exceptions[0]
            assert isinstance(e, BaseDownloadException)

            # twisted _newcleint can raise ResponseNeverReceived
            # next assert works only for `oldclient`
            # assert isinstance(e.exception, defer.CancelledError)

        done_defer.addCallback(check)
        return done_defer