Example #1
    def test_queue_get_requests_with_count(self):
        class DummyDownloaderWithWorkers(DummyDownloader):
            def get_workers_count(self):
                return 5

        class SimpleQueue(BaseQueue):
            def __init__(self):
                self.requests = []

            def get_requests(self, count=None):
                # the downloader reports 5 workers, so the engine asks for a batch of 5
                assert count == 5
                try:
                    return self.requests.pop()
                except IndexError:
                    return  # empty queue

            def put_requests(self, request):
                self.requests.append(request)

        pomp = Pomp(
            downloader=DummyDownloaderWithWorkers(),
            middlewares=(url_to_request_middl, ),
        )

        # override internal queue with own
        pomp.queue = SimpleQueue()

        pomp.pump(Crawler())
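A note on the queue above: it always returns a single request even though the engine asks for a batch of five. Below is a minimal sketch of a queue that honours the `count` hint; `BatchQueue` is hypothetical, it assumes the same `BaseQueue` interface as the test, and whether the engine accepts a list of requests from the queue may vary between pomp versions.

    class BatchQueue(BaseQueue):
        def __init__(self):
            self.requests = []

        def get_requests(self, count=None):
            # no worker hint: behave like the single-request queue above
            if count is None:
                return self.requests.pop() if self.requests else None
            # otherwise hand back up to `count` requests at once
            batch, self.requests = self.requests[:count], self.requests[count:]
            return batch or None

        def put_requests(self, request):
            self.requests.append(request)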
Example #2
    def test_urllib_downloader(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=lambda x: x,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = UrllibDownloader()

        pomp = Pomp(
            downloader=downloader,
            middlewares=(
                req_resp_midlleware,
                UrllibAdapterMiddleware(),
                collect_middleware,
            ),
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert \
            set([r.url.replace(self.httpd.location, '')
                for r in collect_middleware.requests]) == \
            set(self.httpd.sitemap.keys())
Example #3
    def test_exception_on_crawler_worker(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=lambda x: x,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentUrllibDownloader(
            pool_size=2,
        )

        pomp = Pomp(
            downloader=downloader,
            middlewares=(
                req_resp_midlleware,
                UrllibAdapterMiddleware(),
                collect_middleware,
            ),
            pipelines=[],
        )

        pomp.pump(ConcurrentCrawler(
            pool_size=2,
            worker_class=MockedCrawlerWorkerWithException,
        ))

        assert len(collect_middleware.requests) == 1
        assert len(collect_middleware.exceptions) == 1
Example #4
    def test_urllib_downloader(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=lambda x: x,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = UrllibDownloader()

        pomp = Pomp(
            downloader=downloader,
            middlewares=(
                req_resp_midlleware,
                UrllibAdapterMiddleware(),
                collect_middleware,
            ),
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert \
            set([r.url.replace(self.httpd.location, '')
                for r in collect_middleware.requests]) == \
            set(self.httpd.sitemap.keys())
Example #5
    def do_simple_test(self, queue=None):
        req_resp_middleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=TwistedHttpRequest,
        )
        collect_middleware = CollectRequestResponseMiddleware()
        downloader = TwistedDownloader(
            reactor, middlewares=[collect_middleware])

        downloader.middlewares.insert(0, req_resp_middleware)

        pomp = Pomp(
            downloader=downloader,
            pipelines=[PrintPipeline()],
            queue=queue,
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        done_defer = defer.Deferred()
        d = pomp.pump(Crawler())

        d.add_callback(done_defer.callback)

        def check(x):
            assert_set_equal(
                set([r.url.replace(self.httpd.location, '')
                    for r in collect_middleware.requests]),
                set(self.httpd.sitemap.keys())
            )

        done_defer.addCallback(check)
        return done_defer
Example #6
    def test_thread_pooled_downloader(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=lambda x: x,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ThreadedDownloader(
            middlewares=[UrllibAdapterMiddleware(), collect_middleware])

        downloader.middlewares.insert(0, req_resp_midlleware)

        pomp = Pomp(
            downloader=downloader,
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert_set_equal(
            set([
                r.url.replace(self.httpd.location, '')
                for r in collect_middleware.requests
            ]), set(self.httpd.sitemap.keys()))
Example #7
    def test_queue_get_requests_with_count(self):

        class DummyDownloaderWithWorkers(DummyDownloader):

            def get_workers_count(self):
                return 5

        class SimpleQueue(BaseQueue):

            def __init__(self):
                self.requests = []

            def get_requests(self, count=None):
                # the downloader reports 5 workers, so the engine asks for a batch of 5
                assert count == 5
                try:
                    return self.requests.pop()
                except IndexError:
                    return  # empty queue

            def put_requests(self, request):
                self.requests.append(request)

        pomp = Pomp(
            downloader=DummyDownloaderWithWorkers(),
            middlewares=(url_to_request_middl, ),
        )

        # override internal queue with own
        pomp.queue = SimpleQueue()

        pomp.pump(Crawler())
Example #8
    def test_concurrent_urllib_downloader(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=UrllibHttpRequest,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentUrllibDownloader(
            middlewares=[collect_middleware]
        )

        downloader.middlewares.insert(0, req_resp_midlleware)

        pomp = Pomp(
            downloader=downloader,
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert_set_equal(
            set([r.url.replace(self.httpd.location, '')
                for r in collect_middleware.requests]),
            set(self.httpd.sitemap.keys())
        )
Example #9
    def test_exception_on_downloader_worker(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url='http://localhost',
            request_factory=UrllibHttpRequest,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentDownloader(
            pool_size=5,
            worker_class=MockedDownloadWorkerWithException,
            worker_kwargs=None,
        )

        pomp = Pomp(
            downloader=downloader,
            middlewares=(req_resp_midlleware, collect_middleware, ),
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert len(collect_middleware.requests) == 1
        assert len(collect_middleware.exceptions) == 1
Example #10
    def test_concurrent_crawler(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=lambda x: x,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentUrllibDownloader(
            pool_size=2,
        )

        pomp = Pomp(
            downloader=downloader,
            middlewares=(
                req_resp_midlleware,
                UrllibAdapterMiddleware(),
                collect_middleware,
            ),
            pipelines=[],
        )

        pomp.pump(ConcurrentCrawler(
            pool_size=2,
            worker_class=MockedCrawlerWorker,
        ))

        assert \
            set([r.url.replace(self.httpd.location, '')
                for r in collect_middleware.requests]) == \
            set(self.httpd.sitemap.keys())
Example #11
    def test_concurrent_urllib_downloader(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=UrllibHttpRequest,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentUrllibDownloader(
            middlewares=[collect_middleware])

        downloader.middlewares.insert(0, req_resp_midlleware)

        pomp = Pomp(
            downloader=downloader,
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert_set_equal(
            set([
                r.url.replace(self.httpd.location, '')
                for r in collect_middleware.requests
            ]), set(self.httpd.sitemap.keys()))
Example #12
    def test_concurrent_downloader(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url='http://localhost',
            request_factory=UrllibHttpRequest,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentDownloader(
            pool_size=5,
            worker_class=MockedDownloadWorker,
            worker_kwargs=None,
        )

        pomp = Pomp(
            downloader=downloader,
            middlewares=(req_resp_midlleware, collect_middleware, ),
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert \
            set([r.url.replace('http://localhost', '')
                for r in collect_middleware.requests]) == \
            set(MockedDownloadWorker.sitemap.keys())
Example #13
    def test_thread_pooled_downloader(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=lambda x: x,
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ThreadedDownloader(
            middlewares=[UrllibAdapterMiddleware(), collect_middleware]
        )

        downloader.middlewares.insert(0, req_resp_midlleware)

        pomp = Pomp(
            downloader=downloader,
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        pomp.pump(Crawler())

        assert_set_equal(
            set([r.url.replace(self.httpd.location, '')
                for r in collect_middleware.requests]),
            set(self.httpd.sitemap.keys())
        )
Example #14
    def test_exception_handling(self):
        class CatchException(BaseMiddleware):
            def __init__(self):
                self.exceptions = []

            def process_exception(self, exception, crawler, downloader):
                self.exceptions.append(exception)
                return exception

        class MockCrawler(BaseCrawler):
            def next_requests(self, response):
                return

            def extract_items(self, response):
                return

        catch_exception_middleware = CatchException()
        pomp = Pomp(
            downloader=UrllibDownloader(),
            middlewares=(
                UrllibAdapterMiddleware(),
                catch_exception_middleware,
            ),
            pipelines=[],
        )

        MockCrawler.ENTRY_REQUESTS = [
            'https://123.456.789.01:8081/fake_url',
            '%s/root' % self.httpd.location,
        ]

        pomp.pump(MockCrawler())

        assert len(catch_exception_middleware.exceptions) == 1
Example #15
    def test_exceptions(self):

        req_resp_middleware = RequestResponseMiddleware(
            prefix_url='invalid url',
            request_factory=TwistedHttpRequest,
        )
        collect_middleware = CollectRequestResponseMiddleware()

        downloader = TwistedDownloader(
            reactor, middlewares=[collect_middleware])

        downloader.middlewares.insert(0, req_resp_middleware)

        pomp = Pomp(
            downloader=downloader,
            pipelines=[PrintPipeline()],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        done_defer = defer.Deferred()
        d = pomp.pump(Crawler())

        d.add_callback(done_defer.callback)

        def check(x):
            assert len(collect_middleware.exceptions) == 1
            assert isinstance(
                collect_middleware.exceptions[0], BaseDownloadException)

        done_defer.addCallback(check)
        return done_defer
Example #16
    def test_queue_crawler(self):
        road = RoadPipeline()

        class SimpleQueue(BaseQueue):

            def __init__(self):
                self.requests = []

            def get_requests(self):
                try:
                    return self.requests.pop()
                except IndexError:
                    return  # empty queue

            def put_requests(self, request):
                self.requests.append(request)

        queue = SimpleQueue()

        pomp = Pomp(
            downloader=DummyDownloader(middlewares=[url_to_request_middl]),
            pipelines=[
                road,
            ],
            queue=queue,
        )

        pomp.pump(Crawler())

        assert_equal(set([item.url for item in road.collection]), set([
            'http://python.org/1',
            'http://python.org/1/trash',
            'http://python.org/2',
        ]))
Example #17
def test_exception_on_processing_response():

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(downloader=DummyDownloader(middlewares=[RaiseOnResponseMiddleware(), collect_middleware]))

    pomp.pump(Crawler())

    assert_equal(len(collect_middleware.exceptions), 1)
    assert_equal(len(collect_middleware.requests), 1)
    assert_equal(len(collect_middleware.responses), 1)
Example #18
def crawler_worker(crawler_class, source_queue, stop_event):
    pid = os.getpid()
    log.debug('Start crawler worker: %s', pid)
    pomp = Pomp(
        downloader=UrllibDownloader(timeout=3),
        pipelines=[],
        queue=WrappedQueue(source_queue, stop_event),
    )
    pomp.pump(crawler_class())
    log.debug('Stop crawler worker: %s', pid)
    return True
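`crawler_worker` is written to run in a separate process: it logs its pid and reads from a shared queue guarded by a stop event. A hedged sketch of how several such workers might be launched follows; the multiprocessing wiring is an assumption layered on top of the example, and the crawler class is whatever class you pass in.

import multiprocessing


def run_crawler_workers(crawler_class, worker_count=2):
    # illustrative only: fan crawler_worker out over a small process pool
    manager = multiprocessing.Manager()
    source_queue = manager.Queue()   # shared queue wrapped by WrappedQueue in each worker
    stop_event = manager.Event()
    with multiprocessing.Pool(worker_count) as pool:
        results = [
            pool.apply_async(crawler_worker, (crawler_class, source_queue, stop_event))
            for _ in range(worker_count)
        ]
        # ... feed source_queue here, then signal the workers to stop
        stop_event.set()
        for result in results:
            result.get()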
Example #19
def crawler_worker(crawler_class, source_queue, stop_event):
    pid = os.getpid()
    log.debug('Start crawler worker: %s', pid)
    pomp = Pomp(
        downloader=dnl(timeout=3),
        pipelines=[],
        queue=WrappedQueue(source_queue, stop_event),
    )
    pomp.pump(crawler_class())
    log.debug('Stop crawler worker: %s', pid)
    return True
Example #20
def test_exception_on_processing_response_callback():
    class CrawlerWithException(Crawler):
        def extract_items(self, *args, **kwargs):
            raise Exception("some exception on extract items")

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(downloader=DummyDownloader(middlewares=[collect_middleware]))

    pomp.pump(CrawlerWithException())

    assert_equal(len(collect_middleware.exceptions), 1)
    assert_equal(len(collect_middleware.requests), 1)
    assert_equal(len(collect_middleware.responses), 1)
Example #21
def test_exception_on_processing_response():

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(downloader=DummyDownloader(middlewares=[
        RaiseOnResponseMiddleware(),
        collect_middleware,
    ]), )

    pomp.pump(Crawler())

    assert_equal(len(collect_middleware.exceptions), 1)
    assert_equal(len(collect_middleware.requests), 1)
    assert_equal(len(collect_middleware.responses), 1)
Example #22
    def test_pipeline_exception(self):
        class PipelineWithException(BasePipeline):
            def process(self, crawler, item):
                raise RuntimeError("some exception")

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                PipelineWithException(),
            ],
        )

        pomp.pump(Crawler())
Example #23
def test_exception_on_processing_response_callback():
    class CrawlerWithException(Crawler):
        def extract_items(self, *args, **kwargs):
            raise Exception("some exception on extract items")

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(downloader=DummyDownloader(),
                middlewares=(collect_middleware, ))

    pomp.pump(CrawlerWithException())

    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.responses) == 1
Example #24
    def test_crawler_return_none(self):
        class CrawlerWithoutItems(BaseCrawler):
            ENTRY_REQUESTS = 'http://localhost/'

            def extract_items(self, *args, **kwargs):
                pass

            def next_requests(self, *args, **kwargs):
                pass

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
        )
        pomp.pump(CrawlerWithoutItems())
Example #25
    def test_crawler_return_none(self):

        class CrawlerWithoutItems(BaseCrawler):
            ENTRY_REQUESTS = 'http://localhost/'

            def extract_items(self, *args, **kwargs):
                pass

            def next_requests(self, *args, **kwargs):
                pass

        pomp = Pomp(
            downloader=DummyDownloader(middlewares=[url_to_request_middl]),
        )
        pomp.pump(CrawlerWithoutItems())
Example #26
def test_exception_on_processing_request():

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RaiseOnRequestMiddleware(),
            collect_middleware,
        ),
    )

    pomp.pump(Crawler())

    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 0
    assert len(collect_middleware.responses) == 0
Example #27
def test_exception_on_processing_request():

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RaiseOnRequestMiddleware(),
            collect_middleware,
        ),
    )

    pomp.pump(Crawler())

    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 0
    assert len(collect_middleware.responses) == 0
Example #28
    def test_pipeline_exception(self):

        class PipelineWithException(BasePipeline):

            def process(self, crawler, item):
                raise RuntimeError("some exception")

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                PipelineWithException(),
            ],
        )

        pomp.pump(Crawler())
Example #29
    def test_crawler_without_next_request_method_result(self):

        class CrawlerWithoutNextRequestMethod(Crawler):
            def next_requests(self, *args, **kwargs):
                pass

        road = RoadPipeline()
        pomp = Pomp(
            downloader=DummyDownloader(middlewares=[url_to_request_middl]),
            pipelines=[
                road,
            ],
        )
        pomp.pump(CrawlerWithoutNextRequestMethod())
        assert_equal(set([item.url for item in road.collection]), set([
            'http://python.org/1',
            'http://python.org/1/trash',
        ]))
Example #30
def test_exception_on_processing_exception():

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RaiseOnRequestMiddleware(),
            RaiseOnExceptionMiddleware(),
            collect_middleware,
        ),
    )

    pomp.pump(Crawler())

    # one exception on request middleware plus one on exception processing
    assert len(collect_middleware.exceptions) == 1 + 1
    assert len(collect_middleware.requests) == 0
    assert len(collect_middleware.responses) == 0
Example #31
def test_exception_on_processing_done():

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RequestResponseMiddleware(request_factory=DummyRequest,
                                      bodyjson=False),
            collect_middleware,
        ),
    )

    pomp.pump(RaiseOnProcessingDoneCralwer())

    # one exception raised from the crawler's processing-done hook
    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.responses) == 1
Example #32
def test_exception_on_processing_exception():

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RaiseOnRequestMiddleware(),
            RaiseOnExceptionMiddleware(),
            collect_middleware,
        ),
    )

    pomp.pump(Crawler())

    # one exception on request middleware plus one on exception processing
    assert len(collect_middleware.exceptions) == 1 + 1
    assert len(collect_middleware.requests) == 0
    assert len(collect_middleware.responses) == 0
Example #33
    def test_crawler_without_next_request_method_result(self):
        class CrawlerWithoutNextRequestMethod(Crawler):
            def next_requests(self, *args, **kwargs):
                pass

        road = RoadPipeline()
        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                road,
            ],
        )
        pomp.pump(CrawlerWithoutNextRequestMethod())
        assert set([item.url for item in road.collection]) == set([
            'http://python.org/1',
            'http://python.org/1/trash',
        ])
Example #34
    def test_concurrent_crawler(self):
        req_resp_midlleware = RequestResponseMiddleware(prefix_url=self.httpd.location, request_factory=lambda x: x)

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentUrllibDownloader(
            pool_size=2, middlewares=[UrllibAdapterMiddleware(), collect_middleware]
        )

        downloader.middlewares.insert(0, req_resp_midlleware)

        pomp = Pomp(downloader=downloader, pipelines=[])

        pomp.pump(ConcurrentCrawler(pool_size=2, worker_class=MockedCrawlerWorker))

        assert_set_equal(
            set([r.url.replace(self.httpd.location, "") for r in collect_middleware.requests]),
            set(self.httpd.sitemap.keys()),
        )
Example #35
    def test_pipeline(self):

        class IncPipeline(BasePipeline):

            def process(self, crawler, item):
                item.value += 1
                return item

        class FilterPipeline(BasePipeline):

            def process(self, crawler, item):
                if 'trash' in item.url:
                    return None
                return item

        class SavePipeline(BasePipeline):

            def __init__(self, collection):
                self.collection = collection

            def process(self, crawler, item):
                self.collection.append(item)
                return item

        result = []

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                IncPipeline(),
                FilterPipeline(),
                SavePipeline(result),
            ],
        )

        pomp.pump(Crawler())

        assert [(item.url, item.value) for item in result] == [
            ('http://python.org/1', 2),
            ('http://python.org/2', 2),
        ]
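Pipelines run in the order they are listed, and returning None from process() drops the item, which is why the '/trash' item never reaches SavePipeline above. For pipelines that hold external resources, a sketch of an open/close pattern is shown below; the start/stop hook names are assumed from pomp's BasePipeline, and the file path handling is hypothetical, not part of the test.

import json


class JsonLinesPipeline(BasePipeline):
    # hedged sketch: write each item as one JSON line, opening the file
    # in start() and closing it in stop()

    def __init__(self, path):
        self.path = path
        self.fh = None

    def start(self, crawler):
        self.fh = open(self.path, 'w')

    def process(self, crawler, item):
        self.fh.write(json.dumps({'url': item.url, 'value': item.value}) + '\n')
        return item

    def stop(self, crawler):
        if self.fh:
            self.fh.close()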
Example #36
def test_exception_on_processing_done():

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RequestResponseMiddleware(
                request_factory=DummyRequest,
                bodyjson=False
            ),
            collect_middleware,
        ),
    )

    pomp.pump(RaiseOnProcessingDoneCralwer())

    # one exception raised from the crawler's processing-done hook
    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.responses) == 1
Example #37
    def test_pipeline(self):

        class IncPipeline(BasePipeline):

            def process(self, crawler, item):
                item.value += 1
                return item

        class FilterPipeline(BasePipeline):

            def process(self, crawler, item):
                if 'trash' in item.url:
                    return None
                return item

        class SavePipeline(BasePipeline):

            def __init__(self, collection):
                self.collection = collection

            def process(self, crawler, item):
                self.collection.append(item)
                return item

        result = []

        pomp = Pomp(
            downloader=DummyDownloader(middlewares=[url_to_request_middl]),
            pipelines=[
                IncPipeline(),
                FilterPipeline(),
                SavePipeline(result),
            ],
        )

        pomp.pump(Crawler())

        assert_equal([(item.url, item.value) for item in result], [
            ('http://python.org/1', 2),
            ('http://python.org/2', 2),
        ])
Example #38
    def test_crawler_dive_methods(self):
        road = RoadPipeline()

        pomp = Pomp(
            downloader=DummyDownloader(middlewares=[url_to_request_middl]),
            pipelines=[
                road,
            ],
        )

        # Depth first method
        pomp.pump(Crawler())

        assert_equal(set([item.url for item in road.collection]), set([
            'http://python.org/1',
            'http://python.org/1/trash',
            'http://python.org/2',
        ]))

        # Width first method
        road.reset()

        class DummyWidthCrawler(Crawler):
            CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD

        pomp.pump(DummyWidthCrawler())

        assert_equal(set([item.url for item in road.collection]), set([
            'http://python.org/1',
            'http://python.org/2',
            'http://python.org/1/trash',
        ]))
Example #39
    def test_timeout(self):

        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=TwistedHttpRequest,
        )
        collect_middleware = CollectRequestResponseMiddleware()

        downloader = TwistedDownloader(
            reactor,
            timeout=0.5,
            middlewares=[collect_middleware]
        )

        downloader.middlewares.insert(0, req_resp_midlleware)

        pomp = Pomp(
            downloader=downloader,
            pipelines=[PrintPipeline()],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/sleep'

        done_defer = defer.Deferred()
        d = pomp.pump(Crawler())

        d.add_callback(done_defer.callback)

        def check(x):
            assert len(collect_middleware.exceptions) == 1
            e = collect_middleware.exceptions[0]
            assert isinstance(e, BaseDownloadException)

            # twisted's _newclient can raise ResponseNeverReceived,
            # so the next assert only holds for the `oldclient`
            # assert isinstance(e.exception, defer.CancelledError)

        done_defer.addCallback(check)
        return done_defer
Example #40
    def test_concurrent_downloader(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url="http://localhost", request_factory=UrllibHttpRequest
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentDownloader(
            pool_size=5, worker_class=MockedDownloadWorker, worker_kwargs=None, middlewares=(collect_middleware,)
        )
        downloader.middlewares.insert(0, req_resp_midlleware)

        pomp = Pomp(downloader=downloader, pipelines=[])

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = "/root"

        pomp.pump(Crawler())

        assert_set_equal(
            set([r.url.replace("http://localhost", "") for r in collect_middleware.requests]),
            set(MockedDownloadWorker.sitemap.keys()),
        )
Example #41
    def test_queue_crawler(self):
        road = RoadPipeline()

        class SimpleQueue(BaseQueue):

            def __init__(self):
                self.requests = []

            def get_requests(self):
                try:
                    return self.requests.pop()
                except IndexError:
                    return  # empty queue

            def put_requests(self, request):
                self.requests.append(request)

        queue = SimpleQueue()

        pomp = Pomp(
            downloader=DummyDownloader(middlewares=[url_to_request_middl]),
            pipelines=[
                road,
            ],
            queue=queue,
        )

        class DummyWidthCrawler(Crawler):
            CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD

        pomp.pump(DummyWidthCrawler())

        assert_equal(set([item.url for item in road.collection]), set([
            'http://python.org/1',
            'http://python.org/1/trash',
            'http://python.org/2',
        ]))
Example #42
    def test_queue_crawler(self):
        road = RoadPipeline()

        class SimpleQueue(BaseQueue):

            def __init__(self):
                self.requests = []

            def get_requests(self, count=None):
                # the downloader has no workers, so no batch size is requested
                assert count is None
                try:
                    return self.requests.pop()
                except IndexError:
                    return  # empty queue

            def put_requests(self, request):
                self.requests.append(request)

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                road,
            ],
        )

        # override internal queue with own
        pomp.queue = SimpleQueue()

        pomp.pump(Crawler())

        assert set([item.url for item in road.collection]) == set([
            'http://python.org/1',
            'http://python.org/1/trash',
            'http://python.org/2',
        ])
Example #43
    def test_exception_on_downloader_worker(self):
        req_resp_midlleware = RequestResponseMiddleware(
            prefix_url="http://localhost", request_factory=UrllibHttpRequest
        )

        collect_middleware = CollectRequestResponseMiddleware()

        downloader = ConcurrentDownloader(
            pool_size=5,
            worker_class=MockedDownloadWorkerWithException,
            worker_kwargs=None,
            middlewares=(collect_middleware,),
        )
        downloader.middlewares.insert(0, req_resp_midlleware)

        pomp = Pomp(downloader=downloader, pipelines=[])

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = "/root"

        pomp.pump(Crawler())

        assert_equal(len(collect_middleware.requests), 1)
        assert_equal(len(collect_middleware.exceptions), 1)
Example #44
    def test_exception_handling(self):

        class CatchException(BaseMiddleware):

            def __init__(self):
                self.exceptions = []

            def process_exception(self, exception, crawler, downloader):
                self.exceptions.append(exception)
                return exception

        class MockCrawler(BaseCrawler):
            def next_requests(self, response):
                return

            def extract_items(self, response):
                return

        catch_exception_middleware = CatchException()
        pomp = Pomp(
            downloader=UrllibDownloader(),
            middlewares=(
                UrllibAdapterMiddleware(),
                catch_exception_middleware,
            ),
            pipelines=[],
        )

        MockCrawler.ENTRY_REQUESTS = [
            'https://123.456.789.01:8081/fake_url',
            '%s/root' % self.httpd.location,
        ]

        pomp.pump(MockCrawler())

        assert len(catch_exception_middleware.exceptions) == 1
Example #45
    def test_queue_crawler(self):
        road = RoadPipeline()

        class SimpleQueue(BaseQueue):
            def __init__(self):
                self.requests = []

            def get_requests(self, count=None):
                # the downloader has no workers, so no batch size is requested
                assert count is None
                try:
                    return self.requests.pop()
                except IndexError:
                    return  # empty queue

            def put_requests(self, request):
                self.requests.append(request)

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                road,
            ],
        )

        # override internal queue with own
        pomp.queue = SimpleQueue()

        pomp.pump(Crawler())

        assert set([item.url for item in road.collection]) == set([
            'http://python.org/1',
            'http://python.org/1/trash',
            'http://python.org/2',
        ])
Example #46
    def test_dive_methods(self, crawler_class=None):
        crawler_class = crawler_class or Crawler
        road = RoadPipeline()

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                road,
            ],
            breadth_first=False,
        )

        # Depth first method
        pomp.pump(crawler_class())

        log.debug("in road %s", [item.url for item in road.collection])
        assert [item.url for item in road.collection] == [
            'http://python.org/1',
            'http://python.org/1/trash',
            'http://python.org/2',
        ]

        # Width first method
        road.reset()

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                road,
            ],
            breadth_first=True,
        )
        pomp.pump(crawler_class())

        log.debug("in road %s", [item.url for item in road.collection])
        assert [item.url for item in road.collection] == [
            'http://python.org/1',
            'http://python.org/2',
            'http://python.org/1/trash',
        ]
Example #47
    def test_dive_methods(self, crawler_class=None):
        crawler_class = crawler_class or Crawler
        road = RoadPipeline()

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                road,
            ],
            breadth_first=False,
        )

        # Depth first method
        pomp.pump(crawler_class())

        log.debug("in road %s", [item.url for item in road.collection])
        assert [item.url for item in road.collection] == [
            'http://python.org/1',
            'http://python.org/1/trash',
            'http://python.org/2',
        ]

        # Width first method
        road.reset()

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                road,
            ],
            breadth_first=True,
        )
        pomp.pump(crawler_class())

        log.debug("in road %s", [item.url for item in road.collection])
        assert [item.url for item in road.collection] == [
            'http://python.org/1',
            'http://python.org/2',
            'http://python.org/1/trash',
        ]
Example #48
# so we need to protect the main code with 'if __name__ == '__main__':'
# to avoid creating subprocesses recursively:
if __name__ == '__main__':
    pool_size = 2
    start_time = datetime.now()
    # start phantomjs nodes
    ph_drivers = deque(
        [webdriver.PhantomJS() for _ in range(pool_size)]
    )
    # Grab URLs of all cities
    city_pomp = Pomp(
        downloader=PhantomDownloader(
            pool_size=2,
            worker_class=PhantomWorker,
            phantom_drivers=ph_drivers,
        ),
        pipelines=[PhobiaCityPipeline()]
    )
    city_pomp.pump(PhobiaCityCrawler())

    engine = db_connect()
    Session = sessionmaker(bind=engine)
    se = Session()
    all_cities = [(city.id, city.url) for city in se.query(PhobiaCity).all()]
    se.close()

    statistics = StatisticMiddleware()

    for city_id, city_url in all_cities:
        quest_pomp = Pomp(
Example #49
        # extract next urls
        for link in response.tree.xpath(self.NEXT_URLS_XPATH):
            yield UrllibHttpRequest(urljoin(self.BASE_URL, link.get('href')))


if __name__ == '__main__':
    from pomp.core.engine import Pomp
    from pomp.contrib.concurrenttools import ConcurrentUrllibDownloader

    statistics = StatisticMiddleware()
    middlewares = (
        statistics,
        LXMLDownloaderMiddleware(encoding='utf-8'),
    )

    filepath = os.path.join(tempfile.gettempdir(), 'quotes.csv')
    pomp = Pomp(
        downloader=ConcurrentUrllibDownloader(
            pool_size=3,
        ),
        middlewares=middlewares,
        pipelines=(
            PrintPipeline(),
            CsvPipeline(filepath, delimiter=';', quotechar='"'),
        ),
    )

    pomp.pump(QuotesSpider())
    print("Statistics:\n %s" % statistics)
Example #50
        if self._next_requests:
            yield self._next_requests.pop()


if __name__ == '__main__':
    from pomp.core.engine import Pomp

    try:
        from pomp.contrib.concurrenttools import ConcurrentUrllibDownloader \
            as dnl
    except ImportError:
        from pomp.contrib.urllibtools import ThreadedDownloader as dnl

    statistics = StatisticMiddleware()
    middlewares = (
        statistics,
        LXMLDownloaderMiddleware(encoding='utf-8'),
    )

    filepath = os.path.join(tempfile.gettempdir(), 'dmoz.csv')
    pomp = Pomp(
        downloader=dnl(middlewares=middlewares, timeout=10),
        pipelines=[
            PrintPipeline(),
            CsvPipeline(filepath, delimiter=';', quotechar='"'),
        ],
    )

    pomp.pump(DmozSpider())
    print("Statistics:\n", statistics)
Example #51
        for request in iterator(requests):
            response = self._fetch(request)
            responses.append(response)
        return responses

    def _fetch(self, request):
        try:
            res = requestslib.get(request.url)
            return ReqResponse(request, res)
        except Exception as e:
            print('Exception on %s: %s' % (request, e))
            return BaseDownloadException(request, exception=e)


if __name__ == '__main__':
    from pomp.core.base import BaseCrawler
    from pomp.core.engine import Pomp

    class Crawler(BaseCrawler):
        ENTRY_REQUESTS = ReqRequest('http://python.org/news/')

        def extract_items(self, response):
            print(response.body)

        def next_requests(self, response):
            return None  # one page crawler

    pomp = Pomp(downloader=RequestsDownloader(), )

    pomp.pump(Crawler())
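The `ReqRequest` and `ReqResponse` wrappers used above are not shown in this snippet. A minimal sketch of what they might look like follows; the base classes and property names are assumed from pomp's BaseHttpRequest/BaseHttpResponse interfaces and can differ between pomp versions.

from pomp.core.base import BaseHttpRequest, BaseHttpResponse


class ReqRequest(BaseHttpRequest):
    # hedged sketch: a thin wrapper carrying the URL for the downloader

    def __init__(self, url):
        self._url = url

    @property
    def url(self):
        return self._url


class ReqResponse(BaseHttpResponse):
    # hedged sketch: pairs the original request with the requests-lib response

    def __init__(self, request, response):
        self.req = request
        self.resp = response
        self.body = getattr(response, 'text', None)

    @property
    def request(self):
        return self.req

    @property
    def response(self):
        return self.resp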
Example #52
            # follow to the first two persons from `also likes`
            for href in response.tree.xpath(self.ALSO_LIKES_LINKS_XPATH)[:2]:
                # do not repeat requests
                url = urljoin(self.BASE_URL, href)
                if url not in self._parsed_also_likes:
                    yield PhantomRequest(url, level=level + 1)


if __name__ == '__main__':
    from pomp.core.engine import Pomp
    from e02_quotes import (
        PrintPipeline,
        LXMLDownloaderMiddleware,
        StatisticMiddleware,
    )

    statistics = StatisticMiddleware()
    pomp = Pomp(
        downloader=PhantomDownloader(
            pool_size=2,
            worker_class=PhantomDownloadWorker,
        ),
        middlewares=(
            statistics,
            LXMLDownloaderMiddleware(),
        ),
        pipelines=[PrintPipeline()],
    )
    pomp.pump(TwitterSpider())
    print("Statistics:\n %s" % statistics)
Example #53
import re
from pomp.core.base import BaseCrawler
from pomp.contrib.urllibtools import UrllibHttpRequest


python_sentence_re = re.compile(r'[\w\s]{0,}python[\s\w]{0,}', re.I | re.M)


class MyCrawler(BaseCrawler):
    """Extract all sentences with `python` word"""
    ENTRY_REQUESTS = UrllibHttpRequest('http://python.org/news')  # entry point

    def extract_items(self, response):
        for i in python_sentence_re.findall(response.body.decode('utf-8')):
            sentence = i.strip()
            print("Sentence: {}".format(sentence))
            yield sentence


if __name__ == '__main__':
    from pomp.core.engine import Pomp
    from pomp.contrib.urllibtools import UrllibDownloader

    pomp = Pomp(
        downloader=UrllibDownloader(),
    )

    pomp.pump(MyCrawler())
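The sentences yielded by extract_items are only printed here. To collect them as well, a pipeline can be added inside the same `__main__` guard; the `ListPipeline` below is a hypothetical illustration using the BasePipeline.process signature seen in the test examples earlier.

    from pomp.core.base import BasePipeline

    class ListPipeline(BasePipeline):
        # hypothetical: collect every extracted sentence in a plain list
        def __init__(self, collection):
            self.collection = collection

        def process(self, crawler, item):
            self.collection.append(item)
            return item

    sentences = []
    pomp = Pomp(
        downloader=UrllibDownloader(),
        pipelines=[ListPipeline(sentences)],
    )
    pomp.pump(MyCrawler())
    print("Collected %d sentences" % len(sentences))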