Esempio n. 1
0
 def dbw_setup(self, distributed=False):
     """Create a ``DBWorker`` configured with the fake message bus and a fake backend.

     :param distributed: when truthy the ``FakeDistributedBackend`` mock is
         configured, otherwise the ``FakeBackend`` mock.
     :return: the ``DBWorker`` built from those settings.
     """
     conf = Settings()
     conf.MAX_NEXT_REQUESTS = 64
     conf.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     conf.BACKEND = (
         'tests.mocks.components.FakeDistributedBackend'
         if distributed
         else 'tests.mocks.components.FakeBackend'
     )
     # NOTE(review): the three positional flags are forwarded verbatim; their
     # meaning is defined by DBWorker's signature -- confirm there.
     return DBWorker(conf, True, True, False)
Esempio n. 2
0
 def dbw_setup(self, distributed=False):
     """Return a ``DBWorker`` created with fake components and ``partitions="0"``.

     :param distributed: truthy selects the ``FakeDistributedBackend`` mock,
         falsy selects the ``FakeBackend`` mock.
     :return: the ``DBWorker`` built with the fake message bus and the
         chosen backend.
     """
     backend_by_mode = {
         True: 'tests.mocks.components.FakeDistributedBackend',
         False: 'tests.mocks.components.FakeBackend',
     }
     worker_settings = Settings()
     worker_settings.MAX_NEXT_REQUESTS = 64
     worker_settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     worker_settings.BACKEND = backend_by_mode[bool(distributed)]
     return DBWorker(worker_settings, False, False, False, partitions="0")
Esempio n. 3
0
    def test_blocking_middleware(self):
        """Check that events stop at the third of five configured middlewares.

        Seeds, a response, extracted links and a request error are fed to the
        manager. The first three middlewares must have recorded them, while
        the last two middlewares, the canonical solver and the backend must
        have recorded nothing.
        """
        settings = Settings()
        settings.BACKEND = 'tests.mocks.components.FakeBackend'
        settings.MIDDLEWARES = [
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        manager = FrontierManager.from_settings(settings)
        manager.add_seeds([r1, r2, r3])
        crawled = Response(r1.url, request=r1)
        manager.page_crawled(crawled)
        manager.links_extracted(r1, links=[r2])
        manager.request_error(r3, 'error')

        # Backend: no seeds, responses, links or errors arrived.
        backend_sizes = [len(bucket) for bucket in manager.backend.lists]
        assert backend_sizes == [0] * 4

        # The 3 seeds reached each of the first three middlewares.
        seed_counts = [len(manager.middlewares[idx].seeds) for idx in range(3)]
        assert seed_counts == [3] * 3

        # One error, one response and one link reached each of the first three.
        reached = []
        for idx in range(3):
            recorded = manager.middlewares[idx].lists[1:]
            reached.append([len(bucket) for bucket in recorded])
        assert reached == [[1] * 3] * 3

        # Nothing reached the bottom two middlewares ...
        untouched = []
        for idx in range(3, 5):
            recorded = manager.middlewares[idx].lists
            untouched.append([len(bucket) for bucket in recorded])
        assert untouched == [[0] * 4] * 2

        # ... nor the canonical solver.
        solver_sizes = [len(bucket) for bucket in manager.canonicalsolver.lists]
        assert solver_sizes == [0] * 4
Esempio n. 4
0
 def sw_setup_add_seeds(self):
     """Build a ``StrategyWorker`` using the mock ``CrawlingStrategy``.

     The settings select the SQLAlchemy distributed backend, the fake message
     bus and a spider-log consumer batch size of 100.

     :return: ``StrategyWorker`` created with the settings and ``True`` as
         the second positional argument.
     """
     conf = Settings()
     conf.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     conf.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     conf.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
     conf.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
     # NOTE(review): meaning of the boolean flag is defined by StrategyWorker.
     worker = StrategyWorker(conf, True)
     return worker
Esempio n. 5
0
 def sw_setup_filtered_links(self):
     """Build a ``StrategyWorker`` using ``FilteredLinksCrawlingStrategy``.

     The settings select the SQLAlchemy distributed backend, the fake message
     bus and a spider-log consumer batch size of 100.

     :return: ``StrategyWorker`` created with the settings and ``False`` as
         the second positional argument.
     """
     conf = Settings()
     conf.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     conf.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     conf.STRATEGY = 'tests.test_worker_strategy.FilteredLinksCrawlingStrategy'
     conf.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
     # NOTE(review): meaning of the boolean flag is defined by StrategyWorker.
     worker = StrategyWorker(conf, False)
     return worker
    def test_blocking_middleware(self):
        """Verify that nothing propagates past the third configured middleware.

        The manager receives seeds, a crawled response, extracted links and a
        request error; the recording lists of every component are then
        compared against the expected counts.
        """
        def lengths(items):
            # Size of each recording list in ``items``, in order.
            return [len(item) for item in items]

        conf = Settings()
        conf.BACKEND = 'tests.mocks.components.FakeBackend'
        conf.MIDDLEWARES = [
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        conf.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        fm = FrontierManager.from_settings(conf)
        fm.add_seeds([r1, r2, r3])
        fm.page_crawled(Response(r1.url, request=r1))
        fm.links_extracted(r1, links=[r2])
        fm.request_error(r3, 'error')

        # Seeds, responses, links and errors never reached the backend.
        assert lengths(fm.backend.lists) == [0] * 4
        # All 3 seeds were seen by each of the first three middlewares.
        assert [len(fm.middlewares[pos].seeds) for pos in range(3)] == [3] * 3
        # The error, response and link were seen by the first three as well.
        assert [lengths(fm.middlewares[pos].lists[1:])
                for pos in range(3)] == [[1] * 3] * 3
        # The bottom two middlewares and the canonical solver saw nothing.
        assert [lengths(fm.middlewares[pos].lists)
                for pos in range(3, 5)] == [[0] * 4] * 2
        assert lengths(fm.canonicalsolver.lists) == [0] * 4
Esempio n. 7
0
    def test_blocking_middleware(self):
        """Check that events stop at the blocking fake middleware.

        Two contrib middlewares sit at positions 0-1 and the five fake
        middlewares at positions 2-6. After seeds, a response, links and an
        error are fed to the manager, the fakes at positions 2-4 must have
        recorded them, while positions 5-6, the canonical solver and the
        backend must have recorded nothing.
        """
        settings = Settings()
        settings.BACKEND = 'tests.mocks.components.FakeBackend'
        settings.MIDDLEWARES = [
            'frontera.contrib.middlewares.domain.DomainMiddleware',
            'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
        manager = LocalFrontierManager.from_settings(settings)
        # Rewind the shared seeds file so it is read from the beginning.
        SEEDS_FILE.seek(0)
        manager.add_seeds(SEEDS_FILE)
        crawled = Response(r1.url, request=r1)
        manager.page_crawled(crawled)
        manager.links_extracted(r1, links=[r2])
        manager.request_error(r3, 'error')

        # Backend: no seeds, responses, links or errors arrived.
        backend_sizes = [len(bucket) for bucket in manager.backend.lists]
        assert backend_sizes == [0] * 4

        # Three seed requests reached each fake middleware at positions 2-4.
        request_counts = [len(manager.middlewares[idx].requests)
                          for idx in range(2, 5)]
        assert request_counts == [3] * 3

        # One error, one response and one link reached those same middlewares.
        reached = []
        for idx in range(2, 5):
            recorded = manager.middlewares[idx].lists[1:]
            reached.append([len(bucket) for bucket in recorded])
        assert reached == [[1] * 3] * 3

        # Nothing reached the bottom two middlewares ...
        untouched = []
        for idx in range(5, 7):
            recorded = manager.middlewares[idx].lists
            untouched.append([len(bucket) for bucket in recorded])
        assert untouched == [[0] * 4] * 2

        # ... nor the canonical solver.
        solver_sizes = [len(bucket) for bucket in manager.canonicalsolver.lists]
        assert solver_sizes == [0] * 4
    def test_blocking_middleware(self):
        """Verify that nothing propagates past the blocking fake middleware.

        The middleware chain is two contrib middlewares followed by five
        fakes. The manager receives seeds from ``SEEDS_FILE``, a crawled
        response, extracted links and a request error; the recording lists of
        every component are then compared against the expected counts.
        """
        def lengths(items):
            # Size of each recording list in ``items``, in order.
            return [len(item) for item in items]

        conf = Settings()
        conf.BACKEND = 'tests.mocks.components.FakeBackend'
        conf.MIDDLEWARES = [
            'frontera.contrib.middlewares.domain.DomainMiddleware',
            'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        conf.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        conf.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
        fm = LocalFrontierManager.from_settings(conf)
        # Rewind the shared seeds file so it is read from the beginning.
        SEEDS_FILE.seek(0)
        fm.add_seeds(SEEDS_FILE)
        fm.page_crawled(Response(r1.url, request=r1))
        fm.links_extracted(r1, links=[r2])
        fm.request_error(r3, 'error')

        # Seeds, responses, links and errors never reached the backend.
        assert lengths(fm.backend.lists) == [0] * 4
        # Three seed requests were seen by the fakes at positions 2-4.
        assert [len(fm.middlewares[pos].requests)
                for pos in range(2, 5)] == [3] * 3
        # The error, response and link were seen by those same middlewares.
        assert [lengths(fm.middlewares[pos].lists[1:])
                for pos in range(2, 5)] == [[1] * 3] * 3
        # The bottom two middlewares and the canonical solver saw nothing.
        assert [lengths(fm.middlewares[pos].lists)
                for pos in range(5, 7)] == [[0] * 4] * 2
        assert lengths(fm.canonicalsolver.lists) == [0] * 4
 def sw_setup_add_seeds(self):
     """Return a ``StrategyWorker`` configured with the mock ``CrawlingStrategy``.

     Uses the SQLAlchemy distributed backend, the fake message bus and a
     spider-log consumer batch size of 100.

     :return: ``StrategyWorker`` created with the settings and ``True`` as
         the second positional argument.
     """
     worker_settings = Settings()
     worker_settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     worker_settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     worker_settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
     worker_settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
     # NOTE(review): meaning of the boolean flag is defined by StrategyWorker.
     return StrategyWorker(worker_settings, True)
Esempio n. 10
0
 def sw_setup_filtered_links(self):
     """Return a ``StrategyWorker`` configured with ``FilteredLinksCrawlingStrategy``.

     Uses the SQLAlchemy distributed backend, the fake message bus and a
     spider-log consumer batch size of 100.

     :return: ``StrategyWorker`` created with the settings and ``False`` as
         the second positional argument.
     """
     worker_settings = Settings()
     worker_settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     worker_settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     worker_settings.STRATEGY = 'tests.test_worker_strategy.FilteredLinksCrawlingStrategy'
     worker_settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
     # NOTE(review): meaning of the boolean flag is defined by StrategyWorker.
     return StrategyWorker(worker_settings, False)
Esempio n. 11
0
 def strategy(self):
     """Create a ``TestingCrawlingStrategy`` through its ``from_worker`` factory.

     A frontier manager is built in strategy-worker mode on the SQLAlchemy
     distributed backend, then handed to the factory together with a
     ``MessageBusStream`` and a ``StatesContext`` wrapping ``MemoryStates(10)``.

     :return: whatever ``TestingCrawlingStrategy.from_worker`` returns.
     """
     conf = Settings()
     conf.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     frontier = FrontierManager.from_settings(
         conf, db_worker=False, strategy_worker=True)
     message_stream = MessageBusStream()
     context = StatesContext(MemoryStates(10))
     # The second positional argument is deliberately None, as in the
     # original call.
     return TestingCrawlingStrategy.from_worker(
         frontier, None, message_stream, context)
 def strategy(self):
     """Return the ``strategy`` attribute of a worker frontier manager.

     The manager is built in strategy-worker mode on the SQLAlchemy
     distributed backend with ``DummyCrawlingStrategy`` configured as the
     strategy.

     :return: ``manager.strategy`` of the freshly built manager.
     """
     conf = Settings()
     conf.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     conf.STRATEGY = 'tests.test_strategy.DummyCrawlingStrategy'
     worker_manager = WorkerFrontierManager.from_settings(
         conf, db_worker=False, strategy_worker=True)
     # NOTE(review): the three objects below are built but never used for the
     # returned value; they are kept so construction behaviour is unchanged.
     # Confirm whether they can be removed.
     _stream = MessageBusStream()
     _states = MemoryStates(10)
     _states_ctx = StatesContext(_states)
     return worker_manager.strategy
Esempio n. 13
0
 def setUp(self):
     """Create the ``StrategyWorker`` under test and store it on ``self.sw``.

     The settings select the SQLAlchemy distributed backend, the fake message
     bus and a spider-log consumer batch size of 100. The worker is given the
     ``CrawlingStrategy`` class followed by two ``None`` arguments.
     """
     conf = Settings()
     conf.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     conf.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     conf.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
     # NOTE(review): the two trailing None arguments are positional; their
     # meaning is defined by StrategyWorker's signature -- confirm there.
     worker = StrategyWorker(conf, CrawlingStrategy, None, None)
     self.sw = worker