def dbw_setup(self, distributed=False):
    """Build a ``DBWorker`` backed by the fake message bus and a fake backend.

    :param distributed: when truthy, ``FakeDistributedBackend`` is configured
        as the backend; otherwise ``FakeBackend`` is used.
    :returns: the ``DBWorker`` constructed from the prepared settings.
    """
    worker_settings = Settings()
    worker_settings.MAX_NEXT_REQUESTS = 64
    worker_settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
    worker_settings.BACKEND = (
        'tests.mocks.components.FakeDistributedBackend'
        if distributed
        else 'tests.mocks.components.FakeBackend'
    )
    # The three positional flags are forwarded unchanged; their meaning is
    # defined by DBWorker's signature, which is not visible from here.
    return DBWorker(worker_settings, True, True, False)
def dbw_setup(self, distributed=False):
    """Build a ``DBWorker`` bound to partition ``"0"`` using fake components.

    :param distributed: when truthy, ``FakeDistributedBackend`` is configured
        as the backend; otherwise ``FakeBackend`` is used.
    :returns: the ``DBWorker`` constructed from the prepared settings.
    """
    # Start from the non-distributed backend and switch only when requested.
    backend_path = 'tests.mocks.components.FakeBackend'
    if distributed:
        backend_path = 'tests.mocks.components.FakeDistributedBackend'

    worker_settings = Settings()
    worker_settings.MAX_NEXT_REQUESTS = 64
    worker_settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
    worker_settings.BACKEND = backend_path
    # Positional flags are forwarded unchanged; see DBWorker for their meaning.
    return DBWorker(worker_settings, False, False, False, partitions="0")
def test_blocking_middleware(self):
    # Scenario: five fake middlewares are chained and FakeMiddlewareBlocking
    # sits at index 2. The asserts below expect the entries at index 0-2 to
    # record every event, while index 3-4, the backend and the canonical
    # solver record nothing.
    def sizes(groups):
        # Length of each collection, in order.
        return [len(group) for group in groups]

    cfg = Settings()
    cfg.BACKEND = 'tests.mocks.components.FakeBackend'
    cfg.MIDDLEWARES = [
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareBlocking',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks',
    ]
    cfg.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    manager = FrontierManager.from_settings(cfg)

    # Drive one event of each kind through the frontier.
    manager.add_seeds([r1, r2, r3])
    crawled = Response(r1.url, request=r1)
    manager.page_crawled(crawled)
    manager.links_extracted(r1, links=[r2])
    manager.request_error(r3, 'error')

    # Nothing (seeds, responses, links, errors) got through to the backend.
    assert sizes(manager.backend.lists) == [0] * 4
    # All three seeds were seen by each of the first three middlewares.
    assert [len(manager.middlewares[i].seeds) for i in range(3)] == [3] * 3
    # Each of the remaining three event lists holds one item for those middlewares.
    assert [sizes(manager.middlewares[i].lists[1:]) for i in range(3)] == [[1] * 3] * 3
    # The last two middlewares and the canonical solver received nothing.
    assert [sizes(manager.middlewares[i].lists) for i in range(3, 5)] == [[0] * 4] * 2
    assert sizes(manager.canonicalsolver.lists) == [0] * 4
def sw_setup_add_seeds(self):
    """Build a ``StrategyWorker`` using the mock ``CrawlingStrategy``.

    The settings select the SQLAlchemy distributed backend, the fake message
    bus and a spider-log consumer batch size of 100.

    :returns: the ``StrategyWorker`` constructed from those settings.
    """
    worker_settings = Settings()
    worker_settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
    worker_settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
    worker_settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
    worker_settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
    # Second positional argument is forwarded as-is; its meaning is defined by
    # StrategyWorker's signature (not visible here).
    return StrategyWorker(worker_settings, True)
def sw_setup_filtered_links(self):
    """Build a ``StrategyWorker`` using ``FilteredLinksCrawlingStrategy``.

    The settings select the SQLAlchemy distributed backend, the fake message
    bus and a spider-log consumer batch size of 100.

    :returns: the ``StrategyWorker`` constructed from those settings.
    """
    worker_settings = Settings()
    worker_settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
    worker_settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
    worker_settings.STRATEGY = 'tests.test_worker_strategy.FilteredLinksCrawlingStrategy'
    worker_settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
    # Second positional argument is forwarded as-is; its meaning is defined by
    # StrategyWorker's signature (not visible here).
    return StrategyWorker(worker_settings, False)
def test_blocking_middleware(self):
    # Checks that events stop propagating at the third configured middleware
    # (FakeMiddlewareBlocking): entries 0-2 record them, while entries 3-4,
    # the backend and the canonical solver stay empty.
    frontier_settings = Settings()
    frontier_settings.BACKEND = 'tests.mocks.components.FakeBackend'
    frontier_settings.MIDDLEWARES = [
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareBlocking',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks',
    ]
    frontier_settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    frontier = FrontierManager.from_settings(frontier_settings)

    frontier.add_seeds([r1, r2, r3])
    page = Response(r1.url, request=r1)
    frontier.page_crawled(page)
    frontier.links_extracted(r1, links=[r2])
    frontier.request_error(r3, 'error')

    # The seeds, responses, links and errors have not reached the backend.
    backend_sizes = [len(items) for items in frontier.backend.lists]
    assert backend_sizes == [0] * 4

    # The 3 seeds reach the first three middlewares.
    seed_counts = [len(frontier.middlewares[i].seeds) for i in range(3)]
    assert seed_counts == [3] * 3

    # The error, response and link reached the first three middlewares.
    reached = [
        [len(items) for items in frontier.middlewares[i].lists[1:]]
        for i in range(3)
    ]
    assert reached == [[1] * 3] * 3

    # The values do not reach the bottom 2 middlewares and the canonical solver.
    blocked = [
        [len(items) for items in frontier.middlewares[i].lists]
        for i in range(3, 5)
    ]
    assert blocked == [[0] * 4] * 2
    solver_sizes = [len(items) for items in frontier.canonicalsolver.lists]
    assert solver_sizes == [0] * 4
def test_blocking_middleware(self):
    # Scenario: seven middlewares are configured. Indices 0-1 are the domain
    # and URL-fingerprint middlewares (not inspected here); indices 2-6 are
    # the fake ones, with FakeMiddlewareBlocking at index 4. The asserts
    # expect indices 2-4 to record every event, while indices 5-6, the
    # backend and the canonical solver record nothing.
    def sizes(groups):
        # Length of each collection, in order.
        return [len(group) for group in groups]

    cfg = Settings()
    cfg.BACKEND = 'tests.mocks.components.FakeBackend'
    cfg.MIDDLEWARES = [
        'frontera.contrib.middlewares.domain.DomainMiddleware',
        'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareBlocking',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks',
    ]
    cfg.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    cfg.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
    manager = LocalFrontierManager.from_settings(cfg)

    # Rewind the shared seeds file before handing it to the manager.
    SEEDS_FILE.seek(0)
    manager.add_seeds(SEEDS_FILE)
    crawled = Response(r1.url, request=r1)
    manager.page_crawled(crawled)
    manager.links_extracted(r1, links=[r2])
    manager.request_error(r3, 'error')

    # Nothing (seeds, responses, links, errors) got through to the backend.
    assert sizes(manager.backend.lists) == [0] * 4
    # Three seed requests were seen by each of the first three fake middlewares.
    assert [len(manager.middlewares[i].requests) for i in range(2, 5)] == [3] * 3
    # Each of the remaining three event lists holds one item for those middlewares.
    assert [sizes(manager.middlewares[i].lists[1:]) for i in range(2, 5)] == [[1] * 3] * 3
    # The last two middlewares and the canonical solver received nothing.
    assert [sizes(manager.middlewares[i].lists) for i in range(5, 7)] == [[0] * 4] * 2
    assert sizes(manager.canonicalsolver.lists) == [0] * 4
def test_blocking_middleware(self):
    # Checks that events stop propagating at FakeMiddlewareBlocking (index 4
    # of the configured chain): indices 2-4 record them, while indices 5-6,
    # the backend and the canonical solver stay empty. Indices 0-1 (domain
    # and URL-fingerprint middlewares) are not inspected.
    frontier_settings = Settings()
    frontier_settings.BACKEND = 'tests.mocks.components.FakeBackend'
    frontier_settings.MIDDLEWARES = [
        'frontera.contrib.middlewares.domain.DomainMiddleware',
        'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareBlocking',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks',
    ]
    frontier_settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    frontier_settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
    frontier = LocalFrontierManager.from_settings(frontier_settings)

    # Rewind the shared seeds file before handing it to the manager.
    SEEDS_FILE.seek(0)
    frontier.add_seeds(SEEDS_FILE)
    page = Response(r1.url, request=r1)
    frontier.page_crawled(page)
    frontier.links_extracted(r1, links=[r2])
    frontier.request_error(r3, 'error')

    # The seeds, responses, links and errors have not reached the backend.
    backend_sizes = [len(items) for items in frontier.backend.lists]
    assert backend_sizes == [0] * 4

    # The 3 seeds reach the first three fake middlewares.
    seed_counts = [len(frontier.middlewares[i].requests) for i in range(2, 5)]
    assert seed_counts == [3] * 3

    # The error, response and link reached the first three fake middlewares.
    reached = [
        [len(items) for items in frontier.middlewares[i].lists[1:]]
        for i in range(2, 5)
    ]
    assert reached == [[1] * 3] * 3

    # The values do not reach the bottom 2 middlewares and the canonical solver.
    blocked = [
        [len(items) for items in frontier.middlewares[i].lists]
        for i in range(5, 7)
    ]
    assert blocked == [[0] * 4] * 2
    solver_sizes = [len(items) for items in frontier.canonicalsolver.lists]
    assert solver_sizes == [0] * 4
def strategy(self):
    """Create a ``TestingCrawlingStrategy`` through its ``from_worker`` factory.

    A frontier manager is built in strategy-worker mode on the SQLAlchemy
    distributed backend, then handed to the factory together with a
    ``MessageBusStream`` and a ``StatesContext`` wrapping ``MemoryStates(10)``.

    :returns: whatever ``TestingCrawlingStrategy.from_worker`` returns.
    """
    cfg = Settings()
    cfg.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
    frontier = FrontierManager.from_settings(
        cfg,
        db_worker=False,
        strategy_worker=True,
    )
    bus_stream = MessageBusStream()
    context = StatesContext(MemoryStates(10))
    # The second positional argument to from_worker is explicitly None.
    return TestingCrawlingStrategy.from_worker(frontier, None, bus_stream, context)
def strategy(self):
    """Return the strategy instance owned by a worker frontier manager.

    The manager is built in strategy-worker mode on the SQLAlchemy distributed
    backend with ``DummyCrawlingStrategy`` configured as the strategy.

    :returns: the ``strategy`` attribute of the created manager.
    """
    cfg = Settings()
    cfg.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
    cfg.STRATEGY = 'tests.test_strategy.DummyCrawlingStrategy'
    frontier = WorkerFrontierManager.from_settings(
        cfg,
        db_worker=False,
        strategy_worker=True,
    )
    # NOTE(review): the stream and states context below are constructed but
    # never used by the return value. They look like leftovers; confirm their
    # constructors have no side effects before deleting them.
    unused_stream = MessageBusStream()
    unused_states = MemoryStates(10)
    unused_context = StatesContext(unused_states)
    return frontier.strategy
def setUp(self):
    """Create the ``StrategyWorker`` under test and store it on ``self.sw``.

    The settings select the SQLAlchemy distributed backend, the fake message
    bus and a spider-log consumer batch size of 100.
    """
    worker_settings = Settings()
    worker_settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
    worker_settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
    worker_settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
    # The strategy class is passed directly; the two trailing positional
    # arguments are explicitly None (see StrategyWorker for their meaning).
    self.sw = StrategyWorker(worker_settings, CrawlingStrategy, None, None)