def test_blocking_middleware(self):
    """Check that events stop propagating at FakeMiddlewareBlocking.

    Seeds, a crawled response, extracted links and a request error are fed
    to the manager. The assertions expect them to be recorded by the first
    three middlewares only, and by neither the last two middlewares, the
    canonical solver nor the backend.
    """
    settings = Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    middleware_paths = [
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareBlocking',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks',
    ]
    settings.MIDDLEWARES = middleware_paths
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'

    manager = FrontierManager.from_settings(settings)
    manager.add_seeds([r1, r2, r3])
    manager.page_crawled(Response(r1.url, request=r1))
    manager.links_extracted(r1, links=[r2])
    manager.request_error(r3, 'error')

    # The seeds, responses, links and errors have not reached the backend.
    backend_sizes = [len(bucket) for bucket in manager.backend.lists]
    assert backend_sizes == [0, 0, 0, 0]

    # The 3 seeds reach the first three middlewares (indices 0-2).
    reached = range(3)
    seed_counts = [len(manager.middlewares[pos].seeds) for pos in reached]
    assert seed_counts == [3, 3, 3]

    # The error, response and link reached those same three middlewares:
    # one entry in each list after the first.
    event_counts = [
        [len(bucket) for bucket in manager.middlewares[pos].lists[1:]]
        for pos in reached
    ]
    assert event_counts == [[1, 1, 1], [1, 1, 1], [1, 1, 1]]

    # The values do not reach the bottom 2 middlewares (indices 3-4) ...
    blocked_counts = [
        [len(bucket) for bucket in manager.middlewares[pos].lists]
        for pos in range(3, 5)
    ]
    assert blocked_counts == [[0, 0, 0, 0], [0, 0, 0, 0]]

    # ... nor the canonical solver.
    solver_sizes = [len(bucket) for bucket in manager.canonicalsolver.lists]
    assert solver_sizes == [0, 0, 0, 0]
def test_blocking_middleware(self):
    """Check that events stop propagating at FakeMiddlewareBlocking.

    Seeds from SEEDS_FILE, a crawled response, extracted links and a request
    error are fed to the manager. The assertions expect them to be recorded
    by the middlewares at indices 2-4 only, and by neither the middlewares at
    indices 5-6, the canonical solver nor the backend.
    """
    settings = Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    middleware_paths = [
        'frontera.contrib.middlewares.domain.DomainMiddleware',
        'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareBlocking',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks',
    ]
    settings.MIDDLEWARES = middleware_paths
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'

    manager = LocalFrontierManager.from_settings(settings)
    # Rewind the seeds file before handing it to the manager.
    SEEDS_FILE.seek(0)
    manager.add_seeds(SEEDS_FILE)
    manager.page_crawled(Response(r1.url, request=r1))
    manager.links_extracted(r1, links=[r2])
    manager.request_error(r3, 'error')

    # The seeds, responses, links and errors have not reached the backend.
    backend_sizes = [len(bucket) for bucket in manager.backend.lists]
    assert backend_sizes == [0, 0, 0, 0]

    # The 3 seeds reach the middlewares at indices 2-4 -- presumably the fake
    # middlewares up to and including the blocking one, assuming
    # manager.middlewares follows the MIDDLEWARES order.
    reached = range(2, 5)
    request_counts = [
        len(manager.middlewares[pos].requests) for pos in reached
    ]
    assert request_counts == [3, 3, 3]

    # The error, response and link reached those same three middlewares:
    # one entry in each list after the first.
    event_counts = [
        [len(bucket) for bucket in manager.middlewares[pos].lists[1:]]
        for pos in reached
    ]
    assert event_counts == [[1, 1, 1], [1, 1, 1], [1, 1, 1]]

    # The values do not reach the bottom 2 middlewares (indices 5-6) ...
    blocked_counts = [
        [len(bucket) for bucket in manager.middlewares[pos].lists]
        for pos in range(5, 7)
    ]
    assert blocked_counts == [[0, 0, 0, 0], [0, 0, 0, 0]]

    # ... nor the canonical solver.
    solver_sizes = [len(bucket) for bucket in manager.canonicalsolver.lists]
    assert solver_sizes == [0, 0, 0, 0]
def test_blocking_middleware(self):
    """Check that events stop propagating at FakeMiddlewareBlocking.

    Seeds, a crawled response, extracted links and a request error are fed
    to the manager. The assertions expect them to be recorded by the first
    three middlewares only, and by neither the last two middlewares, the
    canonical solver nor the backend.
    """
    settings = Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    middleware_paths = [
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareBlocking',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks',
    ]
    settings.MIDDLEWARES = middleware_paths
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'

    manager = FrontierManager.from_settings(settings)
    manager.add_seeds([r1, r2, r3])
    manager.page_crawled(Response(r1.url, request=r1))
    manager.links_extracted(r1, links=[r2])
    manager.request_error(r3, 'error')

    # The seeds, responses, links and errors have not reached the backend.
    backend_sizes = [len(bucket) for bucket in manager.backend.lists]
    assert backend_sizes == [0, 0, 0, 0]

    # The 3 seeds reach the first three middlewares (indices 0-2).
    reached = range(3)
    seed_counts = [len(manager.middlewares[pos].seeds) for pos in reached]
    assert seed_counts == [3, 3, 3]

    # The error, response and link reached those same three middlewares:
    # one entry in each list after the first.
    event_counts = [
        [len(bucket) for bucket in manager.middlewares[pos].lists[1:]]
        for pos in reached
    ]
    assert event_counts == [[1, 1, 1], [1, 1, 1], [1, 1, 1]]

    # The values do not reach the bottom 2 middlewares (indices 3-4) ...
    blocked_counts = [
        [len(bucket) for bucket in manager.middlewares[pos].lists]
        for pos in range(3, 5)
    ]
    assert blocked_counts == [[0, 0, 0, 0], [0, 0, 0, 0]]

    # ... nor the canonical solver.
    solver_sizes = [len(bucket) for bucket in manager.canonicalsolver.lists]
    assert solver_sizes == [0, 0, 0, 0]
def test_blocking_middleware(self):
    """Check that events stop propagating at FakeMiddlewareBlocking.

    Seeds from SEEDS_FILE, a crawled response, extracted links and a request
    error are fed to the manager. The assertions expect them to be recorded
    by the middlewares at indices 2-4 only, and by neither the middlewares at
    indices 5-6, the canonical solver nor the backend.
    """
    settings = Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    middleware_paths = [
        'frontera.contrib.middlewares.domain.DomainMiddleware',
        'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareBlocking',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks',
    ]
    settings.MIDDLEWARES = middleware_paths
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'

    manager = LocalFrontierManager.from_settings(settings)
    # Rewind the seeds file before handing it to the manager.
    SEEDS_FILE.seek(0)
    manager.add_seeds(SEEDS_FILE)
    manager.page_crawled(Response(r1.url, request=r1))
    manager.links_extracted(r1, links=[r2])
    manager.request_error(r3, 'error')

    # The seeds, responses, links and errors have not reached the backend.
    backend_sizes = [len(bucket) for bucket in manager.backend.lists]
    assert backend_sizes == [0, 0, 0, 0]

    # The 3 seeds reach the middlewares at indices 2-4 -- presumably the fake
    # middlewares up to and including the blocking one, assuming
    # manager.middlewares follows the MIDDLEWARES order.
    reached = range(2, 5)
    request_counts = [
        len(manager.middlewares[pos].requests) for pos in reached
    ]
    assert request_counts == [3, 3, 3]

    # The error, response and link reached those same three middlewares:
    # one entry in each list after the first.
    event_counts = [
        [len(bucket) for bucket in manager.middlewares[pos].lists[1:]]
        for pos in reached
    ]
    assert event_counts == [[1, 1, 1], [1, 1, 1], [1, 1, 1]]

    # The values do not reach the bottom 2 middlewares (indices 5-6) ...
    blocked_counts = [
        [len(bucket) for bucket in manager.middlewares[pos].lists]
        for pos in range(5, 7)
    ]
    assert blocked_counts == [[0, 0, 0, 0], [0, 0, 0, 0]]

    # ... nor the canonical solver.
    solver_sizes = [len(bucket) for bucket in manager.canonicalsolver.lists]
    assert solver_sizes == [0, 0, 0, 0]