Esempio n. 1
0
    def test_blocking_middleware(self):
        """Check that events are not seen past ``FakeMiddlewareBlocking``.

        The frontier is built from a fake backend, five fake middlewares (the
        blocking one is third in the list) and a fake canonical solver.  After
        feeding it seeds, a crawled page, extracted links and a request error,
        the test expects the first three middlewares to have recorded
        everything, while the last two middlewares, the canonical solver and
        the backend recorded nothing.
        """
        def sizes(collections):
            # Length of every recorded list, in the order they are stored.
            return [len(items) for items in collections]

        config = Settings()
        config.BACKEND = 'tests.mocks.components.FakeBackend'
        config.MIDDLEWARES = [
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        config.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        manager = FrontierManager.from_settings(config)

        # Drive one of each event type through the frontier.
        manager.add_seeds([r1, r2, r3])
        manager.page_crawled(Response(r1.url, request=r1))
        manager.links_extracted(r1, links=[r2])
        manager.request_error(r3, 'error')

        # None of the seeds, responses, links or errors reached the backend.
        assert sizes(manager.backend.lists) == [0, 0, 0, 0]

        # Middlewares 0-2 (up to and including the blocking one) each hold
        # the three seeds ...
        seed_counts = [len(manager.middlewares[pos].seeds)
                       for pos in (0, 1, 2)]
        assert seed_counts == [3, 3, 3]

        # ... and exactly one entry in each of their three remaining lists
        # (presumably response, link and error -- lists[0] is skipped here).
        event_counts = [sizes(manager.middlewares[pos].lists[1:])
                        for pos in (0, 1, 2)]
        assert event_counts == [[1, 1, 1]] * 3

        # The two middlewares placed after the blocking one saw nothing,
        # and neither did the canonical solver.
        downstream_counts = [sizes(manager.middlewares[pos].lists)
                             for pos in (3, 4)]
        assert downstream_counts == [[0, 0, 0, 0]] * 2
        assert sizes(manager.canonicalsolver.lists) == [0, 0, 0, 0]
Esempio n. 2
0
    def test_blocking_middleware(self):
        """Check that events are not seen past ``FakeMiddlewareBlocking``.

        Two frontera contrib middlewares (domain and URL fingerprint) are
        listed ahead of the five fake ones, so the test indexes the fake
        middlewares at positions 2-6, with the blocking one at position 4.
        Seeds are read from ``SEEDS_FILE`` after rewinding it.  The test
        expects positions 2-4 to have recorded every event, while positions
        5-6, the canonical solver and the backend recorded nothing.
        """
        def sizes(collections):
            # Length of every recorded list, in the order they are stored.
            return [len(items) for items in collections]

        config = Settings()
        config.BACKEND = 'tests.mocks.components.FakeBackend'
        config.MIDDLEWARES = [
            'frontera.contrib.middlewares.domain.DomainMiddleware',
            'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        config.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        config.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
        manager = LocalFrontierManager.from_settings(config)

        # Rewind the shared seeds file before handing it to the frontier.
        SEEDS_FILE.seek(0)
        manager.add_seeds(SEEDS_FILE)

        # Drive one of each remaining event type through the frontier.
        manager.page_crawled(Response(r1.url, request=r1))
        manager.links_extracted(r1, links=[r2])
        manager.request_error(r3, 'error')

        # None of the seeds, responses, links or errors reached the backend.
        assert sizes(manager.backend.lists) == [0, 0, 0, 0]

        # The fake middlewares at positions 2-4 (up to and including the
        # blocking one) each recorded three seed requests ...
        request_counts = [len(manager.middlewares[pos].requests)
                          for pos in (2, 3, 4)]
        assert request_counts == [3, 3, 3]

        # ... and exactly one entry in each of their three remaining lists
        # (presumably response, link and error -- lists[0] is skipped here).
        event_counts = [sizes(manager.middlewares[pos].lists[1:])
                        for pos in (2, 3, 4)]
        assert event_counts == [[1, 1, 1]] * 3

        # The two middlewares placed after the blocking one saw nothing,
        # and neither did the canonical solver.
        downstream_counts = [sizes(manager.middlewares[pos].lists)
                             for pos in (5, 6)]
        assert downstream_counts == [[0, 0, 0, 0]] * 2
        assert sizes(manager.canonicalsolver.lists) == [0, 0, 0, 0]
    def test_blocking_middleware(self):
        """Events must not be recorded beyond ``FakeMiddlewareBlocking``.

        Builds a frontier with a fake backend, five fake middlewares (blocking
        one at index 2) and a fake canonical solver, pushes seeds, a response,
        links and an error through it, then checks which components recorded
        them.
        """
        mock_settings = Settings()
        mock_settings.BACKEND = 'tests.mocks.components.FakeBackend'
        mock_settings.MIDDLEWARES = [
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        mock_settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        frontier = FrontierManager.from_settings(mock_settings)

        frontier.add_seeds([r1, r2, r3])
        page = Response(r1.url, request=r1)
        frontier.page_crawled(page)
        frontier.links_extracted(r1, links=[r2])
        frontier.request_error(r3, 'error')

        # Index ranges on either side of the blocking middleware.
        reached = range(3)
        unreached = range(3, 5)
        # Expected per-component list lengths when nothing was recorded.
        nothing_recorded = [0] * 4

        # The backend recorded no seeds, responses, links or errors.
        backend_sizes = list(map(len, frontier.backend.lists))
        assert backend_sizes == nothing_recorded

        # Each of the first three middlewares recorded the 3 seeds.
        seeds_seen = [len(frontier.middlewares[k].seeds) for k in reached]
        assert seeds_seen == [3] * 3

        # Each of them also recorded one entry in every list after the
        # first one (three lists per middleware).
        others_seen = [
            list(map(len, frontier.middlewares[k].lists[1:]))
            for k in reached
        ]
        assert others_seen == [[1] * 3] * 3

        # The last two middlewares recorded nothing at all.
        tail_seen = [
            list(map(len, frontier.middlewares[k].lists))
            for k in unreached
        ]
        assert tail_seen == [nothing_recorded] * 2

        # The canonical solver recorded nothing either.
        solver_sizes = list(map(len, frontier.canonicalsolver.lists))
        assert solver_sizes == nothing_recorded
    def test_blocking_middleware(self):
        """Events must not be recorded beyond ``FakeMiddlewareBlocking``.

        Builds a local frontier with a fake backend, two frontera contrib
        middlewares followed by five fake ones (blocking one at index 4), a
        fake canonical solver and a mock crawling strategy.  Seeds come from
        ``SEEDS_FILE``; a response, links and an error are then pushed through
        and the test checks which components recorded them.
        """
        mock_settings = Settings()
        mock_settings.BACKEND = 'tests.mocks.components.FakeBackend'
        mock_settings.MIDDLEWARES = [
            'frontera.contrib.middlewares.domain.DomainMiddleware',
            'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        mock_settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        mock_settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
        frontier = LocalFrontierManager.from_settings(mock_settings)

        # Rewind first: the seeds file is shared module-level state.
        SEEDS_FILE.seek(0)
        frontier.add_seeds(SEEDS_FILE)
        page = Response(r1.url, request=r1)
        frontier.page_crawled(page)
        frontier.links_extracted(r1, links=[r2])
        frontier.request_error(r3, 'error')

        # Indices of the fake middlewares on either side of the blocking one;
        # indices 0-1 are the contrib middlewares and are not inspected.
        reached = range(2, 5)
        unreached = range(5, 7)
        # Expected per-component list lengths when nothing was recorded.
        nothing_recorded = [0] * 4

        # The backend recorded no seeds, responses, links or errors.
        backend_sizes = list(map(len, frontier.backend.lists))
        assert backend_sizes == nothing_recorded

        # Fake middlewares 2-4 each recorded 3 seed requests.
        requests_seen = [len(frontier.middlewares[k].requests) for k in reached]
        assert requests_seen == [3] * 3

        # Each of them also recorded one entry in every list after the
        # first one (three lists per middleware).
        others_seen = [
            list(map(len, frontier.middlewares[k].lists[1:]))
            for k in reached
        ]
        assert others_seen == [[1] * 3] * 3

        # Fake middlewares 5-6 recorded nothing at all.
        tail_seen = [
            list(map(len, frontier.middlewares[k].lists))
            for k in unreached
        ]
        assert tail_seen == [nothing_recorded] * 2

        # The canonical solver recorded nothing either.
        solver_sizes = list(map(len, frontier.canonicalsolver.lists))
        assert solver_sizes == nothing_recorded