Beispiel #1
0
    def test_blocking_middleware(self):
        """Check how far events travel when the chain has a blocking middleware.

        The manager is configured with ``FakeMiddlewareBlocking`` at index 4 of
        the middleware list. Seeds, a crawled page, extracted links and a
        request error are fed in; the fake middlewares at indices 2-4 are
        expected to record them, while the middlewares at indices 5-6, the
        canonical solver and the backend are expected to record nothing.
        """
        def sizes(buckets):
            # Length of every recorded-event container, in order.
            return [len(bucket) for bucket in buckets]

        blocking_settings = Settings()
        blocking_settings.BACKEND = 'tests.mocks.components.FakeBackend'
        blocking_settings.MIDDLEWARES = [
            'frontera.contrib.middlewares.domain.DomainMiddleware',
            'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        blocking_settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        blocking_settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
        manager = LocalFrontierManager.from_settings(blocking_settings)

        # Rewind the shared seeds file before feeding it to the manager.
        SEEDS_FILE.seek(0)
        manager.add_seeds(SEEDS_FILE)
        crawled = Response(r1.url, request=r1)
        manager.page_crawled(crawled)
        manager.links_extracted(r1, links=[r2])
        manager.request_error(r3, 'error')

        # Nothing (seeds, responses, links, errors) made it to the backend.
        backend_sizes = sizes(manager.backend.lists)
        assert backend_sizes == [0, 0, 0, 0]

        # Each fake middleware at indices 2-4 saw three seed requests.
        seed_counts = [len(manager.middlewares[idx].requests)
                       for idx in (2, 3, 4)]
        assert seed_counts == [3, 3, 3]

        # Those same middlewares saw exactly one entry in each container after
        # the first one (the original comment names these: error, response, link).
        event_counts = [sizes(manager.middlewares[idx].lists[1:])
                        for idx in (2, 3, 4)]
        assert event_counts == [[1, 1, 1], [1, 1, 1], [1, 1, 1]]

        # The two middlewares after the blocking one recorded nothing ...
        blocked_counts = [sizes(manager.middlewares[idx].lists)
                          for idx in (5, 6)]
        assert blocked_counts == [[0, 0, 0, 0], [0, 0, 0, 0]]

        # ... and neither did the canonical solver.
        solver_sizes = sizes(manager.canonicalsolver.lists)
        assert solver_sizes == [0, 0, 0, 0]
    def test_blocking_middleware(self):
        """Ensure events stop propagating at ``FakeMiddlewareBlocking``.

        With the blocking middleware placed at index 4 of the chain, the fake
        middlewares at indices 2-4 must record the seeds, the response, the
        links and the error, whereas the middlewares at indices 5-6, the
        canonical solver and the backend must stay empty.
        """
        middleware_chain = [
            'frontera.contrib.middlewares.domain.DomainMiddleware',
            'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        config = Settings()
        config.BACKEND = 'tests.mocks.components.FakeBackend'
        config.MIDDLEWARES = middleware_chain
        config.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        config.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
        frontier = LocalFrontierManager.from_settings(config)

        # Start reading the shared seeds file from its beginning.
        SEEDS_FILE.seek(0)
        frontier.add_seeds(SEEDS_FILE)
        frontier.page_crawled(Response(r1.url, request=r1))
        frontier.links_extracted(r1, links=[r2])
        frontier.request_error(r3, 'error')

        nothing_recorded = [0, 0, 0, 0]

        # The seeds, responses, links and errors have not reached the backend.
        assert list(map(len, frontier.backend.lists)) == nothing_recorded

        # The 3 seeds reach the fake middlewares at indices 2-4.
        assert [len(frontier.middlewares[pos].requests)
                for pos in range(2, 5)] == [3, 3, 3]

        # The error, response and link reached those same middlewares: one
        # entry in every container after the first.
        assert [list(map(len, frontier.middlewares[pos].lists[1:]))
                for pos in range(2, 5)] == [[1, 1, 1], [1, 1, 1], [1, 1, 1]]

        # The values do not reach the bottom 2 middlewares ...
        assert [list(map(len, frontier.middlewares[pos].lists))
                for pos in range(5, 7)] == [nothing_recorded, nothing_recorded]

        # ... nor the canonical solver.
        assert list(map(len, frontier.canonicalsolver.lists)) == nothing_recorded
def run_add_seeds(settings, seeds_file):
    """Feed the seeds stored in a file to a locally built frontier manager.

    Args:
        settings: Settings object handed to
            ``LocalFrontierManager.from_settings``.
        seeds_file: Path of the seeds file. It is opened in binary mode and
            the open handle is passed to ``manager.add_seeds``.

    Raises:
        Whatever ``open`` raises if the file cannot be opened, plus anything
        raised by the manager calls; nothing is caught here.
    """
    # Use a context manager so the file handle is always released, including
    # when the manager raises; the handle was previously never closed.
    with open(seeds_file, "rb") as fh:
        logger.info("Starting local seeds addition from file %s", seeds_file)

        manager = LocalFrontierManager.from_settings(settings)
        manager.add_seeds(fh)
        # Keep the file open until the manager is fully shut down, in case it
        # still reads from the handle while stopping.
        manager.stop()
        manager.close()

    logger.info("Seeds addition finished")
 def setup_frontier_manager(self, settings=None):
     """Create a ``LocalFrontierManager`` configured with the fake components.

     A new ``Settings`` object is created when ``settings`` is falsy;
     otherwise the given object is modified in place. Its backend,
     middleware list, canonical solver and strategy are overwritten before
     the manager is built from it.
     """
     if not settings:
         settings = Settings()

     fake_middlewares = [
         'frontera.contrib.middlewares.domain.DomainMiddleware',
         'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
         'tests.mocks.components.FakeMiddleware',
         'tests.mocks.components.FakeMiddlewareModifySeeds',
         'tests.mocks.components.FakeMiddlewareModifyResponse',
         'tests.mocks.components.FakeMiddlewareModifyLinks',
     ]

     settings.BACKEND = 'tests.mocks.components.FakeBackend'
     settings.MIDDLEWARES = fake_middlewares
     settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
     settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'

     manager = LocalFrontierManager.from_settings(settings)
     return manager
Beispiel #5
0
 def setup_frontier_manager(self, settings=None):
     """Return a ``LocalFrontierManager`` built on the mock test components.

     ``settings`` is reused (and mutated) when truthy; otherwise a fresh
     ``Settings`` instance is created. The backend, middlewares, canonical
     solver and strategy entries are always replaced by the test values.
     """
     settings = settings if settings else Settings()

     # Applied in this order: backend, middlewares, canonical solver, strategy.
     overrides = (
         ('BACKEND', 'tests.mocks.components.FakeBackend'),
         ('MIDDLEWARES', [
             'frontera.contrib.middlewares.domain.DomainMiddleware',
             'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
             'tests.mocks.components.FakeMiddleware',
             'tests.mocks.components.FakeMiddlewareModifySeeds',
             'tests.mocks.components.FakeMiddlewareModifyResponse',
             'tests.mocks.components.FakeMiddlewareModifyLinks',
         ]),
         ('CANONICAL_SOLVER', 'tests.mocks.components.FakeCanonicalSolver'),
         ('STRATEGY', 'tests.mocks.components.CrawlingStrategy'),
     )
     for option, value in overrides:
         setattr(settings, option, value)

     return LocalFrontierManager.from_settings(settings)
Beispiel #6
0
    def __init__(self, settings, no_batches, no_scoring, no_incoming):
        """Set up the Kafka endpoints, the local frontier manager and the slot.

        Args:
            settings: Settings object; read through ``settings.get(...)`` and
                the ``MAX_NEXT_REQUESTS`` attribute, and passed to
                ``LocalFrontierManager.from_settings``.
            no_batches: Forwarded to ``Slot``.
            no_scoring: When truthy, no scoring consumer is created; also
                forwarded to ``Slot``.
            no_incoming: Forwarded to ``Slot``.
        """
        # One Kafka client is shared by the producer, consumers and fetcher.
        self._kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = KafkaProducer(self._kafka,
                                       partitioner=Crc32NamePartitioner,
                                       codec=snappy)

        # Consumer for the incoming topic.
        # 1048576 = 1 MiB, 10485760 = 10 MiB (presumably bytes - confirm
        # against the KafkaConsumer API).
        self._in_consumer = KafkaConsumer(self._kafka,
                                          settings.get('FRONTIER_GROUP'),
                                          settings.get('INCOMING_TOPIC'),
                                          buffer_size=1048576,
                                          max_buffer_size=10485760)
        # NOTE(review): ``self._scoring_consumer`` is only defined when
        # scoring is enabled; code that reads it must respect ``no_scoring``.
        if not no_scoring:
            # 262144 = 256 KiB, 1048576 = 1 MiB.
            self._scoring_consumer = KafkaConsumer(
                self._kafka,
                settings.get('FRONTIER_GROUP'),
                settings.get('SCORING_TOPIC'),
                buffer_size=262144,
                max_buffer_size=1048576)

        # Fetcher bound to the outgoing topic and the frontier group.
        self._offset_fetcher = Fetcher(self._kafka,
                                       settings.get('OUTGOING_TOPIC'),
                                       settings.get('FRONTIER_GROUP'))

        # The encoder/decoder use the request/response models exposed by the
        # frontier manager.
        self._manager = LocalFrontierManager.from_settings(settings)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('OUTGOING_TOPIC')
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        # The slot receives this object's callbacks plus the enable/disable
        # flags; NEW_BATCH_DELAY defaults to 60.0 (presumably seconds - TODO
        # confirm against Slot).
        self.slot = Slot(self.new_batch, self.consume_incoming,
                         self.consume_scoring, no_batches, no_scoring,
                         settings.get('NEW_BATCH_DELAY', 60.0), no_incoming)
        self.job_id = 0
        self.stats = {}
Beispiel #7
0
 def get_frontier(self):
     """
     Build a frontier manager from this object's settings.

     Returns the object produced by ``LocalFrontierManager.from_settings``
     for the value of ``self.get_settings()``.
     """
     frontier_settings = self.get_settings()
     return LocalFrontierManager.from_settings(frontier_settings)
Frontier from parameters example
"""
from frontera.utils import graphs
from frontera.core.manager import LocalFrontierManager
from frontera import Request, Response

if __name__ == '__main__':
    # Open the site graph stored at the given SQLite URL.
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Create the frontier directly from keyword parameters (no settings
    # object): request/response model classes, a backend and the middleware
    # chain are all given as dotted import paths.
    frontier = LocalFrontierManager(
        request_model='frontera.core.models.Request',
        response_model='frontera.core.models.Response',
        backend='frontera.contrib.backends.memory.FIFO',
        middlewares=[
            'frontera.contrib.middlewares.domain.DomainMiddleware',
            'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
            'frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware',
        ],
        test_mode=True)

    # Add seeds: one Request per seed page found in the graph.
    frontier.add_seeds([Request(seed.url) for seed in graph.seeds])

    # Ask the frontier which requests should be crawled next.
    next_requests = frontier.get_next_requests()

    # Crawl pages
    for request in next_requests:

        # Fake page crawling
Beispiel #9
0
 def get_frontier(self):
     """
     Create the frontier manager for the current settings.

     The settings come from ``self.get_settings()`` and are passed to
     ``LocalFrontierManager.from_settings``, whose result is returned.
     """
     current_settings = self.get_settings()
     frontier = LocalFrontierManager.from_settings(current_settings)
     return frontier