Esempio n. 1
0
    def __init__(self, settings, no_batches, no_incoming):
        """Wire up message-bus streams, frontier manager, codecs and the slot.

        :param settings: frontier settings object
        :param no_batches: disable generation of new spider-feed batches
        :param no_incoming: disable consumption of the spider log
        """
        bus_cls = load_object(settings.get('MESSAGE_BUS'))
        self.mb = bus_cls(settings)

        # Streams: consume the spider log, produce into the spider feed.
        log_stream = self.mb.spider_log()
        self.spider_feed = self.mb.spider_feed()
        self.spider_log_consumer = log_stream.consumer(partition_id=None, type='db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

        # The scoring log is only consumed when the backend is distributed.
        self.strategy_enabled = isinstance(self._backend, DistributedBackend)
        if self.strategy_enabled:
            scoring_stream = self.mb.scoring_log()
            self.scoring_log_consumer = scoring_stream.consumer()
            self.queue = self._backend.queue

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        by_hostname = settings.get('QUEUE_HOSTNAME_PARTITIONING')
        self.spider_feed_partitioning = 'hostname' if by_hostname else 'fingerprint'
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring,
                         no_batches, self.strategy_enabled,
                         settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {'consumed_since_start': 0,
                      'consumed_scoring_since_start': 0,
                      'pushed_since_start': 0}
        self._logging_task = task.LoopingCall(self.log_status)
Esempio n. 2
0
    def __init__(self, settings, strategy_module):
        """Connect Kafka producer/consumer and build the frontier machinery.

        :param settings: frontier settings object
        :param strategy_module: module providing a CrawlStrategy class
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not an int
        """
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
        partition_id = settings.get('SCORING_PARTITION_ID')
        # PEP 8: `is None` instead of `== None`, identity check for exact type.
        if partition_id is None or type(partition_id) is not int:
            raise AttributeError("Scoring worker partition id isn't set.")
        self._in_consumer = SimpleConsumer(kafka,
                                           settings.get('SCORING_GROUP'),
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760,
                                           partitions=[partition_id])

        self._manager = FrontierManager.from_settings(settings)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('SCORING_TOPIC')
        self.strategy = strategy_module.CrawlStrategy()
        self.backend = self._manager.backend
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0
Esempio n. 3
0
    def test_blocking_middleware(self):
        """A blocking middleware must stop events before the later components."""
        settings = Settings()
        settings.BACKEND = 'tests.mocks.components.FakeBackend'
        settings.MIDDLEWARES = [
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        fm = FrontierManager.from_settings(settings)
        fm.add_seeds([r1, r2, r3])
        fm.page_crawled(Response(r1.url, request=r1))
        fm.links_extracted(r1, links=[r2])
        fm.request_error(r3, 'error')

        # Nothing reached the backend: the blocking middleware stopped it all.
        assert [len(lst) for lst in fm.backend.lists] == [0] * 4
        # All 3 seeds passed through the first three middlewares.
        assert [len(fm.middlewares[i].seeds) for i in range(3)] == [3] * 3
        # One response, one link batch and one error each reached the first three.
        assert [[len(lst) for lst in fm.middlewares[i].lists[1:]]
                for i in range(3)] == [[1] * 3] * 3
        # The bottom two middlewares and the canonical solver saw nothing.
        assert [[len(lst) for lst in fm.middlewares[i].lists]
                for i in range(3, 5)] == [[0] * 4] * 2
        assert [len(lst) for lst in fm.canonicalsolver.lists] == [0] * 4
Esempio n. 4
0
    def __init__(self, settings, strategy_class):
        """Initialize the strategy worker: streams, codecs, strategy and task.

        :param settings: frontier settings object
        :param strategy_class: crawling strategy, built via from_worker()
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not an int
        """
        partition_id = settings.get('SCORING_PARTITION_ID')
        # PEP 8: compare types with `is not`, never `!=`.
        if partition_id is None or type(partition_id) is not int:
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        # Batches score updates before pushing them to the scoring log.
        self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
        self.states_context = StatesContext(self._manager.backend.states)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
        self.states = self._manager.backend.states
        self.stats = {}
        self.job_id = 0
        self.task = LoopingCall(self.work)
Esempio n. 5
0
    def __init__(self, settings, strategy_class):
        """Initialize the strategy worker: streams, codecs, strategy and tasks.

        :param settings: frontier settings object
        :param strategy_class: crawling strategy, built via from_worker()
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not an int
        """
        partition_id = settings.get('SCORING_PARTITION_ID')
        # PEP 8: compare types with `is not`, never `!=`.
        if partition_id is None or type(partition_id) is not int:
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id,
                                            type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings,
                                                      strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        # Batches score updates before pushing them to the scoring log.
        self.update_score = UpdateScoreStream(self._encoder,
                                              self.scoring_log_producer, 1024)
        self.states_context = StatesContext(self._manager.backend.states)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.strategy = strategy_class.from_worker(self._manager,
                                                   self.update_score,
                                                   self.states_context)
        self.states = self._manager.backend.states
        self.stats = {'consumed_since_start': 0}
        self.job_id = 0
        self.task = LoopingCall(self.work)
        self._logging_task = LoopingCall(self.log_status)
        logger.info(
            "Strategy worker is initialized and consuming partition %d",
            partition_id)
Esempio n. 6
0
    def __init__(self, settings, strategy_class):
        """Initialize the strategy worker: streams, codecs, strategy and tasks.

        :param settings: frontier settings object
        :param strategy_class: crawling strategy, built via from_worker()
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not an int
        """
        partition_id = settings.get('SCORING_PARTITION_ID')
        # PEP 8: compare types with `is not`, never `!=`.
        if partition_id is None or type(partition_id) is not int:
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
        # Encoder/decoder come from the configured message-bus codec module.
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path+".Encoder")
        decoder_cls = load_object(codec_path+".Decoder")
        self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model)
        self._encoder = encoder_cls(self._manager.request_model)

        self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
        self.states_context = StatesContext(self._manager.backend.states)

        self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
        self.states = self._manager.backend.states
        self.stats = {
            'consumed_since_start': 0
        }
        self.job_id = 0
        self.task = LoopingCall(self.work)
        self._logging_task = LoopingCall(self.log_status)
        self._flush_states_task = LoopingCall(self.flush_states)
        logger.info("Strategy worker is initialized and consuming partition %d", partition_id)
Esempio n. 7
0
    def __init__(self, settings, no_batches, no_incoming):
        """Wire up message-bus streams, the frontier manager and the work slot.

        :param settings: frontier settings object
        :param no_batches: when true, do not produce new spider-feed batches
        :param no_incoming: when true, do not consume the spider log
        """
        bus_cls = load_object(settings.get('MESSAGE_BUS'))
        self.mb = bus_cls(settings)

        log_stream = self.mb.spider_log()
        self.spider_feed = self.mb.spider_feed()
        self.spider_log_consumer = log_stream.consumer(partition_id=None, type='db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

        # Scoring log is consumed only for distributed backends.
        self.strategy_enabled = isinstance(self._backend, DistributedBackend)
        if self.strategy_enabled:
            self.scoring_log_consumer = self.mb.scoring_log().consumer()
            self.queue = self._backend.queue

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        by_hostname = settings.get('QUEUE_HOSTNAME_PARTITIONING')
        self.spider_feed_partitioning = 'hostname' if by_hostname else 'fingerprint'
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring,
                         no_batches, self.strategy_enabled,
                         settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {}
Esempio n. 8
0
    def __init__(self, settings, strategy_module):
        """Initialize the strategy worker from a strategy module.

        :param settings: frontier settings object
        :param strategy_module: module providing a CrawlingStrategy class
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not an int
        """
        partition_id = settings.get('SCORING_PARTITION_ID')
        # PEP 8: compare types with `is not`, never `!=`.
        if partition_id is None or type(partition_id) is not int:
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id,
                                            type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings,
                                                      strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.strategy = strategy_module.CrawlingStrategy()
        self.states = self._manager.backend.states
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0
        self.task = LoopingCall(self.work)
    def test_blocking_middleware(self):
        """A blocking middleware must stop events before the later components."""
        settings = Settings()
        settings.BACKEND = 'tests.mocks.components.FakeBackend'
        settings.MIDDLEWARES = [
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        fm = FrontierManager.from_settings(settings)
        fm.add_seeds([r1, r2, r3])
        fm.page_crawled(Response(r1.url, request=r1))
        fm.links_extracted(r1, links=[r2])
        fm.request_error(r3, 'error')

        # Nothing reached the backend: the blocking middleware stopped it all.
        assert [len(lst) for lst in fm.backend.lists] == [0] * 4
        # All 3 seeds passed through the first three middlewares.
        assert [len(fm.middlewares[i].seeds) for i in range(3)] == [3] * 3
        # One response, one link batch and one error each reached the first three.
        assert [[len(lst) for lst in fm.middlewares[i].lists[1:]]
                for i in range(3)] == [[1] * 3] * 3
        # The bottom two middlewares and the canonical solver saw nothing.
        assert [[len(lst) for lst in fm.middlewares[i].lists]
                for i in range(3, 5)] == [[0] * 4] * 2
        assert [len(lst) for lst in fm.canonicalsolver.lists] == [0] * 4
Esempio n. 10
0
    def __init__(self, settings, no_batches, no_scoring, no_incoming):
        """Connect Kafka producer/consumers and assemble the frontier machinery.

        :param settings: frontier settings object
        :param no_batches: disable production of new batches
        :param no_scoring: skip creating the scoring consumer
        :param no_incoming: disable consumption of incoming messages
        """
        self._kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = KeyedProducer(self._kafka,
                                       partitioner=Crc32NamePartitioner,
                                       codec=CODEC_SNAPPY)

        frontier_group = settings.get('FRONTIER_GROUP')
        self._in_consumer = SimpleConsumer(self._kafka,
                                           frontier_group,
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760)
        # Scoring consumer uses smaller buffers than the incoming consumer.
        if not no_scoring:
            self._scoring_consumer = SimpleConsumer(self._kafka,
                                                    frontier_group,
                                                    settings.get('SCORING_TOPIC'),
                                                    buffer_size=262144,
                                                    max_buffer_size=1048576)

        self._offset_fetcher = Fetcher(self._kafka, settings.get('OUTGOING_TOPIC'),
                                       frontier_group)

        self._manager = FrontierManager.from_settings(settings)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('OUTGOING_TOPIC')
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring,
                         no_batches, no_scoring,
                         settings.get('NEW_BATCH_DELAY', 60.0), no_incoming)
        self.job_id = 0
        self.stats = {}
Esempio n. 11
0
 def strategy(self):
     """Build a TestingCrawlingStrategy wired to an in-memory states context."""
     settings = Settings()
     settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     manager = FrontierManager.from_settings(settings, db_worker=False,
                                             strategy_worker=True)
     stream = MessageBusStream()
     states_ctx = StatesContext(MemoryStates(10))
     return TestingCrawlingStrategy.from_worker(manager, None, stream, states_ctx)
 def setup_frontier_manager(self, settings=None):
     """Return a FrontierManager configured with fake test components."""
     settings = settings or Settings()
     settings.BACKEND = 'tests.mocks.components.FakeBackend'
     settings.MIDDLEWARES = [
         'tests.mocks.components.FakeMiddleware',
         'tests.mocks.components.FakeMiddlewareModifySeeds',
         'tests.mocks.components.FakeMiddlewareModifyResponse',
         'tests.mocks.components.FakeMiddlewareModifyLinks',
     ]
     settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
     return FrontierManager.from_settings(settings)
Esempio n. 13
0
 def __init__(self, settings):
     """Validate the converter classes and create the frontier manager.

     :param settings: frontier settings object
     :raises AssertionError: if converter classes are undefined or of wrong type
     """
     assert self.request_converter_class, 'request_converter_class not defined'
     assert self.response_converter_class, 'response_converter_class not defined'
     assert issubclass(self.request_converter_class, BaseRequestConverter), \
         'request_converter_class must subclass RequestConverter'
     # Fixed copy-paste bug: this message previously said "RequestConverter".
     assert issubclass(self.response_converter_class, BaseResponseConverter), \
         'response_converter_class must subclass ResponseConverter'
     self.request_converter = self.request_converter_class
     self.response_converter = self.response_converter_class
     self.manager = FrontierManager.from_settings(settings)
Esempio n. 14
0
 def __init__(self, settings):
     """Validate the converter classes and create the frontier manager.

     :param settings: frontier settings object
     :raises AssertionError: if converter classes are undefined or of wrong type
     """
     assert self.request_converter_class, 'request_converter_class not defined'
     assert self.response_converter_class, 'response_converter_class not defined'
     assert issubclass(self.request_converter_class, BaseRequestConverter), \
         'request_converter_class must subclass RequestConverter'
     # Fixed copy-paste bug: this message previously said "RequestConverter".
     assert issubclass(self.response_converter_class, BaseResponseConverter), \
         'response_converter_class must subclass ResponseConverter'
     self.request_converter = self.request_converter_class
     self.response_converter = self.response_converter_class
     self.manager = FrontierManager.from_settings(settings)
Esempio n. 15
0
 def setup_frontier_manager(self, settings=None):
     """Return a FrontierManager backed by fake test components."""
     settings = settings or Settings()
     settings.BACKEND = 'tests.mocks.components.FakeBackend'
     settings.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware',
                             'tests.mocks.components.FakeMiddlewareModifySeeds',
                             'tests.mocks.components.FakeMiddlewareModifyResponse',
                             'tests.mocks.components.FakeMiddlewareModifyLinks']
     settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
     return FrontierManager.from_settings(settings)
Esempio n. 16
0
    def __init__(self, settings, strategy_class, strategy_args,
                 is_add_seeds_mode):
        """Initialize the strategy worker.

        :param settings: frontier settings object
        :param strategy_class: crawling strategy, built via from_worker()
        :param strategy_args: extra arguments forwarded to the strategy
        :param is_add_seeds_mode: when true, only produce to the scoring log
            (no spider-log consumer is created)
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not an int
        """
        partition_id = settings.get('SCORING_PARTITION_ID')
        # PEP 8: compare types with `is not`, never `!=`.
        if partition_id is None or type(partition_id) is not int:
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        scoring_log = mb.scoring_log()
        self.add_seeds_mode = is_add_seeds_mode
        if not self.add_seeds_mode:
            spider_log = mb.spider_log()
            self.consumer = spider_log.consumer(partition_id=partition_id,
                                                type=b'sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings,
                                                      strategy_worker=True)
        # Codec classes are resolved from the configured message-bus codec module.
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")
        self._decoder = decoder_cls(self._manager.request_model,
                                    self._manager.response_model)
        self._encoder = encoder_cls(self._manager.request_model)

        self.update_score = UpdateScoreStream(self.scoring_log_producer,
                                              self._encoder)
        self.states_context = StatesContext(self._manager.backend.states)
        # Set once here; the previous duplicate assignment inside the
        # `not add_seeds_mode` branch was dead code (same setting, overwritten).
        self.consumer_batch_size = settings.get(
            'SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.strategy = strategy_class.from_worker(self._manager,
                                                   strategy_args,
                                                   self.update_score,
                                                   self.states_context)
        self.states = self._manager.backend.states
        self.stats = {
            'consumed_since_start': 0,
            'consumed_add_seeds': 0,
            'consumed_page_crawled': 0,
            'consumed_links_extracted': 0,
            'consumed_request_error': 0,
            'dropped_links_extracted': 0,
        }
        self.job_id = 0
        self.task = LoopingCall(self.work)
        self._logging_task = LoopingCall(self.log_status)
        self._flush_states_task = LoopingCall(self.flush_states)
        self._flush_interval = settings.get("SW_FLUSH_INTERVAL")
        logger.info(
            "Strategy worker is initialized and consuming partition %d",
            partition_id)
Esempio n. 17
0
    def __init__(self, settings, no_batches, no_incoming, no_scoring):
        """Set up message-bus streams, codecs, partitioner and the work slot.

        :param settings: frontier settings object
        :param no_batches: disable production of new spider-feed batches
        :param no_incoming: disable consumption of the spider log
        :param no_scoring: disable consumption of the scoring log
        """
        bus_cls = load_object(settings.get('MESSAGE_BUS'))
        self.mb = bus_cls(settings)

        log_stream = self.mb.spider_log()
        self.spider_feed = self.mb.spider_feed()
        self.spider_log_consumer = log_stream.consumer(partition_id=None,
                                                       type=b'db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        # Codec classes come from the configured message-bus codec module.
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")
        self._encoder = encoder_cls(self._manager.request_model)
        self._decoder = decoder_cls(self._manager.request_model,
                                    self._manager.response_model)

        # Scoring consumption needs a distributed backend and must be enabled.
        scoring_active = (isinstance(self._backend, DistributedBackend)
                          and not no_scoring)
        if scoring_active:
            self.scoring_log_consumer = self.mb.scoring_log().consumer()
            self.queue = self._backend.queue
        self.strategy_disabled = not scoring_active

        self.spider_log_consumer_batch_size = settings.get(
            'SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.scoring_log_consumer_batch_size = settings.get(
            'SCORING_LOG_CONSUMER_BATCH_SIZE')

        if settings.get('QUEUE_HOSTNAME_PARTITIONING'):
            self.logger.warning(
                'QUEUE_HOSTNAME_PARTITIONING is deprecated, use SPIDER_FEED_PARTITIONER instead.'
            )
            settings.set(
                'SPIDER_FEED_PARTITIONER',
                'frontera.contrib.backends.partitioners.Crc32NamePartitioner')
        self.partitioner_cls = load_object(
            settings.get('SPIDER_FEED_PARTITIONER'))
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming,
                         self.consume_scoring,
                         no_batches, self.strategy_disabled,
                         settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {'consumed_since_start': 0,
                      'consumed_scoring_since_start': 0,
                      'pushed_since_start': 0}
        self._logging_task = task.LoopingCall(self.log_status)
Esempio n. 18
0
    def __init__(self, settings, no_batches, no_incoming, no_scoring, **kwargs):
        """Create the DB worker: message bus, frontier manager, codecs, slot.

        :param settings: frontier settings object
        :param no_batches: disable production of new batches
        :param no_incoming: disable consumption of incoming messages
        :param no_scoring: disable consumption of the scoring log
        :param kwargs: extra options forwarded to the Slot
        """
        bus_cls = load_object(settings.get('MESSAGE_BUS'))
        self.message_bus = bus_cls(settings)

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self.backend = self._manager.backend

        # Codec classes come from the configured message-bus codec module.
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")
        self._encoder = encoder_cls(self._manager.request_model)
        self._decoder = decoder_cls(self._manager.request_model,
                                    self._manager.response_model)

        slot_kwargs = {'no_batches': no_batches,
                       'no_incoming': no_incoming,
                       'no_scoring': no_scoring}
        slot_kwargs.update(kwargs)
        self.slot = Slot(self, settings, **slot_kwargs)

        self.stats = defaultdict(int)
        self.job_id = 0
        self._logging_task = task.LoopingCall(self.log_status)
Esempio n. 19
0
    def __init__(self, settings, no_batches, no_scoring, no_incoming):
        """Connect Kafka endpoints and assemble the frontier machinery.

        :param settings: frontier settings object
        :param no_batches: disable production of new batches
        :param no_scoring: skip creating the scoring consumer
        :param no_incoming: disable consumption of incoming messages
        """
        group = settings.get('FRONTIER_GROUP')
        outgoing = settings.get('OUTGOING_TOPIC')

        self._kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = KeyedProducer(self._kafka,
                                       partitioner=Crc32NamePartitioner,
                                       codec=CODEC_SNAPPY)
        self._in_consumer = SimpleConsumer(self._kafka, group,
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760)
        # Scoring consumer uses smaller buffers than the incoming consumer.
        if not no_scoring:
            self._scoring_consumer = SimpleConsumer(self._kafka, group,
                                                    settings.get('SCORING_TOPIC'),
                                                    buffer_size=262144,
                                                    max_buffer_size=1048576)
        self._offset_fetcher = Fetcher(self._kafka, outgoing, group)

        self._manager = FrontierManager.from_settings(settings)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = outgoing
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring,
                         no_batches, no_scoring,
                         settings.get('NEW_BATCH_DELAY', 60.0), no_incoming)
        self.job_id = 0
        self.stats = {}
Esempio n. 20
0
    def __init__(self, settings, strategy_module):
        """Connect Kafka producer/consumer and build the frontier machinery.

        :param settings: frontier settings object
        :param strategy_module: module providing a CrawlingStrategy class
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not an int
        """
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
        partition_id = settings.get('SCORING_PARTITION_ID')
        # PEP 8: `is None` instead of `== None`, identity check for exact type.
        if partition_id is None or type(partition_id) is not int:
            raise AttributeError("Scoring worker partition id isn't set.")
        self._in_consumer = SimpleConsumer(kafka,
                                           settings.get('SCORING_GROUP'),
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760,
                                           partitions=[partition_id])

        self._manager = FrontierManager.from_settings(settings)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('SCORING_TOPIC')
        self.strategy = strategy_module.CrawlingStrategy()
        self.backend = self._manager.backend
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0
Esempio n. 21
0
 def __init__(self, settings):
     """Create the frontier manager from *settings*."""
     self.manager = FrontierManager.from_settings(settings)
Esempio n. 22
0
 def setup_subject(partitions):
     """Build a RedisQueue over *partitions* using default frontera settings."""
     settings = Settings(module='frontera.settings.default_settings')
     manager = FrontierManager.from_settings(settings)
     return RedisQueue(manager, get_pool(), partitions, True)
Esempio n. 23
0
 def setup_subject(partitions):
     """Create a RedisBackend db-worker with *partitions* feed partitions."""
     settings = Settings(module='frontera.settings.default_settings')
     settings.set('SPIDER_FEED_PARTITIONS', partitions)
     settings.set('REDIS_DROP_ALL_TABLES', True)
     manager = FrontierManager.from_settings(settings)
     return RedisBackend.db_worker(manager)
Esempio n. 24
0
 def __init__(self, settings):
     """Build and store the frontier manager configured by *settings*."""
     self.manager = FrontierManager.from_settings(settings)