Example #1
0
 def __init__(self, manager):
     """Set up message-bus plumbing for a spider-side backend.

     Creates the message bus, codec encoder/decoder, a spider-log
     producer and a spider-feed consumer bound to this spider's
     partition.

     :param manager: frontier manager providing ``settings``,
         ``request_model`` and ``response_model``.
     :raises ValueError: if SPIDER_PARTITION_ID is outside
         ``[0, SPIDER_FEED_PARTITIONS)``.
     """
     settings = manager.settings
     # Message bus implementation is pluggable via MESSAGE_BUS.
     messagebus = load_object(settings.get('MESSAGE_BUS'))
     self.mb = messagebus(settings)
     # Codec module must expose Encoder/Decoder classes.
     codec_path = settings.get('MESSAGE_BUS_CODEC')
     encoder_cls = load_object(codec_path + ".Encoder")
     decoder_cls = load_object(codec_path + ".Decoder")
     store_content = settings.get('STORE_CONTENT')
     self._encoder = encoder_cls(manager.request_model,
                                 send_body=store_content)
     self._decoder = decoder_cls(manager.request_model,
                                 manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
     # The partition id must address one of the configured feed partitions.
     if self.partition_id < 0 or self.partition_id >= settings.get(
             'SPIDER_FEED_PARTITIONS'):
         raise ValueError(
             "Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS."
         )
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._logger = logging.getLogger("messagebus-backend")
     # OverusedBuffer throttles per-key request flow on top of the raw
     # _get_next_requests feed.
     self._buffer = OverusedBuffer(
         self._get_next_requests,
         max_per_key=settings.get('OVERUSED_MAX_PER_KEY'),
         keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"),
         max_keys=settings.get('OVERUSED_MAX_KEYS'),
         keep_keys=settings.get('OVERUSED_KEEP_KEYS'))
     self._logger.info("Consuming from partition id %d", self.partition_id)
Example #2
0
 def test_name_error(self):
     """load_object() raises NameError naming the missing attribute."""
     with pytest.raises(NameError) as info:
         load_object(
             'frontera.tests.mocks.load_objects.non_existent_object')
     # Use str(info.value): BaseException has no ``.message`` attribute
     # on Python 3; this also matches the other tests in this module.
     assert str(info.value) == (
         "Module 'frontera.tests.mocks.load_objects' doesn't define"
         " any object named 'non_existent_object'")
Example #3
0
    def __init__(self, settings, strategy_class):
        """Initialize a strategy worker.

        Sets up the spider-log consumer, scoring-log producer, codecs,
        the crawling strategy and the periodic looping tasks.

        :param settings: frontier settings object.
        :param strategy_class: crawling strategy class exposing a
            ``from_worker`` factory.
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not
            an integer.
        """
        partition_id = settings.get('SCORING_PARTITION_ID')
        # isinstance() is the idiomatic type check (PEP 8); it also
        # rejects None, which the previous double test guarded against.
        if not isinstance(partition_id, int):
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        # b'sw' identifies this consumer as a strategy worker to the bus.
        self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
        # Codec module must expose Encoder/Decoder classes.
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")
        self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model)
        self._encoder = encoder_cls(self._manager.request_model)

        # Buffered stream of score updates flushed to the scoring log.
        self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
        self.states_context = StatesContext(self._manager.backend.states)

        self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
        self.states = self._manager.backend.states
        self.stats = {
            'consumed_since_start': 0
        }
        self.job_id = 0
        self.task = LoopingCall(self.work)
        self._logging_task = LoopingCall(self.log_status)
        self._flush_states_task = LoopingCall(self.flush_states)
        logger.info("Strategy worker is initialized and consuming partition %d", partition_id)
Example #4
0
    def __init__(self, settings, strategy_class):
        """Initialize a strategy worker.

        Sets up the spider-log consumer, scoring-log producer, codecs,
        the crawling strategy and the periodic looping tasks.

        :param settings: frontier settings object.
        :param strategy_class: crawling strategy class exposing a
            ``from_worker`` factory.
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not
            an integer.
        """
        partition_id = settings.get('SCORING_PARTITION_ID')
        # isinstance() is the idiomatic type check (PEP 8); it also
        # rejects None, which the previous double test guarded against.
        if not isinstance(partition_id, int):
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        # b'sw' identifies this consumer as a strategy worker to the bus.
        self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
        # Codec module must expose Encoder/Decoder classes.
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")
        self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model)
        self._encoder = encoder_cls(self._manager.request_model)

        # Buffered stream of score updates flushed to the scoring log.
        self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
        self.states_context = StatesContext(self._manager.backend.states)

        self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
        self.states = self._manager.backend.states
        self.stats = {
            'consumed_since_start': 0
        }
        self.job_id = 0
        self.task = LoopingCall(self.work)
        self._logging_task = LoopingCall(self.log_status)
        self._flush_states_task = LoopingCall(self.flush_states)
        logger.info("Strategy worker is initialized and consuming partition %d", partition_id)
Example #5
0
    def __init__(self, settings, no_batches, no_incoming, no_scoring,
                 **kwargs):
        """DB worker initialization: message bus, manager, codecs, Slot.

        :param settings: frontier settings object.
        :param no_batches: disable new-batch generation when true.
        :param no_incoming: disable incoming-log consumption when true.
        :param no_scoring: disable scoring-log consumption when true.
        :param kwargs: extra keyword options forwarded to :class:`Slot`.
        """
        # Message bus implementation is pluggable via MESSAGE_BUS.
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.message_bus = messagebus(settings)

        self._manager = WorkerFrontierManager.from_settings(settings,
                                                            db_worker=True)
        self.backend = self._manager.backend

        # Codec module must expose Encoder/Decoder classes.
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")
        self._encoder = encoder_cls(self._manager.request_model)
        self._decoder = decoder_cls(self._manager.request_model,
                                    self._manager.response_model)

        # Slot drives the periodic work cycle; the no_* flags select
        # which phases are active.
        slot_kwargs = {
            'no_batches': no_batches,
            'no_incoming': no_incoming,
            'no_scoring': no_scoring
        }
        slot_kwargs.update(**kwargs)
        self.slot = Slot(self, settings, **slot_kwargs)

        self.stats = defaultdict(int)
        self.job_id = 0
        self._logging_task = task.LoopingCall(self.log_status)
Example #6
0
    def __init__(self, settings):
        """Read Kafka message-bus configuration from *settings*."""
        # Topic names for the three message streams.
        self.topic_todo = settings.get('SPIDER_FEED_TOPIC')
        self.topic_done = settings.get('SPIDER_LOG_TOPIC')
        self.topic_scoring = settings.get('SCORING_LOG_TOPIC')

        # Consumer group ids and the spider's partition binding.
        self.spiderlog_dbw_group = settings.get('SPIDER_LOG_DBW_GROUP')
        self.spiderlog_sw_group = settings.get('SPIDER_LOG_SW_GROUP')
        self.scoringlog_dbw_group = settings.get('SCORING_LOG_DBW_GROUP')
        self.spider_feed_group = settings.get('SPIDER_FEED_GROUP')
        self.spider_partition_id = settings.get('SPIDER_PARTITION_ID')
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.codec = settings.get('KAFKA_CODEC')
        self.kafka_location = settings.get('KAFKA_LOCATION')

        # Map the deprecated hostname-partitioning flag onto the
        # partitioner setting before it is read below.
        if settings.get('QUEUE_HOSTNAME_PARTITIONING'):
            logger.warning(
                'QUEUE_HOSTNAME_PARTITIONING is deprecated, use SPIDER_FEED_PARTITIONER instead.'
            )
            settings.set(
                'SPIDER_FEED_PARTITIONER',
                'frontera.contrib.backends.partitioners.Crc32NamePartitioner')

        log_partitions = list(range(settings.get('SPIDER_LOG_PARTITIONS')))
        log_partitioner_cls = load_object(settings.get('SPIDER_LOG_PARTITIONER'))
        self.spider_log_partitioner = log_partitioner_cls(log_partitions)

        feed_partitions = list(range(settings.get('SPIDER_FEED_PARTITIONS')))
        feed_partitioner_cls = load_object(settings.get('SPIDER_FEED_PARTITIONER'))
        self.spider_feed_partitioner = feed_partitioner_cls(feed_partitions)
Example #7
0
 def test_import_error(self):
     """load_object() surfaces a missing module as ImportError."""
     with pytest.raises(ImportError) as info:
         load_object('frontera.non_existent_module.object')
     # Python 2 and 3 word the "module not found" message differently.
     expected = (
         "Error loading object 'frontera.non_existent_module.object'"
         ": No module named non_existent_module") if six.PY2 else (
         "Error loading object 'frontera.non_existent_module.object'"
         ": No module named 'frontera.non_existent_module'")
     assert str(info.value) == expected
Example #8
0
 def test_import_error(self):
     """load_object() surfaces a missing module as ImportError."""
     with pytest.raises(ImportError) as info:
         load_object('frontera.non_existent_module.object')
     # Python 2 and 3 word the "module not found" message differently.
     expected = (
         "Error loading object 'frontera.non_existent_module.object'"
         ": No module named non_existent_module") if six.PY2 else (
         "Error loading object 'frontera.non_existent_module.object'"
         ": No module named 'frontera.non_existent_module'")
     assert str(info.value) == expected
Example #9
0
    def __init__(self, settings, no_batches, no_incoming, no_scoring):
        """DB worker initialization.

        Wires up message-bus consumers/producers, codecs, the backend
        and the Slot driving periodic work.

        :param settings: frontier settings object.
        :param no_batches: disable new-batch generation when true.
        :param no_incoming: disable spider-log consumption when true.
        :param no_scoring: disable scoring-log consumption when true.
        """
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        spider_log = self.mb.spider_log()

        self.spider_feed = self.mb.spider_feed()
        # partition_id=None: the DB worker consumes all spider-log
        # partitions; b'db' identifies the consumer type to the bus.
        self.spider_log_consumer = spider_log.consumer(partition_id=None,
                                                       type=b'db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        # Codec module must expose Encoder/Decoder classes.
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")
        self._encoder = encoder_cls(self._manager.request_model)
        self._decoder = decoder_cls(self._manager.request_model,
                                    self._manager.response_model)

        # Scoring consumption only makes sense for distributed backends.
        if isinstance(self._backend, DistributedBackend) and not no_scoring:
            scoring_log = self.mb.scoring_log()
            self.scoring_log_consumer = scoring_log.consumer()
            self.queue = self._backend.queue
            self.strategy_disabled = False
        else:
            self.strategy_disabled = True
        self.spider_log_consumer_batch_size = settings.get(
            'SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.scoring_log_consumer_batch_size = settings.get(
            'SCORING_LOG_CONSUMER_BATCH_SIZE')

        # Map the deprecated hostname-partitioning flag onto the
        # partitioner setting before it is read below.
        if settings.get('QUEUE_HOSTNAME_PARTITIONING'):
            self.logger.warning(
                'QUEUE_HOSTNAME_PARTITIONING is deprecated, use SPIDER_FEED_PARTITIONER instead.'
            )
            settings.set(
                'SPIDER_FEED_PARTITIONER',
                'frontera.contrib.backends.partitioners.Crc32NamePartitioner')
        self.partitioner_cls = load_object(
            settings.get('SPIDER_FEED_PARTITIONER'))
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming,
                         self.consume_scoring,
                         no_batches, self.strategy_disabled,
                         settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {
            'consumed_since_start': 0,
            'consumed_scoring_since_start': 0,
            'pushed_since_start': 0
        }
        self._logging_task = task.LoopingCall(self.log_status)
Example #10
0
 def _start_logger(self, klass, name, level, enabled, handlers):
     """Instantiate *klass* and attach *handlers*.

     Handlers given as dotted-path strings are resolved with
     load_object; anything else is attached as-is.
     """
     new_logger = klass(name=name, level=level, enabled=enabled)
     for entry in handlers:
         resolved = load_object(entry) if isinstance(entry, six.string_types) else entry
         new_logger.add_handler(resolved)
     return new_logger
Example #11
0
    def __init__(self, settings, strategy_module):
        """Strategy worker initialization (legacy module-based variant).

        :param settings: frontier settings object.
        :param strategy_module: module exposing a ``CrawlingStrategy``
            class.
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not
            an integer.
        """
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or type(partition_id) != int:
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        # 'sw' identifies this consumer as a strategy worker to the bus.
        self.consumer = spider_log.consumer(partition_id=partition_id,
                                            type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings,
                                                      strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.strategy = strategy_module.CrawlingStrategy()
        self.states = self._manager.backend.states
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0
        self.task = LoopingCall(self.work)
Example #12
0
    def __init__(self, settings, strategy_class):
        """Strategy worker initialization (class-based strategy variant).

        :param settings: frontier settings object.
        :param strategy_class: crawling strategy class exposing a
            ``from_worker`` factory.
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not
            an integer.
        """
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or type(partition_id) != int:
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        # 'sw' identifies this consumer as a strategy worker to the bus.
        self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        # Buffered stream of score updates flushed to the scoring log.
        self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
        self.states_context = StatesContext(self._manager.backend.states)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
        self.states = self._manager.backend.states
        self.stats = {}
        self.job_id = 0
        self.task = LoopingCall(self.work)
Example #13
0
def setup_environment():
    """Parse command line, load settings and the strategy class.

    Returns a ``(settings, strategy_class)`` tuple ready for the
    strategy worker to use.

    :raises ValueError: if the strategy class path or a valid partition
        id cannot be determined.
    """
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    # Command line takes precedence over the settings module.
    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    # Guard against a missing partition id: comparing None with an int
    # below would raise a confusing TypeError on Python 3.
    if partition_id is None:
        raise ValueError("Couldn't locate partition id. Please supply it either using command line option or "
                         "settings file.")
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        # Fall back to basic console logging at the requested level.
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    return settings, strategy_class
Example #14
0
    def __init__(self, manager, pool, partitions, delete_all_keys=False):
        """Redis queue component initialization.

        :param manager: frontier manager providing settings and models.
        :param pool: redis connection pool.
        :param partitions: number of queue partitions.
        :param delete_all_keys: flush the whole redis db when true.
        """
        settings = manager.settings
        # Codec module must expose Encoder/Decoder classes.
        codec_path = settings.get('REDIS_BACKEND_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")
        self._encoder = encoder_cls(manager.request_model)
        self._decoder = decoder_cls(manager.request_model,
                                    manager.response_model)
        self._redis = RedisOperation(pool)
        self._redis_pipeline = RedisPipeline(pool)
        # list(range(...)) instead of the redundant identity
        # comprehension [i for i in range(0, partitions)] (C416).
        self._partitions = list(range(partitions))
        self._partitioner = Crc32NamePartitioner(self._partitions)
        self._logger = logging.getLogger("redis_backend.queue")

        if delete_all_keys:
            self._redis.flushdb()
Example #15
0
    def __init__(self, manager):
        """SQLAlchemy backend initialization: engine, models, session.

        Optionally drops all tables and/or clears existing content,
        depending on the SQLALCHEMYBACKEND_* settings.

        :param manager: frontier manager providing ``settings``.
        """
        self.manager = manager

        # Get settings
        settings = manager.settings
        engine = settings.get('SQLALCHEMYBACKEND_ENGINE', DEFAULT_ENGINE)
        engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO',
                                   DEFAULT_ENGINE_ECHO)
        drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES',
                                       DEFAULT_DROP_ALL_TABLES)
        clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT',
                                     DEFAULT_CLEAR_CONTENT)
        models = settings.get('SQLALCHEMYBACKEND_MODELS', DEFAULT_MODELS)

        # Create engine
        self.engine = create_engine(engine, echo=engine_echo)

        # Load models: dict comprehension instead of dict([...]) (C404).
        self.models = {name: load_object(klass)
                       for name, klass in models.items()}

        # Drop tables if we have to
        if drop_all_tables:
            Base.metadata.drop_all(self.engine)
        Base.metadata.create_all(self.engine)

        # Create session
        self.Session = sessionmaker()
        self.Session.configure(bind=self.engine)
        self.session = self.Session()

        # Clear content if we have to; iterate values() since the table
        # name is unused (PERF102).
        if clear_content:
            for table in Base.metadata.tables.values():
                self.session.execute(table.delete())
Example #16
0
    def __init__(self, manager):
        """SQLAlchemy backend initialization: engine, models, components.

        :param manager: frontier manager providing ``settings``.
        """
        self.manager = manager
        settings = manager.settings
        engine = settings.get('SQLALCHEMYBACKEND_ENGINE')
        engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO')
        drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES')
        clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT')
        models = settings.get('SQLALCHEMYBACKEND_MODELS')

        self.engine = create_engine(engine, echo=engine_echo)
        # Dict comprehension instead of dict([...]) (C404).
        self.models = {name: load_object(klass) for name, klass in models.items()}

        if drop_all_tables:
            DeclarativeBase.metadata.drop_all(self.engine)
        DeclarativeBase.metadata.create_all(self.engine)

        self.session_cls = sessionmaker()
        self.session_cls.configure(bind=self.engine)

        if clear_content:
            session = self.session_cls()
            # Iterate values(): the table name is unused (PERF102).
            for table in DeclarativeBase.metadata.tables.values():
                session.execute(table.delete())
            session.close()
        self._metadata = Metadata(self.session_cls, self.models['MetadataModel'],
                                  settings.get('SQLALCHEMYBACKEND_CACHE_SIZE'))
        self._states = States(self.session_cls, self.models['StateModel'],
                              settings.get('STATE_CACHE_SIZE_LIMIT'))
        self._queue = self._create_queue(settings)
Example #17
0
    def __init__(self, manager):
        """SQLAlchemy backend initialization: engine, models, components.

        :param manager: frontier manager providing ``settings``.
        """
        self.manager = manager
        settings = manager.settings
        engine = settings.get('SQLALCHEMYBACKEND_ENGINE')
        engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO')
        drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES')
        clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT')
        models = settings.get('SQLALCHEMYBACKEND_MODELS')

        self.engine = create_engine(engine, echo=engine_echo)
        # Dict comprehension instead of dict([...]) (C404).
        self.models = {name: load_object(klass)
                       for name, klass in models.items()}

        if drop_all_tables:
            DeclarativeBase.metadata.drop_all(self.engine)
        DeclarativeBase.metadata.create_all(self.engine)

        self.session_cls = sessionmaker()
        self.session_cls.configure(bind=self.engine)

        if clear_content:
            session = self.session_cls()
            # Iterate values(): the table name is unused (PERF102).
            for table in DeclarativeBase.metadata.tables.values():
                session.execute(table.delete())
            session.close()
        self._metadata = Metadata(self.session_cls,
                                  self.models['MetadataModel'],
                                  settings.get('SQLALCHEMYBACKEND_CACHE_SIZE'))
        self._states = States(self.session_cls, self.models['StateModel'],
                              settings.get('STATE_CACHE_SIZE_LIMIT'))
        self._queue = self._create_queue(settings)
Example #18
0
    def __init__(self, settings, no_batches, no_incoming):
        """DB worker initialization (legacy variant without codec settings).

        :param settings: frontier settings object.
        :param no_batches: disable new-batch generation when true.
        :param no_incoming: disable spider-log consumption when true.
        """
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        spider_log = self.mb.spider_log()

        self.spider_feed = self.mb.spider_feed()
        # partition_id=None: consume all spider-log partitions;
        # 'db' identifies the consumer type to the bus.
        self.spider_log_consumer = spider_log.consumer(partition_id=None,
                                                       type='db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)

        # Scoring consumption only applies to distributed backends.
        if isinstance(self._backend, DistributedBackend):
            scoring_log = self.mb.scoring_log()
            self.scoring_log_consumer = scoring_log.consumer()
            self.queue = self._backend.queue
            self.strategy_enabled = True
        else:
            self.strategy_enabled = False

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        # Partitioning mode: hostname-based when the legacy flag is set.
        self.spider_feed_partitioning = 'fingerprint' if not settings.get(
            'QUEUE_HOSTNAME_PARTITIONING') else 'hostname'
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming,
                         self.consume_scoring,
                         no_batches, self.strategy_enabled,
                         settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {}
Example #19
0
def setup_environment():
    """Parse command line, load settings and the strategy class.

    Returns a ``(settings, strategy_class)`` tuple ready for the
    strategy worker to use.

    :raises ValueError: if the strategy class path or a valid partition
        id cannot be determined.
    """
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    # Command line takes precedence over the settings module.
    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    # Guard against a missing partition id: comparing None with an int
    # below would raise a confusing TypeError on Python 3.
    if partition_id is None:
        raise ValueError("Couldn't locate partition id. Please supply it either using command line option or "
                         "settings file.")
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        # Fall back to basic console logging at the requested level.
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    return settings, strategy_class
Example #20
0
 def _load_object(self, obj_class_name, silent=False):
     """Load a class by dotted path and build the frontier object.

     :param obj_class_name: dotted path of the class to load.
     :param silent: when true, swallow NotConfigured and return None.
     :raises NotConfigured: if construction fails and *silent* is false.
     """
     obj_class = load_object(obj_class_name)
     try:
         return self._load_frontier_object(obj_class)
     except NotConfigured:
         if not silent:
             # Bare raise re-raises the caught exception, preserving its
             # message and traceback (``raise NotConfigured`` would
             # replace it with a blank new instance).
             raise
Example #21
0
    def __init__(self, manager):
        """HBase backend initialization.

        Opens a Thrift connection (picking a random host when several
        are configured) and prepares the spider-feed partitioner.

        :param manager: frontier manager providing ``settings``.
        """
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        # Batch-composition thresholds for next-request generation.
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')

        partitions = list(range(settings.get('SPIDER_FEED_PARTITIONS')))
        partitioner_cls = load_object(settings.get('SPIDER_FEED_PARTITIONER'))
        self.partitioner = partitioner_cls(partitions)
        # HBASE_THRIFT_HOST may be a single host or a list/tuple of hosts.
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':'
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({'protocol': 'compact', 'transport': 'framed'})
        self.connection = Connection(**kwargs)
        # Components are created lazily by their accessors.
        self._metadata = None
        self._queue = None
        self._states = None
Example #22
0
 def _load_object(self, obj_class_name, silent=False):
     """Load a class by dotted path and build the frontier object.

     :param obj_class_name: dotted path of the class to load.
     :param silent: when true, swallow NotConfigured and return None.
     :raises NotConfigured: if construction fails and *silent* is false.
     """
     obj_class = load_object(obj_class_name)
     try:
         return self._load_frontier_object(obj_class)
     except NotConfigured:
         if not silent:
             # Bare raise re-raises the caught exception, preserving its
             # message and traceback (``raise NotConfigured`` would
             # replace it with a blank new instance).
             raise
Example #23
0
    def __init__(self, settings, no_batches, no_incoming):
        """DB worker initialization (legacy variant with stats counters).

        :param settings: frontier settings object.
        :param no_batches: disable new-batch generation when true.
        :param no_incoming: disable spider-log consumption when true.
        """
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        spider_log = self.mb.spider_log()

        self.spider_feed = self.mb.spider_feed()
        # partition_id=None: consume all spider-log partitions;
        # 'db' identifies the consumer type to the bus.
        self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

        # Scoring consumption only applies to distributed backends.
        if isinstance(self._backend, DistributedBackend):
            scoring_log = self.mb.scoring_log()
            self.scoring_log_consumer = scoring_log.consumer()
            self.queue = self._backend.queue
            self.strategy_enabled = True
        else:
            self.strategy_enabled = False

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        # Partitioning mode: hostname-based when the legacy flag is set.
        self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname'
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                         self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {
            'consumed_since_start': 0,
            'consumed_scoring_since_start': 0,
            'pushed_since_start': 0
        }
        self._logging_task = task.LoopingCall(self.log_status)
Example #24
0
    def __init__(self, manager):
        """SQLAlchemy backend initialization: engine, models, session.

        Optionally drops all tables and/or clears existing content,
        depending on the SQLALCHEMYBACKEND_* settings.

        :param manager: frontier manager providing ``settings``.
        """
        self.manager = manager

        # Get settings
        settings = manager.settings
        engine = settings.get('SQLALCHEMYBACKEND_ENGINE', DEFAULT_ENGINE)
        engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO', DEFAULT_ENGINE_ECHO)
        drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES', DEFAULT_DROP_ALL_TABLES)
        clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT', DEFAULT_CLEAR_CONTENT)
        models = settings.get('SQLALCHEMYBACKEND_MODELS', DEFAULT_MODELS)

        # Create engine
        self.engine = create_engine(engine, echo=engine_echo)

        # Load models: dict comprehension instead of dict([...]) (C404).
        self.models = {name: load_object(klass) for name, klass in models.items()}

        # Drop tables if we have to
        if drop_all_tables:
            Base.metadata.drop_all(self.engine)
        Base.metadata.create_all(self.engine)

        # Create session
        self.Session = sessionmaker()
        self.Session.configure(bind=self.engine)
        self.session = self.Session()

        # Clear content if we have to; iterate values() since the table
        # name is unused (PERF102).
        if clear_content:
            for table in Base.metadata.tables.values():
                self.session.execute(table.delete())
Example #25
0
    def __init__(self, manager):
        """Cassandra backend initialization: cluster, session, models.

        :param manager: frontier manager providing ``settings``.
        """
        self.manager = manager
        settings = manager.settings
        cluster_ips = settings.get('CASSANDRABACKEND_CLUSTER_IPS')      # Format: ['192.168.0.1', '192.168.0.2']
        cluster_port = settings.get('CASSANDRABACKEND_CLUSTER_PORT')
        keyspace = settings.get('CASSANDRABACKEND_KEYSPACE')
        keyspace_create = settings.get('CASSANDRABACKEND_CREATE_KEYSPACE_IF_NOT_EXISTS')                # Default: true
        models = settings.get('CASSANDRABACKEND_MODELS')

        self.cluster = Cluster(cluster_ips, cluster_port)
        # Dict comprehension instead of dict([...]) (C404).
        self.models = {name: load_object(klass) for name, klass in models.items()}

        self.session = self.cluster.connect()
        # Return rows as dicts instead of named tuples.
        self.session.row_factory = dict_factory

        if keyspace_create:
            query = """CREATE KEYSPACE IF NOT EXISTS \"%s\"
                        WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3}""" % (keyspace, )
            self.session.execute(query)
        self.session.set_keyspace(keyspace)
        connection.set_session(self.session)

        # Components are created lazily by their accessors.
        self._metadata = None
        self._queue = None
        self._states = None
Example #26
0
 def __init__(self, crawler):
     """Hook the stats exporter up to the message-bus stats log."""
     settings = ScrapySettingsAdapter(crawler.settings)
     self.partition_id = settings.get('SPIDER_PARTITION_ID')
     # XXX could later reuse the spider's own producer
     # (crawler->engine->slot->scheduler->frontier->manager->backend->_producer),
     # but the topic is hard-coded in the current scheme, so that needs
     # preliminary changes in Frontera itself.
     bus_cls = load_object(settings.get('MESSAGE_BUS'))
     stats_log = bus_cls(settings).stats_log()
     if not stats_log:
         raise NotConfigured
     self.stats_producer = stats_log.producer()
     self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60)
     encoder_cls = load_object(settings.get('MESSAGE_BUS_CODEC') + ".Encoder")
     self._stats_encoder = encoder_cls(request_model=None)  # requests are never encoded here
     self._export_stats_task = None
Example #27
0
 def __init__(self, crawler):
     """Hook the stats exporter up to the message-bus stats log."""
     settings = ScrapySettingsAdapter(crawler.settings)
     self.partition_id = settings.get('SPIDER_PARTITION_ID')
     # XXX could later reuse the spider's own producer
     # (crawler->engine->slot->scheduler->frontier->manager->backend->_producer),
     # but the topic is hard-coded in the current scheme, so that needs
     # preliminary changes in Frontera itself.
     bus_cls = load_object(settings.get('MESSAGE_BUS'))
     stats_log = bus_cls(settings).stats_log()
     if not stats_log:
         raise NotConfigured
     self.stats_producer = stats_log.producer()
     self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60)
     encoder_cls = load_object(settings.get('MESSAGE_BUS_CODEC') + ".Encoder")
     self._stats_encoder = encoder_cls(request_model=None)  # requests are never encoded here
     self._export_stats_task = None
Example #28
0
 def __init__(self, strategy_class, strategy_args, scoring_stream):
     """Wire the crawling strategy to a scoring stream and states context."""
     # Fall back to a local score stream when none is supplied.
     # NOTE(review): assumes self.backend is already set before this runs — confirm.
     self._scoring_stream = scoring_stream or LocalUpdateScoreStream(self.backend.queue)
     self._states_context = StatesContext(self.backend.states)
     # A dotted path is accepted in place of the class object itself.
     cls = load_object(strategy_class) if isinstance(strategy_class, str) else strategy_class
     self._strategy = cls.from_worker(
         self, strategy_args, self._scoring_stream, self._states_context)
Example #29
0
    def __init__(self, settings, is_add_seeds_mode):
        """Initialize a strategy worker consuming one spider log partition.

        :param settings: frontera settings object
        :param is_add_seeds_mode: when True, skip spider log consumption
            and only produce to the scoring log (seed injection mode)
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not an int
        """
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or type(partition_id) is not int:
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        scoring_log = mb.scoring_log()
        self.add_seeds_mode = is_add_seeds_mode
        if not self.add_seeds_mode:
            spider_log = mb.spider_log()
            self.consumer = spider_log.consumer(partition_id=partition_id,
                                                type=b'sw')
        self.scoring_log_producer = scoring_log.producer()

        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")

        request_model = load_object(settings.get('REQUEST_MODEL'))
        response_model = load_object(settings.get('RESPONSE_MODEL'))
        self._decoder = decoder_cls(request_model, response_model)
        self._encoder = encoder_cls(request_model)

        self.update_score = MessageBusUpdateScoreStream(
            self.scoring_log_producer, self._encoder)
        manager = WorkerFrontierManager.from_settings(
            settings, strategy_worker=True, scoring_stream=self.update_score)

        # Assigned once here; previously this was also set (redundantly)
        # inside the `not add_seeds_mode` branch above.
        self.consumer_batch_size = settings.get(
            'SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.stats = defaultdict(int)
        self.backend = manager.backend
        self.workflow = BatchedWorkflow(manager, self.update_score, self.stats,
                                        0)
        self.task = LoopingCall(self.work)
        self._logging_task = LoopingCall(self.log_status)
        self._flush_states_task = LoopingCall(self.flush_states)
        self._flush_interval = settings.get("SW_FLUSH_INTERVAL")
        logger.info(
            "Strategy worker is initialized and consuming partition %d",
            partition_id)
Example #30
0
 def __init__(self, manager):
     """In-memory backend: metadata store, states cache, partitioned queue."""
     self.manager = manager
     settings = manager.settings
     self._metadata = MemoryMetadata()
     self._states = MemoryStates(settings.get("STATE_CACHE_SIZE"))
     # The partitioner class is configurable; it receives the full id list.
     partitioner_cls = load_object(settings.get('SPIDER_FEED_PARTITIONER'))
     partition_ids = list(range(settings.get('SPIDER_FEED_PARTITIONS')))
     self._partitioner = partitioner_cls(partition_ids)
     self._queue = self._create_queue(settings)
     self._id = 0
Example #31
0
 def __init__(self, settings, *args, **kwargs):
     """Initialize the wrapped class, then attach stats export when available."""
     super(StatsExportMixin, self).__init__(settings, *args, **kwargs)
     bus = load_object(settings.get('MESSAGE_BUS'))(settings)
     stats_log = bus.stats_log()
     # FIXME can be removed after implementing stats_log for ZeroMQ bus
     if not stats_log:
         return
     self.stats_producer = stats_log.producer()
     self._stats_tags = self.get_stats_tags(settings, *args, **kwargs)
     # Export period in seconds; defaults to 60.
     self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60)
     self._export_stats_task = LoopingCall(self.export_stats)
Example #32
0
 def __init__(self, settings, *args, **kwargs):
     """Initialize the wrapped class, then attach stats export when available."""
     super(StatsExportMixin, self).__init__(settings, *args, **kwargs)
     message_bus = load_object(settings.get('MESSAGE_BUS'))(settings)
     stats_log = message_bus.stats_log()
     # FIXME can be removed after implementing stats_log for ZeroMQ bus
     if not stats_log:
         return
     self.stats_producer = stats_log.producer()
     self._stats_tags = self.get_stats_tags(settings, *args, **kwargs)
     # Export period in seconds; defaults to 60.
     self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60)
     self._export_stats_task = LoopingCall(self.export_stats)
Example #33
0
 def __init__(self, settings=None):
     """Fake crawler exposing just enough engine/downloader state for tests."""
     self.settings = settings or Settings()
     self.stats = load_object(self.settings['STATS_CLASS'])(self)
     # Anonymous attribute holder standing in for real Scrapy components.
     holder = type('class', (object,), {})
     downloader = holder()
     downloader.slots = {}
     downloader.domain_concurrency = self.settings.get('CONCURRENT_REQUESTS_PER_DOMAIN')
     downloader.ip_concurrency = self.settings.get('CONCURRENT_REQUESTS_PER_IP')
     downloader.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
     self.engine = holder()
     self.engine.downloader = downloader
Example #34
0
 def __init__(self, settings=None):
     """Fake crawler exposing just enough engine/downloader state for tests."""
     self.settings = settings or Settings()
     self.stats = load_object(self.settings['STATS_CLASS'])(self)
     # Anonymous attribute holder standing in for real Scrapy components.
     dummy_class = type('class', (object,), {})
     downloader = dummy_class()
     downloader.slots = {}
     downloader.domain_concurrency = self.settings.get('CONCURRENT_REQUESTS_PER_DOMAIN')
     downloader.ip_concurrency = self.settings.get('CONCURRENT_REQUESTS_PER_IP')
     self.engine = dummy_class()
     self.engine.downloader = downloader
     self.engine.downloader.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
Example #35
0
    def __init__(self, request_model, response_model, settings=None):
        """Resolve and validate the request/response model classes.

        :param request_model: dotted path to a Request subclass
        :param response_model: dotted path to a Response subclass
        :param settings: optional settings object; defaults to Settings()
        """
        # Settings
        self._settings = settings or Settings()

        # Logger
        self._logger = logging.getLogger("manager")

        # Log frontier manager starting
        self._logger.info('-' * 80)
        self._logger.info('Starting Frontier Manager...')

        # Load request model
        self._request_model = load_object(request_model)
        assert issubclass(self._request_model, models.Request), "Request model '%s' must subclass 'Request'" % \
                                                                self._request_model.__name__

        # Load response model
        self._response_model = load_object(response_model)
        assert issubclass(self._response_model, models.Response), "Response model '%s' must subclass 'Response'" % \
                                                                  self._response_model.__name__
Example #36
0
    def __init__(self, request_model, response_model, settings=None):
        """Resolve and validate the request/response model classes.

        :param request_model: dotted path to a Request subclass
        :param response_model: dotted path to a Response subclass
        :param settings: optional settings object; defaults to Settings()
        """
        # Settings
        self._settings = settings or Settings()

        # Logger
        self._logger = logging.getLogger("manager")

        # Log frontier manager starting
        self._logger.info('-'*80)
        self._logger.info('Starting Frontier Manager...')

        # Load request model
        self._request_model = load_object(request_model)
        assert issubclass(self._request_model, models.Request), "Request model '%s' must subclass 'Request'" % \
                                                                self._request_model.__name__

        # Load response model
        self._response_model = load_object(response_model)
        assert issubclass(self._response_model, models.Response), "Response model '%s' must subclass 'Response'" % \
                                                                  self._response_model.__name__
Example #37
0
 def __init__(self, manager):
     """SQLAlchemy backend: create engine, load model classes, bind sessions.

     Reads SQLALCHEMYBACKEND_* settings from the manager's settings.
     """
     self.manager = manager
     settings = manager.settings
     engine = settings.get('SQLALCHEMYBACKEND_ENGINE')
     engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO')
     models = settings.get('SQLALCHEMYBACKEND_MODELS')
     self.engine = create_engine(engine, echo=engine_echo)
     # Dict comprehension instead of dict([...]) over a list comprehension.
     self.models = {name: load_object(klass) for name, klass in models.items()}
     self.session_cls = sessionmaker()
     self.session_cls.configure(bind=self.engine)
     # Components are created lazily elsewhere.
     self._metadata = None
     self._queue = None
     self._states = None
Example #38
0
    def __init__(self, settings):
        """ZeroMQ message bus: socket config, partitioners and HWM limits."""
        self.logger = getLogger("messagebus.zeromq")
        self.context = Context()
        self.socket_config = SocketConfig(settings.get('ZMQ_ADDRESS'),
                                          settings.get('ZMQ_BASE_PORT'))
        self.spider_partition = settings.get('SPIDER_PARTITION_ID')

        if settings.get('QUEUE_HOSTNAME_PARTITIONING'):
            self.logger.warning(
                'QUEUE_HOSTNAME_PARTITIONING is deprecated, use SPIDER_FEED_PARTITIONER instead.'
            )
            settings.set(
                'SPIDER_FEED_PARTITIONER',
                'frontera.contrib.backends.partitioners.Crc32NamePartitioner')

        # list(range(...)) replaces the identity comprehensions (C416).
        self.spider_log_partitions = list(
            range(settings.get('SPIDER_LOG_PARTITIONS')))
        spider_log_partitioner_cls = load_object(
            settings.get('SPIDER_LOG_PARTITIONER'))
        self.spider_log_partitioner = spider_log_partitioner_cls(
            self.spider_log_partitions)

        self.spider_feed_partitions = list(
            range(settings.get('SPIDER_FEED_PARTITIONS')))
        spider_feed_partitioner_cls = load_object(
            settings.get('SPIDER_FEED_PARTITIONER'))
        self.spider_feed_partitioner = spider_feed_partitioner_cls(
            self.spider_feed_partitions)

        # Send HWM leaves 20% headroom over a full batch per partition.
        self.spider_feed_sndhwm = int(
            settings.get('MAX_NEXT_REQUESTS') *
            len(self.spider_feed_partitions) * 1.2)
        self.spider_feed_rcvhwm = int(settings.get('MAX_NEXT_REQUESTS') * 2.0)
        self.max_next_requests = int(settings.get('MAX_NEXT_REQUESTS'))
        if self.socket_config.is_ipv6:
            self.context.zeromq.setsockopt(zmq.IPV6, True)
Example #39
0
    def __init__(self, request_model, response_model, logger, settings=None):
        """Resolve the logger and the request/response model classes.

        :param request_model: dotted path to a Request subclass
        :param response_model: dotted path to a Response subclass
        :param logger: dotted path to a FrontierLogger implementation
        :param settings: optional settings object; defaults to Settings()
        """
        # Settings
        self._settings = settings or Settings()

        # Logger
        self._logger = load_object(logger)(self._settings)
        assert isinstance(self._logger, FrontierLogger), "logger '%s' must subclass FrontierLogger" % \
                                                         self._logger.__class__.__name__

        # Log frontier manager starting
        self.logger.manager.debug('-' * 80)
        self.logger.manager.debug('Starting Frontier Manager...')

        # Load request model
        self._request_model = load_object(request_model)
        assert issubclass(self._request_model, models.Request), "Request model '%s' must subclass 'Request'" % \
                                                                self._request_model.__name__

        # Load response model
        self._response_model = load_object(response_model)
        assert issubclass(self._response_model, models.Response), "Response model '%s' must subclass 'Response'" % \
                                                                  self._response_model.__name__
Example #40
0
    def __init__(self, request_model, response_model, logger, settings=None):
        """Resolve the logger and the request/response model classes.

        :param request_model: dotted path to a Request subclass
        :param response_model: dotted path to a Response subclass
        :param logger: dotted path to a FrontierLogger implementation
        :param settings: optional settings object; defaults to Settings()
        """
        # Settings
        self._settings = settings or Settings()

        # Logger
        self._logger = load_object(logger)(self._settings)
        assert isinstance(self._logger, FrontierLogger), "logger '%s' must subclass FrontierLogger" % \
                                                         self._logger.__class__.__name__

        # Log frontier manager starting
        self.logger.manager.debug('-'*80)
        self.logger.manager.debug('Starting Frontier Manager...')

        # Load request model
        self._request_model = load_object(request_model)
        assert issubclass(self._request_model, models.Request), "Request model '%s' must subclass 'Request'" % \
                                                                self._request_model.__name__

        # Load response model
        self._response_model = load_object(response_model)
        assert issubclass(self._response_model, models.Response), "Response model '%s' must subclass 'Response'" % \
                                                                  self._response_model.__name__
Example #41
0
 def __init__(self, manager):
     """SQLAlchemy backend: create engine, load model classes, bind sessions.

     Reads SQLALCHEMYBACKEND_* settings from the manager's settings.
     """
     self.manager = manager
     settings = manager.settings
     engine = settings.get('SQLALCHEMYBACKEND_ENGINE')
     engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO')
     models = settings.get('SQLALCHEMYBACKEND_MODELS')
     self.engine = create_engine(engine, echo=engine_echo)
     # Dict comprehension instead of dict([...]) over a list comprehension.
     self.models = {name: load_object(klass)
                    for name, klass in models.items()}
     self.session_cls = sessionmaker()
     self.session_cls.configure(bind=self.engine)
     # Components are created lazily elsewhere.
     self._metadata = None
     self._queue = None
     self._states = None
Example #42
0
    def __init__(self, settings, no_batches, no_incoming, no_scoring, **kwargs):
        """DB worker: message bus, codecs, frontier manager and work slot.

        :param no_batches: disable generation of new batches
        :param no_incoming: disable consumption of the spider log
        :param no_scoring: disable consumption of the scoring log
        :param kwargs: extra options forwarded to the Slot
        """

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.message_bus = messagebus(settings)

        self._manager = WorkerFrontierManager.from_settings(settings, db_worker=True)
        self.backend = self._manager.backend

        # Encoder/Decoder classes live next to the configured codec module.
        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path+".Encoder")
        decoder_cls = load_object(codec_path+".Decoder")
        self._encoder = encoder_cls(self._manager.request_model)
        self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model)

        slot_kwargs = {'no_batches': no_batches,
                       'no_incoming': no_incoming,
                       'no_scoring': no_scoring}
        slot_kwargs.update(**kwargs)
        self.slot = Slot(self, settings, **slot_kwargs)

        self.stats = defaultdict(int)
        self.job_id = 0
        self._logging_task = task.LoopingCall(self.log_status)
Example #43
0
    def __init__(self, manager):
        """Cassandra backend: connect a cluster session and sync model tables.

        Reads CASSANDRABACKEND_* settings from the manager's settings.
        """
        self.manager = manager
        settings = manager.settings
        cluster_ips = settings.get('CASSANDRABACKEND_CLUSTER_IPS')
        cluster_port = settings.get('CASSANDRABACKEND_CLUSTER_PORT')
        drop_all_tables = settings.get('CASSANDRABACKEND_DROP_ALL_TABLES')
        keyspace = settings.get('CASSANDRABACKEND_KEYSPACE')
        keyspace_create = settings.get('CASSANDRABACKEND_CREATE_KEYSPACE_IF_NOT_EXISTS')
        models = settings.get('CASSANDRABACKEND_MODELS')
        crawl_id = settings.get('CASSANDRABACKEND_CRAWL_ID')
        generate_stats = settings.get('CASSANDRABACKEND_GENERATE_STATS')

        self.models = {name: load_object(klass) for name, klass in models.items()}

        self.cluster = Cluster(
            contact_points=cluster_ips,
            port=cluster_port,
            compression=True,
            default_retry_policy=RetryPolicy(),
            reconnection_policy=ConstantReconnectionPolicy(10, 100)
        )

        self.session = self.cluster.connect()
        self.session.row_factory = dict_factory
        self.session.encoder.mapping[dict] = self.session.encoder.cql_encode_map_collection
        self.crawl_id = crawl_id
        self.generate_stats = generate_stats

        if keyspace_create:
            query = """CREATE KEYSPACE IF NOT EXISTS \"%s\"
                        WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3}""" % (keyspace, )
            self.session.execute(query)

        self.session.set_keyspace(keyspace)

        connection.set_session(self.session)

        # dict.iteritems() is Python 2 only; .items() works on both versions
        # and matches how models are iterated elsewhere in this file.
        if drop_all_tables:
            for key, value in self.models.items():
                drop_table(value)

        for key, value in self.models.items():
            # Skip the crawl-stats table when stats generation is disabled.
            if (self.generate_stats is False and key != 'CrawlStatsModel') or self.generate_stats is True:
                sync_table(value)

        self._metadata = Metadata(self.session, self.models['MetadataModel'], self.crawl_id, self.generate_stats)
        self._states = States(self.session, self.models['StateModel'],
                              settings.get('STATE_CACHE_SIZE_LIMIT'), self.crawl_id)
        self._queue = self._create_queue(settings)
Example #44
0
 def __init__(self, manager):
     """Message-bus backend: codecs, spider log producer and feed consumer."""
     self._manager = manager
     settings = self._manager.settings
     bus_cls = load_object(settings.get('MESSAGE_BUS'))
     self.mb = bus_cls(settings)
     self._encoder = Encoder(manager.request_model,
                             send_body=settings.get('STORE_CONTENT'))
     self._decoder = Decoder(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   manager.logger.manager.debug)
Example #45
0
 def __init__(self, manager):
     """Message-bus backend: codecs, spider log producer and feed consumer."""
     self._manager = manager
     settings = self._manager.settings
     messagebus = load_object(settings.get('MESSAGE_BUS'))
     self.mb = messagebus(settings)
     store_content = settings.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=store_content)
     self._decoder = Decoder(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     # Buffer that throttles requests to overused (slow/overloaded) keys.
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   manager.logger.manager.debug)
Example #46
0
 def __init__(self, manager):
     """Message-bus backend with configurable codec and overuse buffering."""
     settings = manager.settings
     bus_cls = load_object(settings.get('MESSAGE_BUS'))
     self.mb = bus_cls(settings)
     # Encoder/Decoder classes live next to the configured codec module.
     codec_path = settings.get('MESSAGE_BUS_CODEC')
     encoder_cls = load_object(codec_path + ".Encoder")
     decoder_cls = load_object(codec_path + ".Decoder")
     self._encoder = encoder_cls(manager.request_model,
                                 send_body=settings.get('STORE_CONTENT'))
     self._decoder = decoder_cls(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
     num_partitions = settings.get('SPIDER_FEED_PARTITIONS')
     if not (0 <= self.partition_id < num_partitions):
         raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._logger = logging.getLogger("messagebus-backend")
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   max_per_key=settings.get('OVERUSED_MAX_PER_KEY'),
                                   keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"),
                                   max_keys=settings.get('OVERUSED_MAX_KEYS'),
                                   keep_keys=settings.get('OVERUSED_KEEP_KEYS'))
     self._logger.info("Consuming from partition id %d", self.partition_id)
Example #47
0
    def __init__(self, settings, is_add_seeds_mode):
        """Initialize a strategy worker consuming one spider log partition.

        :param settings: frontera settings object
        :param is_add_seeds_mode: when True, skip spider log consumption
            and only produce to the scoring log (seed injection mode)
        :raises AttributeError: if SCORING_PARTITION_ID is unset or not an int
        """
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or type(partition_id) is not int:
            raise AttributeError("Scoring worker partition id isn't set.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        scoring_log = mb.scoring_log()
        self.add_seeds_mode = is_add_seeds_mode
        if not self.add_seeds_mode:
            spider_log = mb.spider_log()
            self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw')
        self.scoring_log_producer = scoring_log.producer()

        codec_path = settings.get('MESSAGE_BUS_CODEC')
        encoder_cls = load_object(codec_path + ".Encoder")
        decoder_cls = load_object(codec_path + ".Decoder")

        request_model = load_object(settings.get('REQUEST_MODEL'))
        response_model = load_object(settings.get('RESPONSE_MODEL'))
        self._decoder = decoder_cls(request_model, response_model)
        self._encoder = encoder_cls(request_model)

        self.update_score = MessageBusUpdateScoreStream(self.scoring_log_producer, self._encoder)
        manager = WorkerFrontierManager.from_settings(settings, strategy_worker=True, scoring_stream=self.update_score)

        # Assigned once here; previously this was also set (redundantly)
        # inside the `not add_seeds_mode` branch above.
        self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.stats = defaultdict(int)
        self.backend = manager.backend
        self.workflow = BatchedWorkflow(manager, self.update_score, self.stats, 0)
        self.task = LoopingCall(self.work)
        self._logging_task = LoopingCall(self.log_status)
        self._flush_states_task = LoopingCall(self.flush_states)
        self._flush_interval = settings.get("SW_FLUSH_INTERVAL")
        logger.info("Strategy worker is initialized and consuming partition %d", partition_id)
Example #48
0
 def _load_backend(self, backend, db_worker, strategy_worker):
     """Instantiate the backend class appropriate for the current run mode."""
     cls = load_object(backend)
     assert issubclass(cls, Backend), "backend '%s' must subclass Backend" % cls.__name__
     if issubclass(cls, DistributedBackend):
         if db_worker:
             return cls.db_worker(self)
         if strategy_worker:
             return cls.strategy_worker(self)
         raise RuntimeError("Distributed backends are meant to be used in workers.")
     # Non-distributed backend: running it in a strategy worker is invalid.
     assert not strategy_worker, "In order to distribute backend only DistributedBackend " \
                                 "subclasses are allowed to use."
     if hasattr(cls, 'from_manager'):
         return cls.from_manager(self)
     return cls()
Example #49
0
 def _load_backend(self, backend, db_worker, strategy_worker):
     """Instantiate the backend class appropriate for the current run mode.

     :param backend: dotted path to a Backend subclass
     :param db_worker: True when running inside a DB worker
     :param strategy_worker: True when running inside a strategy worker
     """
     cls = load_object(backend)
     assert issubclass(cls, Backend), "backend '%s' must subclass Backend" % cls.__name__
     if issubclass(cls, DistributedBackend):
         if db_worker:
             return cls.db_worker(self)
         if strategy_worker:
             return cls.strategy_worker(self)
         raise RuntimeError("Distributed backends are meant to be used in workers.")
     else:
         assert not strategy_worker, "In order to distribute backend only DistributedBackend " \
                                     "subclasses are allowed to use."
     # Non-distributed backend: prefer the from_manager factory if provided.
     if hasattr(cls, 'from_manager'):
         return cls.from_manager(self)
     else:
         return cls()
Example #50
0
 def _load_backend(self, backend, db_worker, strategy_worker):
     """Instantiate the backend class appropriate for the current run mode.

     Unlike older variants, a distributed backend outside any worker falls
     back to its local() factory instead of raising.
     """
     # FIXME remove obsolete
     cls = load_object(backend)
     assert issubclass(
         cls, Backend), "backend '%s' must subclass Backend" % cls.__name__
     if issubclass(cls, DistributedBackend):
         if db_worker:
             return cls.db_worker(self)
         if strategy_worker:
             return cls.strategy_worker(self)
         return cls.local(self)
     else:
         assert not strategy_worker, "In order to distribute backend only DistributedBackend " \
                                     "subclasses are allowed to use"
     # Non-distributed backend: prefer the from_manager factory if provided.
     if hasattr(cls, 'from_manager'):
         return cls.from_manager(self)
     else:
         return cls()
Example #51
0
 def __init__(self, manager):
     """Message-bus backend: wire codecs, producer, consumer and buffer."""
     settings = manager.settings
     bus_cls = load_object(settings.get('MESSAGE_BUS'))
     self.mb = bus_cls(settings)
     self._encoder = Encoder(manager.request_model,
                             send_body=settings.get('STORE_CONTENT'))
     self._decoder = Decoder(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
     num_partitions = settings.get('SPIDER_FEED_PARTITIONS')
     if not (0 <= self.partition_id < num_partitions):
         raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.")
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._logger = logging.getLogger("messagebus-backend")
     self._buffer = OverusedBuffer(self._get_next_requests, self._logger.debug)
     self._logger.info("Consuming from partition id %d", self.partition_id)
Example #52
0
 def __init__(self, manager):
     """Message-bus backend consuming a single spider feed partition."""
     settings = manager.settings
     messagebus = load_object(settings.get('MESSAGE_BUS'))
     self.mb = messagebus(settings)
     store_content = settings.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=store_content)
     self._decoder = Decoder(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
     # Partition id must address an existing spider feed partition.
     if self.partition_id < 0 or self.partition_id >= settings.get(
             'SPIDER_FEED_PARTITIONS'):
         raise ValueError(
             "Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS."
         )
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._logger = logging.getLogger("messagebus-backend")
     # Buffer that throttles requests to overused (slow/overloaded) keys.
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   self._logger.debug)
     self._logger.info("Consuming from partition id %d", self.partition_id)
Example #53
0
 def test_load_variable(self):
     """load_object resolves a module-level variable by dotted path."""
     assert load_object('tests.mocks.load_objects.mock_variable') == 'test'
Example #54
0
    # Build the strategy worker command-line interface.
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    # The command-line strategy takes precedence over the settings file.
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    # Partition id must address an existing spider log partition.
    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    # Prefer a logging config file when configured and present on disk.
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    worker = StrategyWorker(settings, strategy_class)
Example #55
0
 def test_load_class(self):
     """load_object resolves a class by dotted path."""
     assert load_object('tests.mocks.load_objects.MockClass').val == 10
Example #56
0
 def test_load_instance(self):
     """load_object resolves a module-level instance by dotted path."""
     assert load_object('tests.mocks.load_objects.mock_instance').val == 5
Example #57
0
 def test_load_function(self):
     """load_object resolves a function; calling it returns its value."""
     assert load_object('tests.mocks.load_objects.mock_function')() == 2
Example #58
0
 def test_value_error(self):
     """A bare module name without an object part raises ValueError."""
     with pytest.raises(ValueError) as excinfo:
         load_object('frontera')
     assert str(excinfo.value) == "Error loading object 'frontera': not a full path"
Example #59
0
 def test_name_error(self):
     """Requesting a missing attribute of an existing module raises NameError."""
     with pytest.raises(NameError) as info:
         load_object('tests.mocks.load_objects.non_existent_object')
     assert str(info.value) == ("Module 'tests.mocks.load_objects' doesn't define"
                                " any object named 'non_existent_object'")