def __init__(self):
    """Create the message bus and every producer/consumer endpoint.

    The bus is built from settings with a single spider feed partition
    and hostname partitioning enabled.  Endpoints are then created in
    three groups (``sw``, ``db`` and spider), with a short sleep after
    each group.
    """
    bus_settings = Settings()
    bus_settings.set("SPIDER_FEED_PARTITIONS", 1)
    bus_settings.set("QUEUE_HOSTNAME_PARTITIONING", True)
    self.mb = MessageBus(bus_settings)

    spider_log = self.mb.spider_log()

    # sw: consumes spider log partition 0 and produces to the scoring log.
    self.sw_sl_c = spider_log.consumer(partition_id=0, type="sw")
    scoring_log = self.mb.scoring_log()
    self.sw_us_p = scoring_log.producer()
    # NOTE(review): presumably gives the endpoints time to connect -- confirm.
    sleep(0.1)

    # db: consumes the spider log (no fixed partition) and the scoring log,
    # and produces to the spider feed.
    self.db_sl_c = spider_log.consumer(partition_id=None, type="db")
    self.db_us_c = scoring_log.consumer()
    spider_feed = self.mb.spider_feed()
    self.db_sf_p = spider_feed.producer()
    sleep(0.1)

    # spider: produces to the spider log and consumes spider feed partition 0.
    self.sp_sl_p = spider_log.producer()
    self.sp_sf_c = spider_feed.consumer(0)
    sleep(0.1)
def __init__(self):
    """Create the message bus and every producer/consumer endpoint.

    The bus is built from settings with a single spider feed partition
    and hostname partitioning enabled.  Endpoints are then created in
    three groups (``sw``, ``db`` and spider), with a short sleep after
    each group.
    """
    bus_settings = Settings()
    bus_settings.set('SPIDER_FEED_PARTITIONS', 1)
    bus_settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
    self.mb = MessageBus(bus_settings)

    spider_log = self.mb.spider_log()

    # sw: consumes spider log partition 0 and produces to the scoring log.
    self.sw_sl_c = spider_log.consumer(partition_id=0, type='sw')
    scoring_log = self.mb.scoring_log()
    self.sw_us_p = scoring_log.producer()
    # NOTE(review): presumably gives the endpoints time to connect -- confirm.
    sleep(0.1)

    # db: consumes the spider log (no fixed partition) and the scoring log,
    # and produces to the spider feed.
    self.db_sl_c = spider_log.consumer(partition_id=None, type='db')
    self.db_us_c = scoring_log.consumer()
    spider_feed = self.mb.spider_feed()
    self.db_sf_p = spider_feed.producer()
    sleep(0.1)

    # spider: produces to the spider log and consumes spider feed partition 0.
    self.sp_sl_p = spider_log.producer()
    self.sp_sf_c = spider_feed.consumer(0)
    sleep(0.1)
def setup_environment():
    """Parse the command line and prepare the strategy worker environment.

    Reads ``--config``, ``--log-level``, ``--strategy`` and
    ``--partition-id``, loads the settings module, resolves the crawling
    strategy class, validates the partition id, stores it back into the
    settings as ``SCORING_PARTITION_ID`` and configures logging.

    Returns:
        A ``(settings, strategy_class)`` tuple.

    Raises:
        ValueError: if no strategy class path can be found, or if the
            partition id is missing or outside
            ``[0, SPIDER_LOG_PARTITIONS)``.
    """
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()

    settings = Settings(module=args.config)

    # The command line option wins over the settings file.
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    # `is not None` keeps an explicit partition id of 0 from being ignored.
    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id is None:
        # Without this guard the range check below would fail with an
        # unrelated TypeError instead of pointing at the misconfiguration.
        raise ValueError("Couldn't locate partition id. Please supply it either using command line option or "
                         "settings file.")
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    # Use the logging config file when it exists; otherwise fall back to
    # basic logging at the requested level plus the console handler.
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    return settings, strategy_class
def setup_environment():
    """Parse the command line and prepare the strategy worker environment.

    Reads ``--config``, ``--log-level``, ``--strategy`` and
    ``--partition-id``, loads the settings module, resolves the crawling
    strategy class, validates the partition id, stores it back into the
    settings as ``SCORING_PARTITION_ID`` and configures logging.

    Returns:
        A ``(settings, strategy_class)`` tuple.

    Raises:
        ValueError: if no strategy class path can be found, or if the
            partition id is missing or outside
            ``[0, SPIDER_LOG_PARTITIONS)``.
    """
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()

    settings = Settings(module=args.config)

    # The command line option wins over the settings file.
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    # `is not None` keeps an explicit partition id of 0 from being ignored.
    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id is None:
        # Without this guard the range check below would fail with an
        # unrelated TypeError instead of pointing at the misconfiguration.
        raise ValueError("Couldn't locate partition id. Please supply it either using command line option or "
                         "settings file.")
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    # Use the logging config file when it exists; otherwise fall back to
    # basic logging at the requested level plus the console handler.
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    return settings, strategy_class
def test_close_manager(self):
    """A strategy-worker manager configured with the Redis backend can be closed."""
    config = Settings(module='frontera.settings.default_settings')
    config.set('BACKEND', 'frontera.contrib.backends.redis_backend.RedisBackend')

    frontier_manager = WorkerFrontierManager.from_settings(config, strategy_worker=True)

    # The configured class path must resolve to the Redis backend class.
    backend_class = frontier_manager.backend.__class__
    self.assertEqual(RedisBackend, backend_class)

    frontier_manager.close()
def setup_environment():
    """Parse the command line and prepare the strategy worker environment.

    Loads the settings module named by ``--config`` and overrides the
    ``STRATEGY``, ``SCORING_PARTITION_ID``, ``JSONRPC_PORT`` and
    ``STRATEGY_ARGS`` settings from the command line where given, then
    configures logging.

    Returns:
        A ``(settings, add_seeds, seeds_url)`` tuple, where the last two
        items are the raw ``--add-seeds`` and ``--seeds-url`` values.

    Raises:
        ValueError: if no strategy class path can be found, or if the
            partition id is missing or outside
            ``[0, SPIDER_LOG_PARTITIONS)``.
    """
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    parser.add_argument('--port', type=int, help="Json Rpc service port to listen.")
    parser.add_argument('--args', '-a', nargs='*', type=str,
                        help="Optional arguments for crawling strategy, "
                             "in a form of key=value separated with space")
    parser.add_argument('--add-seeds', action='store_true',
                        help="Run in add seeds mode. Worker finishes after running "
                             "of strategy add_seeds method")
    parser.add_argument('--seeds-url', type=str,
                        help="Seeds url. S3 and native urlopen schemas are currently "
                             "supported, implies add seeds run mode")
    args = parser.parse_args()

    settings = Settings(module=args.config)

    # The command line option wins over the settings file.
    strategy_classpath = args.strategy if args.strategy else settings.get('STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    settings.set('STRATEGY', strategy_classpath)

    # `is not None` keeps an explicit partition id of 0 from being ignored.
    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id is None:
        # Without this guard the range check below would fail with an
        # unrelated TypeError instead of pointing at the misconfiguration.
        raise ValueError("Couldn't locate partition id. Please supply it either using command line option or "
                         "settings file.")
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS."
                         % partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    if args.port:
        settings.set('JSONRPC_PORT', args.port)

    # Each "key=value" item is split at the first "="; an item with no value
    # (or an empty one) maps its key to None.
    strategy_args = {}
    for arg in args.args or ():
        key, _, value = arg.partition("=")
        strategy_args[key] = value or None
    settings.set("STRATEGY_ARGS", strategy_args)

    # Use the logging config file when it exists; otherwise fall back to
    # basic logging at the requested level plus the console handler.
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)

    return settings, args.add_seeds, args.seeds_url
def setUp(self):
    """Prepare Kafka topics and the message bus endpoints used by the tests.

    Configures logging, makes sure the three frontier topics exist on the
    broker at ``127.0.0.1:9092``, then builds a ``KafkaMessageBus`` with one
    spider feed partition and one spider log partition and creates the
    ``sw``, ``db`` and spider producers and consumers.  Every consumer is
    wrapped in ``KafkaConsumerPolling``.
    """
    logging.basicConfig()
    handler = logging.StreamHandler(stdout)
    logger = logging.getLogger("kafka")
    logger.setLevel(logging.INFO)
    logger.addHandler(handler)

    self.logger = logging.getLogger("tester")
    self.logger.debug("setup started")

    kafka_location = "127.0.0.1:9092"
    client = KafkaClient(kafka_location)
    try:
        for topic in ("frontier-todo", "frontier-done", "frontier-score"):
            client.ensure_topic_exists(topic)
    finally:
        # Release the client even when a topic cannot be ensured, so a
        # failing setUp does not leak the broker connection.
        client.close()

    settings = Settings()
    settings.set('KAFKA_LOCATION', kafka_location)
    settings.set('SPIDER_FEED_PARTITIONS', 1)
    settings.set('SPIDER_LOG_PARTITIONS', 1)
    settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
    self.messagebus = KafkaMessageBus(settings)
    spiderlog = self.messagebus.spider_log()

    # sw: consumes spider log partition 0 and produces to the scoring log.
    self.sw_sl_c = KafkaConsumerPolling(
        spiderlog.consumer(partition_id=0, type=b'sw'))
    scoring_log = self.messagebus.scoring_log()
    self.sw_us_p = scoring_log.producer()

    # db: consumes the spider log (no fixed partition) and the scoring log,
    # and produces to the spider feed.
    self.db_sl_c = KafkaConsumerPolling(
        spiderlog.consumer(partition_id=None, type=b'db'))
    self.db_us_c = KafkaConsumerPolling(scoring_log.consumer())
    spider_feed = self.messagebus.spider_feed()
    self.db_sf_p = spider_feed.producer()

    # spider: produces to the spider log and consumes spider feed partition 0.
    self.sp_sl_p = spiderlog.producer()
    self.sp_sf_c = KafkaConsumerPolling(
        spider_feed.consumer(partition_id=0))

    self.logger.debug("init is done")
def setUp(self):
    """Prepare Kafka topics and the message bus endpoints used by the tests.

    Configures logging, makes sure the three frontier topics exist on the
    broker at ``127.0.0.1:9092``, then builds a ``KafkaMessageBus`` with one
    spider feed partition and one spider log partition and creates the
    ``sw``, ``db`` and spider producers and consumers.  Every consumer is
    wrapped in ``KafkaConsumerPolling``.
    """
    logging.basicConfig()
    handler = logging.StreamHandler(stdout)
    logger = logging.getLogger("kafka")
    logger.setLevel(logging.INFO)
    logger.addHandler(handler)

    self.logger = logging.getLogger("tester")
    self.logger.debug("setup started")

    kafka_location = "127.0.0.1:9092"
    client = KafkaClient(kafka_location)
    try:
        for topic in ("frontier-todo", "frontier-done", "frontier-score"):
            client.ensure_topic_exists(topic)
    finally:
        # Release the client even when a topic cannot be ensured, so a
        # failing setUp does not leak the broker connection.
        client.close()

    settings = Settings()
    settings.set('KAFKA_LOCATION', kafka_location)
    settings.set('SPIDER_FEED_PARTITIONS', 1)
    settings.set('SPIDER_LOG_PARTITIONS', 1)
    settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
    self.messagebus = KafkaMessageBus(settings)
    spiderlog = self.messagebus.spider_log()

    # sw: consumes spider log partition 0 and produces to the scoring log.
    self.sw_sl_c = KafkaConsumerPolling(spiderlog.consumer(partition_id=0, type=b'sw'))
    scoring_log = self.messagebus.scoring_log()
    self.sw_us_p = scoring_log.producer()

    # db: consumes the spider log (no fixed partition) and the scoring log,
    # and produces to the spider feed.
    self.db_sl_c = KafkaConsumerPolling(spiderlog.consumer(partition_id=None, type=b'db'))
    self.db_us_c = KafkaConsumerPolling(scoring_log.consumer())
    spider_feed = self.messagebus.spider_feed()
    self.db_sf_p = spider_feed.producer()

    # spider: produces to the spider log and consumes spider feed partition 0.
    self.sp_sl_p = spiderlog.producer()
    self.sp_sf_c = KafkaConsumerPolling(spider_feed.consumer(partition_id=0))

    self.logger.debug("init is done")
def test_kafka_message_bus_integration():
    """Run the message bus tester end to end against a local Kafka broker.

    Ensures the three frontier topics exist on ``127.0.0.1:9092``, then
    drives ``MessageBusTester`` with ``KafkaMessageBus`` and checks the
    counts reported by each activity step.
    """
    kafka_location = "127.0.0.1:9092"
    client = KafkaClient(kafka_location)
    try:
        for topic in ("frontier-todo", "frontier-done", "frontier-score"):
            client.ensure_topic_exists(topic)
    finally:
        # The client is only needed to create topics; close it so the test
        # does not leave a broker connection open.
        client.close()

    logging.basicConfig(level=logging.INFO)

    settings = Settings()
    settings.set('KAFKA_LOCATION', kafka_location)
    settings.set('FRONTIER_GROUP', 'frontier2')
    settings.set('SCORING_TOPIC', "frontier-score")

    tester = MessageBusTester(KafkaMessageBus, settings)
    tester.spider_log_activity(64)
    assert tester.sw_activity() == 64
    assert tester.db_activity(128) == (64, 32)
    assert tester.spider_feed_activity() == 128
def __init__(self):
    """Initialise the parent tester with ``ZMQ_ADDRESS`` set to ``::1``."""
    ipv6_settings = Settings()
    # "::1" is the IPv6 loopback address.
    ipv6_settings.set('ZMQ_ADDRESS', '::1')
    super(IPv6MessageBusTester, self).__init__(ipv6_settings)
# Tail of the DB worker entry script.  `parser` is created before this
# fragment, as is the option behind `args.no_batches` (not visible here).
parser.add_argument('--no-incoming', action='store_true', help='Disables spider log processing.')
parser.add_argument('--no-scoring', action='store_true', help='Disables scoring log processing.')
parser.add_argument('--partitions', type=int, nargs='*', help='Optional partitions range for batch generator')
parser.add_argument('--config', type=str, required=True, help='Settings module name, should be accessible by import.')
parser.add_argument('--log-level', '-L', type=str, default='INFO', help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL.")
parser.add_argument('--port', type=int, help="Json Rpc service port to listen.")
args = parser.parse_args()
settings = Settings(module=args.config)
# A port given on the command line overrides JSONRPC_PORT; it is stored as a
# one-element list.
if args.port:
    settings.set("JSONRPC_PORT", [args.port])
# Use the logging config file when LOGGING_CONFIG names an existing path;
# otherwise fall back to basic logging at the requested level plus the
# console handler.
logging_config_path = settings.get("LOGGING_CONFIG")
if logging_config_path and os.path.exists(logging_config_path):
    fileConfig(logging_config_path, disable_existing_loggers=False)
else:
    logging.basicConfig(level=args.log_level)
    logger.setLevel(args.log_level)
    logger.addHandler(CONSOLE)
# Build the worker, start the JSON-RPC service listening, then run the worker.
worker = DBWorker(settings, args.no_batches, args.no_incoming, args.no_scoring, partitions=args.partitions)
server = WorkerJsonRpcService(worker, settings)
server.start_listening()
worker.run()
def __init__(self):
    """Initialise the parent tester with ``ZMQ_ADDRESS`` set to ``::1``."""
    ipv6_settings = Settings()
    # "::1" is the IPv6 loopback address.
    ipv6_settings.set('ZMQ_ADDRESS', '::1')
    super(IPv6MessageBusTester, self).__init__(ipv6_settings)
def setup_environment():
    """Parse the command line and prepare the strategy worker environment.

    Loads the settings module named by ``--config`` and overrides the
    ``STRATEGY``, ``SCORING_PARTITION_ID``, ``JSONRPC_PORT`` and
    ``STRATEGY_ARGS`` settings from the command line where given, then
    configures logging.

    Returns:
        A ``(settings, add_seeds, seeds_url)`` tuple, where the last two
        items are the raw ``--add-seeds`` and ``--seeds-url`` values.

    Raises:
        ValueError: if no strategy class path can be found, or if the
            partition id is missing or outside
            ``[0, SPIDER_LOG_PARTITIONS)``.
    """
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument(
        '--config',
        type=str,
        required=True,
        help='Settings module name, should be accessible by import')
    parser.add_argument(
        '--log-level',
        '-L',
        type=str,
        default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy',
                        type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id',
                        type=int,
                        help="Instance partition id.")
    parser.add_argument('--port',
                        type=int,
                        help="Json Rpc service port to listen.")
    parser.add_argument('--args',
                        '-a',
                        nargs='*',
                        type=str,
                        help="Optional arguments for crawling strategy, "
                        "in a form of key=value separated with space")
    parser.add_argument(
        '--add-seeds',
        action='store_true',
        help="Run in add seeds mode. Worker finishes after running "
        "of strategy add_seeds method")
    parser.add_argument(
        '--seeds-url',
        type=str,
        help="Seeds url. S3 and native urlopen schemas are currently "
        "supported, implies add seeds run mode")
    args = parser.parse_args()

    settings = Settings(module=args.config)

    # The command line option wins over the settings file.
    strategy_classpath = args.strategy if args.strategy else settings.get(
        'STRATEGY')
    if not strategy_classpath:
        raise ValueError(
            "Couldn't locate strategy class path. Please supply it either using command line option or "
            "settings file.")
    settings.set('STRATEGY', strategy_classpath)

    # `is not None` keeps an explicit partition id of 0 from being ignored.
    partition_id = args.partition_id if args.partition_id is not None else settings.get(
        'SCORING_PARTITION_ID')
    if partition_id is None:
        # Without this guard the range check below would fail with an
        # unrelated TypeError instead of pointing at the misconfiguration.
        raise ValueError(
            "Couldn't locate partition id. Please supply it either using command line option or "
            "settings file.")
    if partition_id >= settings.get(
            'SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError(
            "Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS."
            % partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    if args.port:
        settings.set('JSONRPC_PORT', args.port)

    # Each "key=value" item is split at the first "="; an item with no value
    # (or an empty one) maps its key to None.
    strategy_args = {}
    for arg in args.args or ():
        key, _, value = arg.partition("=")
        strategy_args[key] = value or None
    settings.set("STRATEGY_ARGS", strategy_args)

    # Use the logging config file when it exists; otherwise fall back to
    # basic logging at the requested level plus the console handler.
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)

    return settings, args.add_seeds, args.seeds_url
def setup_subject(partitions):
    """Build the Redis db-worker backend used as the test subject.

    Args:
        partitions: value stored in the ``SPIDER_FEED_PARTITIONS`` setting.

    Returns:
        The result of ``RedisBackend.db_worker`` for a db-worker frontier
        manager created from the default settings with
        ``REDIS_DROP_ALL_TABLES`` enabled.
    """
    config = Settings(module='frontera.settings.default_settings')
    overrides = (
        ('SPIDER_FEED_PARTITIONS', partitions),
        ('REDIS_DROP_ALL_TABLES', True),
    )
    for option, value in overrides:
        config.set(option, value)

    manager = WorkerFrontierManager.from_settings(config, db_worker=True)
    return RedisBackend.db_worker(manager)
def setup_subject(partitions):
    """Build the Redis db-worker backend used as the test subject.

    Args:
        partitions: value stored in the ``SPIDER_FEED_PARTITIONS`` setting.

    Returns:
        The result of ``RedisBackend.db_worker`` for a db-worker frontier
        manager created from the default settings with
        ``REDIS_DROP_ALL_TABLES`` enabled.
    """
    config = Settings(module='frontera.settings.default_settings')
    overrides = (
        ('SPIDER_FEED_PARTITIONS', partitions),
        ('REDIS_DROP_ALL_TABLES', True),
    )
    for option, value in overrides:
        config.set(option, value)

    manager = WorkerFrontierManager.from_settings(config, db_worker=True)
    return RedisBackend.db_worker(manager)
required=True, help='Settings module name, should be accessible by import.') parser.add_argument( '--log-level', '-L', type=str, default='INFO', help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL.") parser.add_argument('--port', type=int, help="Json Rpc service port to listen.") args = parser.parse_args() settings = Settings(module=args.config) if args.port: settings.set("JSONRPC_PORT", [args.port]) logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and os.path.exists(logging_config_path): fileConfig(logging_config_path, disable_existing_loggers=False) else: logging.basicConfig(level=args.log_level) logger.setLevel(args.log_level) logger.addHandler(CONSOLE) worker = DBWorker(settings, args.no_batches, args.no_incoming, args.no_scoring, partitions=args.partitions) server = WorkerJsonRpcService(worker, settings)
help='Settings module name, should be accessible by import') parser.add_argument('--log-level', '-L', type=str, default='INFO', help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL") parser.add_argument('--strategy', type=str, help='Crawling strategy class path') parser.add_argument('--partition-id', type=int, help="Instance partition id.") args = parser.parse_args() settings = Settings(module=args.config) strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY') if not strategy_classpath: raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or " "settings file.") strategy_class = load_object(strategy_classpath) partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID') if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0: raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." % partition_id) settings.set('SCORING_PARTITION_ID', partition_id) logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and exists(logging_config_path): fileConfig(logging_config_path) else: logging.basicConfig(level=args.log_level) logger.setLevel(args.log_level) logger.addHandler(CONSOLE) worker = StrategyWorker(settings, strategy_class) worker.run()
type=int, help="Instance partition id.") args = parser.parse_args() settings = Settings(module=args.config) strategy_classpath = args.strategy if args.strategy else settings.get( 'CRAWLING_STRATEGY') if not strategy_classpath: raise ValueError( "Couldn't locate strategy class path. Please supply it either using command line option or " "settings file.") strategy_class = load_object(strategy_classpath) partition_id = args.partition_id if args.partition_id is not None else settings.get( 'SCORING_PARTITION_ID') if partition_id >= settings.get( 'SPIDER_LOG_PARTITIONS') or partition_id < 0: raise ValueError( "Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." % partition_id) settings.set('SCORING_PARTITION_ID', partition_id) logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and exists(logging_config_path): fileConfig(logging_config_path) else: logging.basicConfig(level=args.log_level) logger.setLevel(args.log_level) logger.addHandler(CONSOLE) worker = StrategyWorker(settings, strategy_class) worker.run()
def test_close_manager(self):
    """A strategy-worker manager configured with the Redis backend can be closed."""
    config = Settings(module='frontera.settings.default_settings')
    config.set('BACKEND', 'frontera.contrib.backends.redis_backend.RedisBackend')

    frontier_manager = WorkerFrontierManager.from_settings(config, strategy_worker=True)

    # The configured class path must resolve to the Redis backend class.
    backend_class = frontier_manager.backend.__class__
    self.assertEqual(RedisBackend, backend_class)

    frontier_manager.close()