def setup_environment():
    """Prepare settings, strategy class and logging for the strategy worker.

    Parses the command line, loads the settings module named by ``--config``
    and resolves the strategy class path and the scoring partition id, with
    command line values taking precedence over the settings module.

    Returns:
        A ``(settings, strategy_class)`` tuple.

    Raises:
        ValueError: if no strategy class path is supplied, or if the
            partition id is negative or not below ``SPIDER_LOG_PARTITIONS``.
    """
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)

    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    # An explicit 0 on the command line is a valid partition, hence the
    # ``is not None`` test instead of a truthiness test.
    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        # fileConfig() disables every logger that already exists unless told
        # otherwise; keep loggers created at import time working.
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    return settings, strategy_class
def main():
    """Parse the command line, then create and start the server.

    ``--hostname`` and ``--port`` override the ``ZMQ_HOSTNAME`` and
    ``ZMQ_BASE_PORT`` settings when they are given.
    """
    cli = ArgumentParser(description="Crawl frontier worker.")
    option_specs = (
        (('--config',),
         dict(type=str,
              help='Settings module name, should be accessible by import.')),
        (('--hostname',),
         dict(type=str,
              help='Hostname or IP address to bind. Default is 127.0.0.1')),
        (('--log-level', '-L'),
         dict(type=str, default='INFO',
              help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')),
        (('--port',),
         dict(type=int,
              help='Base port number, server will bind to 6 ports starting from base. Default is 5550')),
    )
    for flags, kwargs in option_specs:
        cli.add_argument(*flags, **kwargs)
    options = cli.parse_args()

    config = Settings(module=options.config)
    # Fall back to the settings module only when the option is absent/empty.
    bind_host = options.hostname or config.get("ZMQ_HOSTNAME")
    base_port = options.port or config.get("ZMQ_BASE_PORT")

    broker = Server(bind_host, base_port)
    broker.logger.setLevel(options.log_level)
    broker.start()
def main():
    """Read the command line and settings, then run the broker server.

    ``--address`` and ``--port`` override the ``ZMQ_ADDRESS`` and
    ``ZMQ_BASE_PORT`` settings when they are given.
    """
    cli = ArgumentParser(description="Crawl frontier worker.")
    cli.add_argument('--config', type=str,
                     help='Settings module name, should be accessible by import.')
    cli.add_argument('--address', type=str,
                     help='Hostname, IP address or Wildcard * to bind. Default is 127.0.0.1'
                          '. When binding to wildcard it defaults to IPv4.')
    cli.add_argument('--log-level', '-L', type=str, default='INFO',
                     help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is'
                          ' INFO.')
    cli.add_argument('--port', type=int,
                     help='Base port number, server will bind to 6 ports starting from base'
                          '. Default is 5550')
    options = cli.parse_args()

    config = Settings(module=options.config)
    # Command line values win; the settings module is the fallback.
    bind_address = options.address or config.get("ZMQ_ADDRESS")
    base_port = options.port or config.get("ZMQ_BASE_PORT")

    broker = Server(bind_address, base_port)
    broker.logger.setLevel(options.log_level)
    broker.start()
def setup_environment():
    """Assemble the strategy worker configuration from CLI and settings.

    Command line values override the settings module for the strategy class
    path, the scoring partition id and the JSON-RPC port. Strategy arguments
    given as ``key=value`` are stored under ``STRATEGY_ARGS``; a key with no
    value maps to ``None``.

    Returns:
        A ``(settings, add_seeds, seeds_url)`` tuple.

    Raises:
        ValueError: if no strategy class path is supplied, or if the
            partition id is negative or not below ``SPIDER_LOG_PARTITIONS``.
    """
    cli = ArgumentParser(description="Frontera strategy worker.")
    option_specs = (
        (('--config',),
         dict(type=str, required=True,
              help='Settings module name, should be accessible by import')),
        (('--log-level', '-L'),
         dict(type=str, default='INFO',
              help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")),
        (('--strategy',),
         dict(type=str, help='Crawling strategy class path')),
        (('--partition-id',),
         dict(type=int, help="Instance partition id.")),
        (('--port',),
         dict(type=int, help="Json Rpc service port to listen.")),
        (('--args', '-a'),
         dict(nargs='*', type=str,
              help="Optional arguments for crawling strategy, "
                   "in a form of key=value separated with space")),
        (('--add-seeds',),
         dict(action='store_true',
              help="Run in add seeds mode. Worker finishes after running "
                   "of strategy add_seeds method")),
        (('--seeds-url',),
         dict(type=str,
              help="Seeds url. S3 and native urlopen schemas are currently "
                   "supported, implies add seeds run mode")),
    )
    for flags, kwargs in option_specs:
        cli.add_argument(*flags, **kwargs)
    options = cli.parse_args()
    settings = Settings(module=options.config)

    classpath = options.strategy or settings.get('STRATEGY')
    if not classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    settings.set('STRATEGY', classpath)

    # 0 is a legal partition id, so test against None rather than truthiness.
    if options.partition_id is not None:
        partition_id = options.partition_id
    else:
        partition_id = settings.get('SCORING_PARTITION_ID')
    partition_count = settings.get('SPIDER_LOG_PARTITIONS')
    if partition_id >= partition_count or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS."
                         % partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    if options.port:
        settings.set('JSONRPC_PORT', options.port)

    # "key=value" -> {key: value}; a bare "key" or "key=" maps to None.
    raw_pairs = (item.partition("=") for item in (options.args or ()))
    strategy_args = {name: (text or None) for name, _, text in raw_pairs}
    settings.set("STRATEGY_ARGS", strategy_args)

    config_path = settings.get("LOGGING_CONFIG")
    if not (config_path and exists(config_path)):
        logging.basicConfig(level=options.log_level)
        logger.setLevel(options.log_level)
        logger.addHandler(CONSOLE)
    else:
        fileConfig(config_path, disable_existing_loggers=False)
    return settings, options.add_seeds, options.seeds_url
def __init__(self, manager):
    """Wire this object to the message bus described by the manager settings.

    Copies the manager's settings attributes, instantiates the class named by
    ``MESSAGE_BUS``, builds the encoder/decoder from the manager's request and
    response models, opens a spider log producer and a spider feed consumer
    for ``SPIDER_PARTITION_ID``, and creates the overused-requests buffer.
    """
    self._manager = manager
    conf = Settings(attributes=manager.settings.attributes)

    bus_class = load_object(conf.get('MESSAGE_BUS'))
    self.mb = bus_class(conf)

    self._encoder = Encoder(manager.request_model,
                            send_body=conf.get('STORE_CONTENT'))
    self._decoder = Decoder(manager.request_model, manager.response_model)

    self.spider_log_producer = self.mb.spider_log().producer()
    feed_stream = self.mb.spider_feed()
    self.partition_id = conf.get('SPIDER_PARTITION_ID')
    self.consumer = feed_stream.consumer(partition_id=self.partition_id)

    self._get_timeout = float(conf.get('KAFKA_GET_TIMEOUT'))
    self._buffer = OverusedBuffer(self._get_next_requests,
                                  manager.logger.manager.debug)
def main():
    """Start the server using command line options backed by settings.

    ``--hostname`` and ``--port`` take precedence over the ``ZMQ_HOSTNAME``
    and ``ZMQ_BASE_PORT`` settings.
    """
    arg_parser = ArgumentParser(description="Crawl frontier worker.")
    arg_parser.add_argument(
        '--config', type=str,
        help='Settings module name, should be accessible by import.')
    arg_parser.add_argument(
        '--hostname', type=str,
        help='Hostname or IP address to bind. Default is 127.0.0.1')
    arg_parser.add_argument(
        '--log-level', '-L', type=str, default='INFO',
        help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')
    arg_parser.add_argument(
        '--port', type=int,
        help='Base port number, server will bind to 6 ports starting from base. Default is 5550')
    parsed = arg_parser.parse_args()

    conf = Settings(module=parsed.config)
    # Only consult the settings module when the option was not supplied.
    host = parsed.hostname or conf.get("ZMQ_HOSTNAME")
    first_port = parsed.port or conf.get("ZMQ_BASE_PORT")

    zmq_server = Server(host, first_port)
    zmq_server.logger.setLevel(parsed.log_level)
    zmq_server.start()
self.states.update_cache(request) if __name__ == '__main__': parser = ArgumentParser(description="Frontera strategy worker.") parser.add_argument('--config', type=str, required=True, help='Settings module name, should be accessible by import') parser.add_argument('--log-level', '-L', type=str, default='INFO', help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL") parser.add_argument('--strategy', type=str, help='Crawling strategy class path') parser.add_argument('--partition-id', type=int, help="Instance partition id.") args = parser.parse_args() settings = Settings(module=args.config) strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY') if not strategy_classpath: raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or " "settings file.") strategy_class = load_object(strategy_classpath) partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID') if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0: raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." % partition_id) settings.set('SCORING_PARTITION_ID', partition_id) logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and exists(logging_config_path): fileConfig(logging_config_path) else:
help='Settings module name, should be accessible by import') parser.add_argument( '--log-level', '-L', type=str, default='INFO', help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL") parser.add_argument('--strategy', type=str, help='Crawling strategy class path') parser.add_argument('--partition-id', type=int, help="Instance partition id.") args = parser.parse_args() settings = Settings(module=args.config) strategy_classpath = args.strategy if args.strategy else settings.get( 'CRAWLING_STRATEGY') if not strategy_classpath: raise ValueError( "Couldn't locate strategy class path. Please supply it either using command line option or " "settings file.") strategy_class = load_object(strategy_classpath) partition_id = args.partition_id if args.partition_id is not None else settings.get( 'SCORING_PARTITION_ID') if partition_id >= settings.get( 'SPIDER_LOG_PARTITIONS') or partition_id < 0: raise ValueError( "Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." % partition_id) settings.set('SCORING_PARTITION_ID', partition_id)
def setup_environment():
    """Parse the command line and prepare settings for the strategy worker.

    The strategy class path, scoring partition id and JSON-RPC port from the
    command line override the values in the settings module. ``key=value``
    strategy arguments are collected into ``STRATEGY_ARGS``, with ``None``
    for keys that carry no value.

    Returns:
        A ``(settings, add_seeds, seeds_url)`` tuple.

    Raises:
        ValueError: if no strategy class path is supplied, or if the
            partition id is negative or not below ``SPIDER_LOG_PARTITIONS``.
    """
    arg_parser = ArgumentParser(description="Frontera strategy worker.")
    arg_parser.add_argument('--config', type=str, required=True,
                            help='Settings module name, should be accessible by import')
    arg_parser.add_argument('--log-level', '-L', type=str, default='INFO',
                            help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    arg_parser.add_argument('--strategy', type=str,
                            help='Crawling strategy class path')
    arg_parser.add_argument('--partition-id', type=int,
                            help="Instance partition id.")
    arg_parser.add_argument('--port', type=int,
                            help="Json Rpc service port to listen.")
    arg_parser.add_argument('--args', '-a', nargs='*', type=str,
                            help="Optional arguments for crawling strategy, "
                                 "in a form of key=value separated with space")
    arg_parser.add_argument('--add-seeds', action='store_true',
                            help="Run in add seeds mode. Worker finishes after running "
                                 "of strategy add_seeds method")
    arg_parser.add_argument('--seeds-url', type=str,
                            help="Seeds url. S3 and native urlopen schemas are currently "
                                 "supported, implies add seeds run mode")
    parsed = arg_parser.parse_args()
    settings = Settings(module=parsed.config)

    strategy_path = parsed.strategy or settings.get('STRATEGY')
    if not strategy_path:
        raise ValueError(
            "Couldn't locate strategy class path. Please supply it either using command line option or "
            "settings file.")
    settings.set('STRATEGY', strategy_path)

    # Partition 0 is valid, so only fall back when the option is missing.
    partition_id = parsed.partition_id
    if partition_id is None:
        partition_id = settings.get('SCORING_PARTITION_ID')
    total_partitions = settings.get('SPIDER_LOG_PARTITIONS')
    if partition_id >= total_partitions or partition_id < 0:
        raise ValueError(
            "Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS."
            % partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    if parsed.port:
        settings.set('JSONRPC_PORT', parsed.port)

    strategy_args = {}
    for token in parsed.args or ():
        name, _, text = token.partition("=")
        # An empty value ("key" or "key=") is recorded as None.
        strategy_args[name] = text or None
    settings.set("STRATEGY_ARGS", strategy_args)

    log_config = settings.get("LOGGING_CONFIG")
    has_log_config = log_config and exists(log_config)
    if has_log_config:
        fileConfig(log_config, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=parsed.log_level)
        logger.setLevel(parsed.log_level)
        logger.addHandler(CONSOLE)
    return settings, parsed.add_seeds, parsed.seeds_url
parser.add_argument( '--log-level', '-L', type=str, default='INFO', help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL.") parser.add_argument('--port', type=int, help="Json Rpc service port to listen.") args = parser.parse_args() settings = Settings(module=args.config) if args.port: settings.set("JSONRPC_PORT", [args.port]) logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and os.path.exists(logging_config_path): fileConfig(logging_config_path, disable_existing_loggers=False) else: logging.basicConfig(level=args.log_level) logger.setLevel(args.log_level) logger.addHandler(CONSOLE) worker = DBWorker(settings, args.no_batches, args.no_incoming, args.no_scoring, partitions=args.partitions) server = WorkerJsonRpcService(worker, settings) server.start_listening() worker.run()
def run_add_seeds(settings, seeds_file):
    """Feed the seeds from a local file to a local frontier manager.

    Args:
        settings: settings object used to build the ``LocalFrontierManager``.
        seeds_file: path of the seeds file; it is opened in binary mode.

    The file is closed when the function exits, including when the manager
    raises.
    """
    # The context manager guarantees the handle is released; the manager is
    # stopped and closed while the file is still open, as before.
    with open(seeds_file, "rb") as fh:
        logger.info("Starting local seeds addition from file %s", seeds_file)
        manager = LocalFrontierManager.from_settings(settings)
        manager.add_seeds(fh)
        manager.stop()
        manager.close()
    logger.info("Seeds addition finished")


if __name__ == '__main__':
    # Script entry point: parse options, configure logging, add the seeds.
    parser = ArgumentParser(description="Frontera local add seeds utility")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--seeds-file', type=str, required=True,
                        help="Seeds file path")
    args = parser.parse_args()
    settings = Settings(module=args.config)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)

    run_add_seeds(settings, args.seeds_file)
if __name__ == '__main__':
    # Script entry point for the strategy worker: read the options, resolve
    # the strategy class (command line first, settings second) and run.
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    args = parser.parse_args()

    logger.setLevel(args.log_level)
    logger.addHandler(CONSOLE)

    settings = Settings(module=args.config)
    strategy_classpath = args.strategy or settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    worker = StrategyWorker(settings, strategy_class)
    worker.run()
def test_fallsback_to_frontera_default_settings():
    """A Settings object built with no arguments reports MAX_NEXT_REQUESTS as 64."""
    bare_settings = Settings()
    max_next_requests = bare_settings.get('MAX_NEXT_REQUESTS')
    assert max_next_requests == 64
def test_settings_on_a_python_module_are_loaded():
    """Values defined in the named settings module are readable via get()."""
    module_path = 'tests.scrapy_spider.frontera.settings'
    loaded = Settings(module_path)
    max_requests = loaded.get('MAX_REQUESTS')
    assert max_requests == 5
self.strategy.page_error(request, error) self.states.update_cache(request) if __name__ == '__main__': parser = ArgumentParser(description="Frontera strategy worker.") parser.add_argument('--config', type=str, required=True, help='Settings module name, should be accessible by import') parser.add_argument('--log-level', '-L', type=str, default='INFO', help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL") parser.add_argument('--strategy', type=str, help='Crawling strategy class path') args = parser.parse_args() settings = Settings(module=args.config) strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY') if not strategy_classpath: raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or " "settings file.") strategy_class = load_object(strategy_classpath) logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and exists(logging_config_path): fileConfig(logging_config_path) else: logging.basicConfig(level=args.log_level) logger.setLevel(args.log_level) logger.addHandler(CONSOLE) worker = StrategyWorker(settings, strategy_class) worker.run()
def test_settings_passed_as_attributes_can_be_found():
    """A value passed through ``attributes`` is returned by get()."""
    overrides = {'SETTING': 'value'}
    configured = Settings(attributes=overrides)
    found = configured.get('SETTING')
    assert found == 'value'
type=str, required=True, help='Settings module name, should be accessible by import') parser.add_argument( '--log-level', '-L', type=str, default='INFO', help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL") parser.add_argument('--strategy', type=str, help='Crawling strategy class path') args = parser.parse_args() settings = Settings(module=args.config) strategy_classpath = args.strategy if args.strategy else settings.get( 'CRAWLING_STRATEGY') if not strategy_classpath: raise ValueError( "Couldn't locate strategy class path. Please supply it either using command line option or " "settings file.") strategy_class = load_object(strategy_classpath) logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and exists(logging_config_path): fileConfig(logging_config_path) else: logging.basicConfig(level=args.log_level) logger.setLevel(args.log_level) logger.addHandler(CONSOLE) worker = StrategyWorker(settings, strategy_class) worker.run()
self.states.update_cache(links) self.states.update_cache(response) def on_request_error(self, request, error): logger.debug("Page error %s (%s)", request.url, error) self.states.set_states(request) self.strategy.page_error(request, error) self.states.update_cache(request) if __name__ == '__main__': parser = ArgumentParser(description="Frontera strategy worker.") parser.add_argument('--config', type=str, required=True, help='Settings module name, should be accessible by import') parser.add_argument('--log-level', '-L', type=str, default='INFO', help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL") parser.add_argument('--strategy', type=str, help='Crawling strategy class path') args = parser.parse_args() logger.setLevel(args.log_level) logger.addHandler(CONSOLE) settings = Settings(module=args.config) strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY') if not strategy_classpath: raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or " "settings file.") strategy_class = load_object(strategy_classpath) worker = StrategyWorker(settings, strategy_class) worker.run()
def test_settings_on_a_python_module_are_loaded():
    """Values defined in the named settings module are readable via get()."""
    module_path = 'frontera.tests.scrapy_spider.frontera.settings'
    loaded = Settings(module_path)
    max_requests = loaded.get('MAX_REQUESTS')
    assert max_requests == 5
def test_fallsback_to_frontera_default_settings():
    """A Settings object built with no arguments reports MAX_NEXT_REQUESTS as 0."""
    bare_settings = Settings()
    max_next_requests = bare_settings.get('MAX_NEXT_REQUESTS')
    assert max_next_requests == 0