def handle_distributed_coach_orchestrator(graph_manager, args):
    """Deploy a distributed Coach run (one trainer + N rollout workers) on Kubernetes.

    Builds the trainer and rollout-worker container commands from this
    process's own CLI arguments, wires up the optional memory backend and
    data store, deploys both run types, then streams trainer logs until
    interrupted and finally undeploys everything.

    Args:
        graph_manager: unused here; kept for signature compatibility with callers.
        args: parsed CLI namespace (image, num_workers, memory_backend,
            data_store, s3_* options, experiment_name, ...).

    Returns:
        None. Prints a status message and returns early on any setup or
        deployment failure.
    """
    ckpt_inside_container = "/checkpoint"

    # BUG FIX: strip any pre-existing --distributed_coach_run_type flag (and
    # its value) from the forwarded argv. Previously sys.argv[1:] was appended
    # verbatim, so a caller-supplied run-type flag would follow — and override
    # (argparse last-wins) — the one set per deployment below.
    arg_list = sys.argv[1:]
    try:
        i = arg_list.index('--distributed_coach_run_type')
        arg_list.pop(i)
        arg_list.pop(i)
    except ValueError:
        pass

    rollout_command = ['python3', 'rl_coach/coach.py',
                       '--distributed_coach_run_type', str(RunType.ROLLOUT_WORKER)] + arg_list
    trainer_command = ['python3', 'rl_coach/coach.py',
                       '--distributed_coach_run_type', str(RunType.TRAINER)] + arg_list

    # Make sure both containers share the same experiment name even when the
    # user did not pass one explicitly.
    if '--experiment_name' not in rollout_command:
        rollout_command = rollout_command + ['--experiment_name', args.experiment_name]
    if '--experiment_name' not in trainer_command:
        trainer_command = trainer_command + ['--experiment_name', args.experiment_name]

    memory_backend_params = None
    if args.memory_backend == "redispubsub":
        memory_backend_params = RedisPubSubMemoryBackendParameters()

    ds_params_instance = None
    if args.data_store == "s3":
        ds_params = DataStoreParameters("s3", "", "")
        ds_params_instance = S3DataStoreParameters(ds_params=ds_params,
                                                   end_point=args.s3_end_point,
                                                   bucket_name=args.s3_bucket_name,
                                                   creds_file=args.s3_creds_file,
                                                   checkpoint_dir=ckpt_inside_container)
    elif args.data_store == "nfs":
        ds_params = DataStoreParameters("nfs", "kubernetes", "")
        ds_params_instance = NFSDataStoreParameters(ds_params)

    worker_run_type_params = RunTypeParameters(args.image, rollout_command,
                                               run_type=str(RunType.ROLLOUT_WORKER),
                                               num_replicas=args.num_workers)
    trainer_run_type_params = RunTypeParameters(args.image, trainer_command,
                                                run_type=str(RunType.TRAINER))

    orchestration_params = KubernetesParameters([worker_run_type_params, trainer_run_type_params],
                                                kubeconfig='~/.kube/config',
                                                memory_backend_parameters=memory_backend_params,
                                                data_store_params=ds_params_instance)
    orchestrator = Kubernetes(orchestration_params)

    if not orchestrator.setup():
        print("Could not setup.")
        return

    if orchestrator.deploy_trainer():
        print("Successfully deployed trainer.")
    else:
        print("Could not deploy trainer.")
        return

    if orchestrator.deploy_worker():
        print("Successfully deployed rollout worker(s).")
    else:
        print("Could not deploy rollout worker(s).")
        return

    # Block on trainer log streaming; Ctrl-C falls through to teardown.
    try:
        orchestrator.trainer_logs()
    except KeyboardInterrupt:
        pass

    orchestrator.undeploy()
def construct_data_store_params(json: dict):
    """Build a concrete data-store parameters object from a dict of settings.

    Args:
        json: settings dict (note: parameter name shadows the stdlib module —
            kept for caller compatibility). Must contain "store_type",
            "orchestrator_type", "orchestrator_params", plus the keys
            required by the chosen backend ("checkpoint_dir" for nfs/s3,
            "end_point"/"bucket_name"/"expt_dir" for s3,
            "redis_address"/"redis_port"/"redis_channel" for redis).

    Returns:
        An NFSDataStoreParameters, S3DataStoreParameters or
        RedisDataStoreParameters instance.

    Raises:
        ValueError: if "store_type" is not one of 'nfs', 's3' or 'redis'.
    """
    ds_params_instance = None
    ds_params = DataStoreParameters(
        json["store_type"], json["orchestrator_type"], json["orchestrator_params"]
    )
    if json["store_type"] == "nfs":
        ds_params_instance = NFSDataStoreParameters(
            ds_params, checkpoint_dir=json["checkpoint_dir"]
        )
    elif json["store_type"] == "s3":
        ds_params_instance = S3DataStoreParameters(
            ds_params=ds_params,
            end_point=json["end_point"],
            bucket_name=json["bucket_name"],
            checkpoint_dir=json["checkpoint_dir"],
            expt_dir=json["expt_dir"],
        )
    elif json["store_type"] == "redis":
        ds_params_instance = RedisDataStoreParameters(
            ds_params,
            redis_address=json["redis_address"],
            redis_port=json["redis_port"],
            redis_channel=json["redis_channel"],
        )
    else:
        # BUG FIX: the original raised this message without .format(), so the
        # literal "{}" placeholder appeared in the error text.
        raise ValueError(
            "store_type {} was found, expected 'nfs', 'redis' or 's3'.".format(
                json["store_type"]
            )
        )
    return ds_params_instance
def construct_data_store_params(json: dict):
    """Create data-store parameters for an 'nfs' or 's3' backend.

    Args:
        json: settings dict with "store_type", "orchestrator_type" and
            "orchestrator_params"; for 's3' also "end_point", "bucket_name"
            and "checkpoint_dir".

    Returns:
        An NFSDataStoreParameters or S3DataStoreParameters instance, or
        None when "store_type" matches neither backend.
    """
    base_params = DataStoreParameters(json['store_type'],
                                      json['orchestrator_type'],
                                      json['orchestrator_params'])
    store_type = json['store_type']
    instance = None
    if store_type == 'nfs':
        instance = NFSDataStoreParameters(base_params)
    elif store_type == 's3':
        instance = S3DataStoreParameters(ds_params=base_params,
                                         end_point=json['end_point'],
                                         bucket_name=json['bucket_name'],
                                         checkpoint_dir=json['checkpoint_dir'])
    return instance
def handle_distributed_coach_orchestrator(args):
    """Launch a distributed Coach job (trainer + rollout workers) on Kubernetes.

    Rebuilds the trainer and rollout-worker container commands from this
    process's own argv (minus any stale run-type flag), configures the
    optional memory backend and data store, deploys both run types, then
    streams trainer logs until they finish or the user interrupts, and
    finally undeploys.

    Args:
        args: parsed CLI namespace (image, num_workers, memory_backend,
            data_store, s3_* options, experiment_name, experiment_path,
            dump_worker_logs, ...).

    Returns:
        1 on any setup/deployment failure; otherwise the trainer's exit code.
    """
    from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes, \
        RunTypeParameters

    checkpoint_dir_in_container = "/checkpoint"

    # Forward our CLI arguments, dropping any pre-existing run-type flag and
    # its value so the per-deployment flag set below is authoritative.
    forwarded_args = sys.argv[1:]
    try:
        flag_pos = forwarded_args.index('--distributed_coach_run_type')
    except ValueError:
        pass
    else:
        del forwarded_args[flag_pos:flag_pos + 2]

    launcher = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type']
    trainer_command = launcher + [str(RunType.TRAINER)] + forwarded_args
    rollout_command = launcher + [str(RunType.ROLLOUT_WORKER)] + forwarded_args

    # Both containers must agree on the experiment name even when the user
    # did not pass one explicitly.
    for command in (rollout_command, trainer_command):
        if '--experiment_name' not in command:
            command.extend(['--experiment_name', args.experiment_name])

    memory_backend_params = None
    if args.memory_backend == "redispubsub":
        memory_backend_params = RedisPubSubMemoryBackendParameters()

    data_store_params = None
    if args.data_store == "s3":
        base_params = DataStoreParameters("s3", "", "")
        data_store_params = S3DataStoreParameters(
            ds_params=base_params,
            end_point=args.s3_end_point,
            bucket_name=args.s3_bucket_name,
            creds_file=args.s3_creds_file,
            checkpoint_dir=checkpoint_dir_in_container,
            expt_dir=args.experiment_path)
    elif args.data_store == "nfs":
        base_params = DataStoreParameters("nfs", "kubernetes", "")
        data_store_params = NFSDataStoreParameters(base_params)

    run_type_params = [
        RunTypeParameters(args.image, rollout_command,
                          run_type=str(RunType.ROLLOUT_WORKER),
                          num_replicas=args.num_workers),
        RunTypeParameters(args.image, trainer_command,
                          run_type=str(RunType.TRAINER)),
    ]
    orchestrator = Kubernetes(KubernetesParameters(
        run_type_params,
        kubeconfig='~/.kube/config',
        memory_backend_parameters=memory_backend_params,
        data_store_params=data_store_params))

    if not orchestrator.setup():
        print("Could not setup.")
        return 1

    if not orchestrator.deploy_trainer():
        print("Could not deploy trainer.")
        return 1
    print("Successfully deployed trainer.")

    if not orchestrator.deploy_worker():
        print("Could not deploy rollout worker(s).")
        return 1
    print("Successfully deployed rollout worker(s).")

    if args.dump_worker_logs:
        screen.log_title("Dumping rollout worker logs in: {}".format(args.experiment_path))
        orchestrator.worker_logs(path=args.experiment_path)

    # Stream trainer logs; a Ctrl-C skips straight to teardown with the
    # default failure exit code.
    exit_code = 1
    try:
        exit_code = orchestrator.trainer_logs()
    except KeyboardInterrupt:
        pass

    orchestrator.undeploy()
    return exit_code