def construct_memory_params(json: dict):
    """Build memory-backend parameters from a JSON configuration dict.

    Recognized store type is 'redispubsub'; for it, a
    RedisPubSubMemoryBackendParameters is built from the dict's
    'redis_address', 'redis_port', optional 'channel' (defaults to ''),
    and 'run_type' entries. Any other store type yields None.
    """
    if json['store_type'] != 'redispubsub':
        return None

    return RedisPubSubMemoryBackendParameters(
        json['redis_address'],
        json['redis_port'],
        channel=json.get('channel', ''),
        run_type=json['run_type'],
    )
def handle_distributed_coach_orchestrator(graph_manager, args):
    """Deploy a distributed Coach run (one trainer + N rollout workers) on Kubernetes.

    Builds the trainer/worker launch commands from this process's argv,
    configures the memory backend and data store from ``args``, deploys both
    run types, streams trainer logs until the trainer exits or the user
    interrupts, then undeploys everything.

    ``graph_manager`` is accepted for interface compatibility but is not used
    here; the deployed containers construct their own graphs.
    """
    ckpt_inside_container = "/checkpoint"

    # Forward our own CLI args to the containers, but strip any existing
    # --distributed_coach_run_type (flag + value) first. argparse keeps the
    # LAST occurrence of a repeated option, so leaving the caller's value in
    # the forwarded argv would override the run type we prepend below.
    arg_list = sys.argv[1:]
    try:
        i = arg_list.index('--distributed_coach_run_type')
        arg_list.pop(i)
        arg_list.pop(i)  # the flag's value
    except ValueError:
        pass

    rollout_command = ['python3', 'rl_coach/coach.py',
                       '--distributed_coach_run_type', str(RunType.ROLLOUT_WORKER)] + arg_list
    trainer_command = ['python3', 'rl_coach/coach.py',
                       '--distributed_coach_run_type', str(RunType.TRAINER)] + arg_list

    # Make sure both run types share the same experiment name.
    if '--experiment_name' not in rollout_command:
        rollout_command = rollout_command + ['--experiment_name', args.experiment_name]
    if '--experiment_name' not in trainer_command:
        trainer_command = trainer_command + ['--experiment_name', args.experiment_name]

    memory_backend_params = None
    if args.memory_backend == "redispubsub":
        memory_backend_params = RedisPubSubMemoryBackendParameters()

    ds_params_instance = None
    if args.data_store == "s3":
        ds_params = DataStoreParameters("s3", "", "")
        ds_params_instance = S3DataStoreParameters(ds_params=ds_params,
                                                   end_point=args.s3_end_point,
                                                   bucket_name=args.s3_bucket_name,
                                                   creds_file=args.s3_creds_file,
                                                   checkpoint_dir=ckpt_inside_container)
    elif args.data_store == "nfs":
        ds_params = DataStoreParameters("nfs", "kubernetes", "")
        ds_params_instance = NFSDataStoreParameters(ds_params)

    worker_run_type_params = RunTypeParameters(args.image, rollout_command,
                                               run_type=str(RunType.ROLLOUT_WORKER),
                                               num_replicas=args.num_workers)
    trainer_run_type_params = RunTypeParameters(args.image, trainer_command,
                                                run_type=str(RunType.TRAINER))

    orchestration_params = KubernetesParameters(
        [worker_run_type_params, trainer_run_type_params],
        kubeconfig='~/.kube/config',
        memory_backend_parameters=memory_backend_params,
        data_store_params=ds_params_instance)
    orchestrator = Kubernetes(orchestration_params)

    if not orchestrator.setup():
        print("Could not setup.")
        return

    if orchestrator.deploy_trainer():
        print("Successfully deployed trainer.")
    else:
        print("Could not deploy trainer.")
        return

    if orchestrator.deploy_worker():
        print("Successfully deployed rollout worker(s).")
    else:
        print("Could not deploy rollout worker(s).")
        return

    # Stream trainer logs until the trainer finishes or the user interrupts,
    # then always tear the Kubernetes deployment down.
    try:
        orchestrator.trainer_logs()
    except KeyboardInterrupt:
        pass

    orchestrator.undeploy()
def main():
    """Entry point for the SageMaker training worker.

    Parses CLI/environment configuration, pulls optional custom environment
    and preset files from S3, builds a graph manager, wires up the Redis
    pub/sub memory backend and the S3 data store, then starts training.
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk', '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('-ek', '--environment_s3_key',
                        help="(string) Name of an environment file to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument('-c', '--checkpoint-dir',
                        help='(string) Path to a folder containing a checkpoint to write the model to.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--pretrained-checkpoint-dir',
                        help='(string) Path to a folder for downloading a pre-trained model',
                        type=str,
                        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    # parse_known_args: SageMaker may pass extra flags this script ignores.
    args, unknown = parser.parse_known_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)

    # Load the model metadata, mirror it back to S3 under this job's prefix
    # and into the SageMaker model output directory.
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH, 'model_metadata.json')
    load_model_metadata(s3_client, args.model_metadata_s3_key, model_metadata_local_path)
    s3_client.upload_file(
        os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix),
        model_metadata_local_path)
    shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

    # Optionally fetch a customer-supplied environment file; on success it is
    # re-uploaded under this job's prefix so rollout workers can find it.
    success_custom_environment = False
    if args.environment_s3_key:
        environment_local_path = "./markov/environments/deepracer_racetrack_env.py"
        success_custom_environment = s3_client.download_file(
            s3_key=args.environment_s3_key,
            local_path=environment_local_path)
        if not success_custom_environment:
            print(
                "Could not download the environment file. Using the default DeepRacer environment."
            )
        else:
            success_custom_environment = s3_client.upload_file(
                s3_key=os.path.normpath(
                    "%s/environments/deepracer_racetrack_env.py" % args.s3_prefix),
                local_path=environment_local_path)
            if success_custom_environment:
                print("Using environment: %s" % args.environment_s3_key)

    # Import to register the environment with Gym
    import markov.environments

    # Optionally fetch a customer-supplied preset; it supplies the graph manager.
    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        success_custom_preset = s3_client.download_file(
            s3_key=args.preset_s3_key,
            local_path=preset_local_path)
        if not success_custom_preset:
            print(
                "Could not download the preset file. Using the default DeepRacer preset."
            )
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location,
                                                 ignore_module_case=True)
            success_custom_preset = s3_client.upload_file(
                s3_key=os.path.normpath("%s/presets/preset.py" % args.s3_prefix),
                local_path=preset_local_path)
            if success_custom_preset:
                print("Using preset: %s" % args.preset_s3_key)

    # Fall back to the default graph manager, configured from the SageMaker
    # hyperparameters blob (SM_TRAINING_ENV) when present.
    if not success_custom_preset:
        from markov.sagemaker_graph_manager import get_graph_manager
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}
        graph_manager, robomaker_hyperparams_json = get_graph_manager(
            **sm_hyperparams_dict)
        s3_client.upload_hyperparameters(robomaker_hyperparams_json)
        print("Uploaded hyperparameters.json to S3")

    # Publish this host's IP so rollout workers can reach the Redis server here.
    host_ip_address = get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    print("Uploaded IP address information to S3: %s" % host_ip_address)

    # Optionally seed training from a pre-trained model stored in S3.
    use_pretrained_model = False
    if args.pretrained_s3_bucket and args.pretrained_s3_prefix:
        s3_client_pretrained = SageS3Client(
            bucket=args.pretrained_s3_bucket,
            s3_prefix=args.pretrained_s3_prefix,
            aws_region=args.aws_region)
        s3_client_pretrained.download_model(args.pretrained_checkpoint_dir)
        use_pretrained_model = True

    # Experience arrives over Redis pub/sub; checkpoints flow out via S3.
    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type='trainer',
        channel=args.s3_prefix)
    graph_manager.agent_params.memory.register_var('memory_backend_params',
                                                   memory_backend_params)

    ds_params_instance = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        aws_region=args.aws_region,
        s3_folder=args.s3_prefix)
    graph_manager.data_store_params = ds_params_instance
    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    training_worker(graph_manager=graph_manager,
                    checkpoint_dir=args.checkpoint_dir,
                    use_pretrained_model=use_pretrained_model,
                    framework=args.framework)
def handle_distributed_coach_orchestrator(args):
    """Deploy a distributed Coach run (one trainer + N rollout workers) on Kubernetes.

    Builds the trainer/worker launch commands from this process's argv,
    configures the memory backend and data store from ``args``, deploys both
    run types, streams trainer logs until exit, then undeploys everything.

    Returns the trainer's exit code, or 1 on any setup/deployment failure.
    """
    # Imported lazily so the kubernetes dependency is only required when
    # orchestration is actually requested.
    from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes, \
        RunTypeParameters

    ckpt_inside_container = "/checkpoint"

    # Forward our own CLI args to the containers, but strip any existing
    # --distributed_coach_run_type (flag + value) first: each command injects
    # its own run type below, and argparse would let a duplicate later flag
    # override it.
    arg_list = sys.argv[1:]
    try:
        i = arg_list.index('--distributed_coach_run_type')
        arg_list.pop(i)
        arg_list.pop(i)  # the flag's value
    except ValueError:
        pass

    trainer_command = [
        'python3', 'rl_coach/coach.py', '--distributed_coach_run_type',
        str(RunType.TRAINER)
    ] + arg_list
    rollout_command = [
        'python3', 'rl_coach/coach.py', '--distributed_coach_run_type',
        str(RunType.ROLLOUT_WORKER)
    ] + arg_list

    # Make sure both run types share the same experiment name.
    if '--experiment_name' not in rollout_command:
        rollout_command = rollout_command + [
            '--experiment_name', args.experiment_name
        ]
    if '--experiment_name' not in trainer_command:
        trainer_command = trainer_command + [
            '--experiment_name', args.experiment_name
        ]

    memory_backend_params = None
    if args.memory_backend == "redispubsub":
        memory_backend_params = RedisPubSubMemoryBackendParameters()

    ds_params_instance = None
    if args.data_store == "s3":
        ds_params = DataStoreParameters("s3", "", "")
        ds_params_instance = S3DataStoreParameters(
            ds_params=ds_params,
            end_point=args.s3_end_point,
            bucket_name=args.s3_bucket_name,
            creds_file=args.s3_creds_file,
            checkpoint_dir=ckpt_inside_container,
            expt_dir=args.experiment_path)
    elif args.data_store == "nfs":
        ds_params = DataStoreParameters("nfs", "kubernetes", "")
        ds_params_instance = NFSDataStoreParameters(ds_params)

    worker_run_type_params = RunTypeParameters(args.image,
                                               rollout_command,
                                               run_type=str(
                                                   RunType.ROLLOUT_WORKER),
                                               num_replicas=args.num_workers)
    trainer_run_type_params = RunTypeParameters(args.image,
                                                trainer_command,
                                                run_type=str(RunType.TRAINER))

    orchestration_params = KubernetesParameters(
        [worker_run_type_params, trainer_run_type_params],
        kubeconfig='~/.kube/config',
        memory_backend_parameters=memory_backend_params,
        data_store_params=ds_params_instance)
    orchestrator = Kubernetes(orchestration_params)

    if not orchestrator.setup():
        print("Could not setup.")
        return 1

    if orchestrator.deploy_trainer():
        print("Successfully deployed trainer.")
    else:
        print("Could not deploy trainer.")
        return 1

    if orchestrator.deploy_worker():
        print("Successfully deployed rollout worker(s).")
    else:
        print("Could not deploy rollout worker(s).")
        return 1

    if args.dump_worker_logs:
        screen.log_title("Dumping rollout worker logs in: {}".format(
            args.experiment_path))
        orchestrator.worker_logs(path=args.experiment_path)

    # Block on trainer logs until the trainer exits (or the user interrupts),
    # then always tear the Kubernetes deployment down.
    exit_code = 1
    try:
        exit_code = orchestrator.trainer_logs()
    except KeyboardInterrupt:
        pass

    orchestrator.undeploy()
    return exit_code
def main():
    """Entry point for the SageMaker training worker (multi-agent variant).

    Starts a local Redis server, loads model metadata and an optional custom
    preset from S3, builds the graph manager from training agents, handles
    pre-trained-model backward compatibility, then launches training.
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk', '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('-ek', '--environment_s3_key',
                        help="(string) Name of an environment file to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument('-c', '--checkpoint-dir',
                        help='(string) Path to a folder containing a checkpoint to write the model to.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--pretrained-checkpoint-dir',
                        help='(string) Path to a folder for downloading a pre-trained model',
                        type=str,
                        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    # The trainer hosts the Redis server that rollout workers publish to.
    start_redis_server()

    args, _ = parser.parse_known_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)

    # Load the model metadata, mirror it back to S3 under this job's prefix
    # and into the SageMaker model output directory.
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH, 'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key,
                              model_metadata_local_path)
    s3_client.upload_file(
        os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix),
        model_metadata_local_path)
    shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

    # Optionally fetch a customer-supplied preset; it supplies the graph manager.
    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        success_custom_preset = s3_client.download_file(s3_key=args.preset_s3_key,
                                                        local_path=preset_local_path)
        if not success_custom_preset:
            logger.info("Could not download the preset file. Using the default DeepRacer preset.")
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location,
                                                 ignore_module_case=True)
            success_custom_preset = s3_client.upload_file(
                s3_key=os.path.normpath("%s/presets/preset.py" % args.s3_prefix),
                local_path=preset_local_path)
            if success_custom_preset:
                logger.info("Using preset: %s" % args.preset_s3_key)

    # Fall back to the default graph manager, built from the SageMaker
    # hyperparameters blob and the training agents.
    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}
        # TODO: each agent should have its own config.
        # NOTE(review): the key 'car_ctrl_cnfig' looks like a typo of
        # 'car_ctrl_config' — confirm against what create_training_agent reads
        # before renaming (it is a runtime dict key).
        agent_config = {'model_metadata': model_metadata_local_path,
                        'car_ctrl_cnfig': {ConfigParams.LINK_NAME_LIST.value: [],
                                           ConfigParams.VELOCITY_LIST.value: {},
                                           ConfigParams.STEERING_LIST.value: {},
                                           ConfigParams.CHANGE_START.value: None,
                                           ConfigParams.ALT_DIR.value: None,
                                           ConfigParams.ACTION_SPACE_PATH.value: 'custom_files/model_metadata.json',
                                           ConfigParams.REWARD.value: None,
                                           ConfigParams.AGENT_NAME.value: 'racecar'}}
        agent_list = list()
        agent_list.append(create_training_agent(agent_config))
        graph_manager, robomaker_hyperparams_json = get_graph_manager(sm_hyperparams_dict,
                                                                      agent_list)
        s3_client.upload_hyperparameters(robomaker_hyperparams_json)
        logger.info("Uploaded hyperparameters.json to S3")

    # Publish this host's IP so rollout workers can reach the Redis server here.
    host_ip_address = utils.get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    logger.info("Uploaded IP address information to S3: %s" % host_ip_address)

    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    if use_pretrained_model:
        # Handle backward compatibility
        _, _, version = parse_model_metadata(model_metadata_local_path)
        if float(version) < float(utils.SIMAPP_VERSION) and \
                not utils.has_current_ckpnt_name(args.pretrained_s3_bucket,
                                                 args.pretrained_s3_prefix,
                                                 args.aws_region):
            utils.make_compatible(args.pretrained_s3_bucket,
                                  args.pretrained_s3_prefix,
                                  args.aws_region,
                                  SyncFiles.TRAINER_READY.value)
        ds_params_instance_pretrained = S3BotoDataStoreParameters(aws_region=args.aws_region,
                                                                  bucket_name=args.pretrained_s3_bucket,
                                                                  checkpoint_dir=args.pretrained_checkpoint_dir,
                                                                  s3_folder=args.pretrained_s3_prefix)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained)
        data_store_pretrained.load_from_store()

    # Experience arrives over Redis pub/sub; checkpoints flow out via S3.
    memory_backend_params = RedisPubSubMemoryBackendParameters(redis_address="localhost",
                                                               redis_port=6379,
                                                               run_type=str(RunType.TRAINER),
                                                               channel=args.s3_prefix)
    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(aws_region=args.aws_region,
                                                   bucket_name=args.s3_bucket,
                                                   checkpoint_dir=args.checkpoint_dir,
                                                   s3_folder=args.s3_prefix)
    graph_manager.data_store_params = ds_params_instance
    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    # NOTE(review): robomaker_hyperparams_json is only bound in the
    # default-preset branch above; the custom-preset path would hit a
    # NameError here — confirm whether custom presets are expected to
    # provide hyperparameters some other way.
    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=json.loads(robomaker_hyperparams_json)["batch_size"],
        user_episode_per_rollout=json.loads(robomaker_hyperparams_json)["num_episodes_between_training"]
    )
def main():
    """Entry point for the SageMaker training worker (guarded variant).

    Everything runs under one try/except: any failure is logged through the
    SIMAPP error-reporting helpers and the process exits gracefully.
    """
    screen.set_use_colors(False)
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-pk', '--preset_s3_key',
                            help="(string) Name of a preset to download from S3",
                            type=str,
                            required=False)
        parser.add_argument('-ek', '--environment_s3_key',
                            help="(string) Name of an environment file to download from S3",
                            type=str,
                            required=False)
        parser.add_argument('--model_metadata_s3_key',
                            help="(string) Model Metadata File S3 Key",
                            type=str,
                            required=False)
        parser.add_argument('-c', '--checkpoint-dir',
                            help='(string) Path to a folder containing a checkpoint to write the model to.',
                            type=str,
                            default='./checkpoint')
        parser.add_argument('--pretrained-checkpoint-dir',
                            help='(string) Path to a folder for downloading a pre-trained model',
                            type=str,
                            default=PRETRAINED_MODEL_DIR)
        parser.add_argument('--s3_bucket',
                            help='(string) S3 bucket',
                            type=str,
                            default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
        parser.add_argument('--s3_prefix',
                            help='(string) S3 prefix',
                            type=str,
                            default='sagemaker')
        parser.add_argument('--framework',
                            help='(string) tensorflow or mxnet',
                            type=str,
                            default='tensorflow')
        parser.add_argument('--pretrained_s3_bucket',
                            help='(string) S3 bucket for pre-trained model',
                            type=str)
        parser.add_argument('--pretrained_s3_prefix',
                            help='(string) S3 prefix for pre-trained model',
                            type=str,
                            default='sagemaker')
        parser.add_argument('--aws_region',
                            help='(string) AWS region',
                            type=str,
                            default=os.environ.get("AWS_REGION", "us-east-1"))

        args, unknown = parser.parse_known_args()

        # The trainer hosts the Redis server that rollout workers publish to.
        start_redis_server()

        s3_client = SageS3Client(bucket=args.s3_bucket,
                                 s3_prefix=args.s3_prefix,
                                 aws_region=args.aws_region)

        # Load the model metadata, mirror it back to S3 under this job's
        # prefix and into the SageMaker model output directory.
        model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH, 'model_metadata.json')
        load_model_metadata(s3_client, args.model_metadata_s3_key,
                            model_metadata_local_path)
        s3_client.upload_file(
            os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix),
            model_metadata_local_path)
        shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

        # Register the gym enviroment, this will give clients the ability to
        # creat the enviroment object
        register(id=defaults.ENV_ID,
                 entry_point=defaults.ENTRY_POINT,
                 max_episode_steps=defaults.MAX_STEPS,
                 reward_threshold=defaults.THRESHOLD)

        user_batch_size, user_episode_per_rollout = None, None

        # Optionally fetch a customer-supplied preset; it supplies the graph
        # manager plus the batch size / rollout length overrides.
        success_custom_preset = False
        if args.preset_s3_key:
            preset_local_path = "./markov/presets/preset.py"
            success_custom_preset = s3_client.download_file(
                s3_key=args.preset_s3_key, local_path=preset_local_path)
            if not success_custom_preset:
                logger.info(
                    "Could not download the preset file. Using the default DeepRacer preset."
                )
            else:
                preset_location = "markov.presets.preset:graph_manager"
                graph_manager = short_dynamic_import(preset_location,
                                                     ignore_module_case=True)
                success_custom_preset = s3_client.upload_file(
                    s3_key=os.path.normpath("%s/presets/preset.py" % args.s3_prefix),
                    local_path=preset_local_path)
                if success_custom_preset:
                    agent_param_loc = "markov.presets.preset:agent_params"
                    agent_params = short_dynamic_import(agent_param_loc,
                                                        ignore_module_case=True)
                    user_batch_size = agent_params.network_wrappers['main'].batch_size
                    user_episode_per_rollout = agent_params.algorithm.num_consecutive_playing_steps.num_steps
                    logger.info("Using preset: %s" % args.preset_s3_key)

        # Fall back to the default graph manager, configured from the
        # SageMaker hyperparameters blob (SM_TRAINING_ENV) when present.
        if not success_custom_preset:
            from markov.sagemaker_graph_manager import get_graph_manager
            params_blob = os.environ.get('SM_TRAINING_ENV', '')
            if params_blob:
                params = json.loads(params_blob)
                sm_hyperparams_dict = params["hyperparameters"]
            else:
                sm_hyperparams_dict = {}
            graph_manager, robomaker_hyperparams_json = get_graph_manager(
                **sm_hyperparams_dict)
            # BUG FIX: these two reads originally preceded the
            # get_graph_manager() call that binds robomaker_hyperparams_json
            # (guaranteed NameError, silently swallowed by the except below),
            # and the batch-size line carried a stray trailing comma that
            # produced a 1-tuple instead of an int.
            user_batch_size = json.loads(robomaker_hyperparams_json)["batch_size"]
            user_episode_per_rollout = json.loads(
                robomaker_hyperparams_json)["num_episodes_between_training"]
            s3_client.upload_hyperparameters(robomaker_hyperparams_json)
            logger.info("Uploaded hyperparameters.json to S3")

        # Publish this host's IP so rollout workers can reach the Redis
        # server here.
        host_ip_address = get_ip_from_host()
        s3_client.write_ip_config(host_ip_address)
        logger.info("Uploaded IP address information to S3: %s" % host_ip_address)

        # Optionally seed training from a pre-trained model stored in S3.
        use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
        if use_pretrained_model:
            s3_client_pretrained = SageS3Client(
                bucket=args.pretrained_s3_bucket,
                s3_prefix=args.pretrained_s3_prefix,
                aws_region=args.aws_region)
            s3_client_pretrained.download_model(args.pretrained_checkpoint_dir)

        # Experience arrives over Redis pub/sub; checkpoints flow out via S3.
        memory_backend_params = RedisPubSubMemoryBackendParameters(
            redis_address="localhost",
            redis_port=6379,
            run_type='trainer',
            channel=args.s3_prefix)

        ds_params_instance = S3BotoDataStoreParameters(
            bucket_name=args.s3_bucket,
            checkpoint_dir=args.checkpoint_dir,
            aws_region=args.aws_region,
            s3_folder=args.s3_prefix)
        graph_manager.data_store_params = ds_params_instance
        data_store = S3BotoDataStore(ds_params_instance)
        data_store.graph_manager = graph_manager
        graph_manager.data_store = data_store

        training_worker(graph_manager=graph_manager,
                        checkpoint_dir=args.checkpoint_dir,
                        use_pretrained_model=use_pretrained_model,
                        framework=args.framework,
                        memory_backend_params=memory_backend_params,
                        user_batch_size=user_batch_size,
                        user_episode_per_rollout=user_episode_per_rollout)
    except Exception as ex:
        utils.json_format_logger(
            "Training worker exited with exception: {}".format(ex),
            **utils.build_system_error_dict(
                utils.SIMAPP_TRAINING_WORKER_EXCEPTION,
                utils.SIMAPP_EVENT_ERROR_CODE_500))
        utils.simapp_exit_gracefully()
def main():
    """Entry point for the RoboMaker rollout worker.

    Downloads model metadata, hyperparameters, and optional custom
    preset/environment files, builds a graph manager, connects the Redis
    pub/sub memory backend to the trainer's published IP, and starts rolling
    out episodes.
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--checkpoint_dir',
                        help='(string) Path to a folder containing a checkpoint to restore the model from.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_PREFIX", "sagemaker"))
    parser.add_argument('--num-workers',
                        help="(int) The number of workers started in this pool",
                        type=int,
                        default=1)
    parser.add_argument('-r', '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp', '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("APP_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=os.environ.get("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=os.environ.get("MODEL_METADATA_FILE_S3_KEY", None))
    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)
    print("S3 bucket: %s" % args.s3_bucket)
    print("S3 prefix: %s" % args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH, 'model_metadata.json')
    load_model_metadata(s3_client, args.model_metadata_s3_key,
                        model_metadata_local_path)

    # The trainer publishes its IP to S3; that host also runs the Redis server.
    redis_ip = s3_client.get_ip()
    print("Received IP from SageMaker successfully: %s" % redis_ip)

    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix + "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        print("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        print("SageMaker hyperparameters not found.")

    preset_file_success = False
    environment_file_success = False
    preset_file_success, environment_file_success = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    if not environment_file_success:
        # Download reward function if environment file is not downloaded
        if not args.reward_file_s3_key:
            raise ValueError("Customer reward S3 key not supplied!")
        download_customer_reward_function(s3_client, args.reward_file_s3_key)
        # Import to register the default environment with Gym.
        import markov.environments
        print("Using default environment!")
    else:
        register_custom_environments()
        print("Using custom environment!")

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        print("Using custom preset file!")
    else:
        from markov.sagemaker_graph_manager import get_graph_manager
        graph_manager, _ = get_graph_manager(**sm_hyperparams_dict)

    # BUG FIX: the Redis port was hard-coded to 6379, silently ignoring the
    # parsed -rp/--redis_port option. Its default is 6379, so honoring the
    # option is backward-compatible.
    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=args.redis_port,
        run_type='worker',
        channel=args.s3_prefix)
    graph_manager.agent_params.memory.register_var('memory_backend_params',
                                                   memory_backend_params)

    ds_params_instance = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        aws_region=args.aws_region,
        s3_folder=args.s3_prefix)
    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    rollout_worker(
        graph_manager=graph_manager,
        checkpoint_dir=args.checkpoint_dir,
        data_store=data_store,
        num_workers=args.num_workers,
    )
def main():
    """Entry point for the RoboMaker rollout worker (custom-endpoint variant).

    Like the standard rollout worker, but supports a custom AWS endpoint URL
    for S3 and requires a customer reward function; failures while fetching
    the reward function exit gracefully via the SIMAPP error helpers.
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--checkpoint_dir',
                        help='(string) Path to a folder containing a checkpoint to restore the model from.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_PREFIX", "sagemaker"))
    parser.add_argument('--num-workers',
                        help="(int) The number of workers started in this pool",
                        type=int,
                        default=1)
    parser.add_argument('-r', '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp', '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("APP_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=os.environ.get("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=os.environ.get("MODEL_METADATA_FILE_S3_KEY", None))
    # Help text fixed: it previously said "(string) AWS region" (copy-paste).
    parser.add_argument('--aws_endpoint_url',
                        help='(string) AWS endpoint URL',
                        type=str,
                        default=os.environ.get("AWS_ENDPOINT_URL", None))
    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region,
                             endpoint_url=args.aws_endpoint_url)
    logger.info("S3 bucket: %s" % args.s3_bucket)
    logger.info("S3 prefix: %s" % args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH, 'model_metadata.json')
    load_model_metadata(s3_client, args.model_metadata_s3_key,
                        model_metadata_local_path)

    # Download reward function; missing key is a fatal configuration error.
    if not args.reward_file_s3_key:
        utils.json_format_logger(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket, args.s3_prefix),
            **utils.build_system_error_dict(
                utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
                utils.SIMAPP_EVENT_ERROR_CODE_500))
        traceback.print_exc()
        utils.simapp_exit_gracefully()
    download_customer_reward_function(s3_client, args.reward_file_s3_key)

    # Register the gym enviroment, this will give clients the ability to
    # creat the enviroment object
    register(id=defaults.ENV_ID,
             entry_point=defaults.ENTRY_POINT,
             max_episode_steps=defaults.MAX_STEPS,
             reward_threshold=defaults.THRESHOLD)

    # The trainer publishes its IP to S3; that host also runs the Redis server.
    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s" % redis_ip)

    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix + "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        logger.info("SageMaker hyperparameters not found.")

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        from markov.sagemaker_graph_manager import get_graph_manager
        graph_manager, _ = get_graph_manager(**sm_hyperparams_dict)

    logger.info("Connecting to redis at %s:%d" % (redis_ip, args.redis_port))
    # BUG FIX: the connection previously hard-coded redis_port=6379 while the
    # log line above reported args.redis_port — the parsed option was silently
    # ignored. Its default is 6379, so honoring it is backward-compatible.
    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=args.redis_port,
        run_type='worker',
        channel=args.s3_prefix)

    logger.info("Connecting to s3 boto data store at %s" % args.aws_endpoint_url)
    ds_params_instance = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        aws_region=args.aws_region,
        s3_folder=args.s3_prefix,
        aws_endpoint_url=args.aws_endpoint_url)
    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    rollout_worker(graph_manager=graph_manager,
                   checkpoint_dir=args.checkpoint_dir,
                   data_store=data_store,
                   num_workers=args.num_workers,
                   memory_backend_params=memory_backend_params)
def main():
    """Entry point for the object-tracker rollout worker.

    Resolves a preset (downloaded from S3 if present, otherwise from the
    bundled Markov presets directory), connects the Redis pub/sub memory
    backend to the trainer's IP, waits for a first checkpoint in S3, then
    starts rolling out episodes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--markov-preset-file',
                        help="(string) Name of a preset file to run in Markov's preset directory.",
                        type=str,
                        default=os.environ.get("MARKOV_PRESET_FILE", "object_tracker.py"))
    parser.add_argument('-c', '--local-model-directory',
                        help='(string) Path to a folder containing a checkpoint to restore the model from.',
                        type=str,
                        default=os.environ.get("LOCAL_MODEL_DIRECTORY", "./checkpoint"))
    parser.add_argument('-n', '--num-rollout-workers',
                        help="(int) Number of workers for multi-process based agents, e.g. A3C",
                        default=os.environ.get("NUMBER_OF_ROLLOUT_WORKERS", 1),
                        type=int)
    parser.add_argument('--model-s3-bucket',
                        help='(string) S3 bucket where trained models are stored. It contains model checkpoints.',
                        type=str,
                        default=os.environ.get("MODEL_S3_BUCKET"))
    parser.add_argument('--model-s3-prefix',
                        help='(string) S3 prefix where trained models are stored. It contains model checkpoints.',
                        type=str,
                        default=os.environ.get("MODEL_S3_PREFIX"))
    parser.add_argument('--aws-region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("ROS_AWS_REGION", "us-west-2"))
    args = parser.parse_args()

    data_store_params_instance = S3BotoDataStoreParameters(bucket_name=args.model_s3_bucket,
                                                           s3_folder=args.model_s3_prefix,
                                                           checkpoint_dir=args.local_model_directory,
                                                           aws_region=args.aws_region)
    data_store = S3BotoDataStore(data_store_params_instance)

    # Get the IP of the trainer machine
    trainer_ip = data_store.get_ip()
    print("Received IP from SageMaker successfully: %s" % trainer_ip)

    # Prefer a preset (and optional environment) uploaded to S3; otherwise
    # fall back to the bundled Markov presets directory.
    preset_file_success = data_store.download_presets_if_present(PRESET_LOCAL_PATH)
    if preset_file_success:
        environment_file_success = data_store.download_environments_if_present(ENVIRONMENT_LOCAL_PATH)
        path_and_module = PRESET_LOCAL_PATH + args.markov_preset_file + ":graph_manager"
        graph_manager = short_dynamic_import(path_and_module, ignore_module_case=True)
        if environment_file_success:
            # Import to register the downloaded custom environments.
            import robomaker.environments
        print("Using custom preset file!")
    elif args.markov_preset_file:
        markov_path = imp.find_module("markov")[1]
        preset_location = os.path.join(markov_path, "presets", args.markov_preset_file)
        path_and_module = preset_location + ":graph_manager"
        graph_manager = short_dynamic_import(path_and_module, ignore_module_case=True)
        print("Using custom preset file from Markov presets directory!")
    else:
        raise ValueError("Unable to determine preset file")

    # Experience is published to the trainer over Redis pub/sub.
    memory_backend_params = RedisPubSubMemoryBackendParameters(redis_address=trainer_ip,
                                                               redis_port=TRAINER_REDIS_PORT,
                                                               run_type='worker',
                                                               channel=args.model_s3_prefix)
    graph_manager.agent_params.memory.register_var('memory_backend_params',
                                                   memory_backend_params)

    graph_manager.data_store_params = data_store_params_instance
    graph_manager.data_store = data_store

    # Block until the trainer has written a first checkpoint to S3.
    utils.wait_for_checkpoint(checkpoint_dir=args.local_model_directory,
                              data_store=data_store)

    rollout_worker(
        graph_manager=graph_manager,
        checkpoint_dir=args.local_model_directory,
        data_store=data_store,
        num_workers=args.num_rollout_workers
    )
def main():
    """Entry point for the SageMaker training worker.

    Parses job configuration, publishes this host's IP to S3 so rollout
    workers can find the Redis server, optionally seeds training from a
    pre-trained model, wires the Redis memory backend and S3 data store into
    the graph manager, and hands control to ``training_worker``.
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint-dir',
        help=
        '(string) Path to a local folder containing a checkpoint to write the model to.',
        type=str,
        default='./checkpoint')
    parser.add_argument(
        '--pretrained-checkpoint-dir',
        help=
        '(string) Path to a local folder for downloading a pre-trained model',
        type=str,
        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get(
                            "SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--RLCOACH_PRESET',
                        help='(string) Default preset to use',
                        type=str,
                        default='object_tracker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        required=True)
    # parse_known_args: SageMaker passes extra hyperparameter flags that this
    # script deliberately ignores.
    args, unknown = parser.parse_known_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)

    # Import to register the environment with Gym
    import robomaker.environments

    preset_location = "robomaker.presets.%s:graph_manager" % args.RLCOACH_PRESET
    graph_manager = short_dynamic_import(preset_location, ignore_module_case=True)

    # Publish this host's IP so rollout workers can connect to Redis here.
    host_ip_address = get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    print("Uploaded IP address information to S3: %s" % host_ip_address)

    # Optionally warm-start from a pre-trained model; both bucket and prefix
    # must be supplied for the download to happen.
    use_pretrained_model = False
    if args.pretrained_s3_bucket and args.pretrained_s3_prefix:
        s3_client_pretrained = SageS3Client(
            bucket=args.pretrained_s3_bucket,
            s3_prefix=args.pretrained_s3_prefix,
            aws_region=args.aws_region)
        s3_client_pretrained.download_model(PRETRAINED_MODEL_DIR)
        use_pretrained_model = True

    # Trainer runs Redis locally; rollout workers publish experience to it.
    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type='trainer',
        channel=args.s3_prefix)
    graph_manager.agent_params.memory.register_var('memory_backend_params',
                                                   memory_backend_params)

    ds_params_instance = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        s3_folder=args.s3_prefix,
        aws_region=args.aws_region)
    graph_manager.data_store_params = ds_params_instance

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    training_worker(graph_manager=graph_manager,
                    checkpoint_dir=args.checkpoint_dir,
                    use_pretrained_model=use_pretrained_model,
                    framework=args.framework)
def main():
    """Run the simulation-side rollout worker for a DeepRacer training job.

    Reads job configuration from ROS parameters (overridable via CLI flags),
    fetches model metadata, the customer's reward function, and optional
    custom files from S3, assembles the agent list plus metrics/S3-writer
    plumbing, connects to the Redis experience channel advertised by
    SageMaker, and finally enters ``rollout_worker`` (which does not return
    under normal operation).

    Fixes vs. previous revision (behavior-compatible):
    - ``--is_continuous`` now parses with ``utils.str2bool`` instead of
      ``type=bool`` (with ``type=bool`` any non-empty string, including
      "False", parsed as True); the default is unchanged.
    - The S3-endpoint log line uses lazy %-style args like its neighbors.
    - Removed a dead pre-assignment of ``hyperparameters_file_success``.
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX",
                                                "sagemaker"))
    parser.add_argument('--s3_endpoint_url',
                        help='(string) S3 endpoint URL',
                        type=str,
                        default=rospy.get_param("S3_ENDPOINT_URL", None))
    parser.add_argument(
        '--num-workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=1)
    parser.add_argument('-r', '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp', '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY",
                                                None))
    # For training jobs, reset is not allowed. penalty_seconds,
    # off_track_penalty, and collision_penalty will all be 0 by default.
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 0.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "TRAINING"))
    # ``type=bool`` would treat ANY non-empty CLI string (even "False") as
    # True; str2bool parses the intended truthiness. Default unchanged.
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=utils.str2bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY", 0.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY", 0.0)))
    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region,
                             s3_endpoint_url=args.s3_endpoint_url)
    logger.info("S3 bucket: %s", args.s3_bucket)
    logger.info("S3 prefix: %s", args.s3_prefix)
    # Lazy %-style args, consistent with the two calls above.
    logger.info("S3 endpoint URL: %s", args.s3_endpoint_url)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key,
                              model_metadata_local_path)

    # Download and import reward function
    if not args.reward_file_s3_key:
        log_and_exit(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket, args.s3_prefix),
            SIMAPP_SIMULATION_WORKER_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500)
    download_customer_reward_function(s3_client, args.reward_file_s3_key)
    try:
        from custom_files.customer_reward_function import reward_function
    except Exception as e:
        log_and_exit("Failed to import user's reward_function: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)

    # Instantiate Cameras
    configure_camera(namespaces=['racecar'])

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    #! TODO each agent should have own config
    _, _, version = utils_parse_model_metadata.parse_model_metadata(
        model_metadata_local_path)
    agent_config = {
        'model_metadata': model_metadata_local_path,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value: LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value: VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value: STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
            utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
            utils.str2bool(
                rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.ACTION_SPACE_PATH.value:
            'custom_files/model_metadata.json',
            ConfigParams.REWARD.value: reward_function,
            ConfigParams.AGENT_NAME.value: 'racecar',
            ConfigParams.VERSION.value: version,
            ConfigParams.NUMBER_OF_RESETS.value: args.number_of_resets,
            ConfigParams.PENALTY_SECONDS.value: args.penalty_seconds,
            ConfigParams.NUMBER_OF_TRIALS.value: None,
            ConfigParams.IS_CONTINUOUS.value: args.is_continuous,
            ConfigParams.RACE_TYPE.value: args.race_type,
            ConfigParams.COLLISION_PENALTY.value: args.collision_penalty,
            ConfigParams.OFF_TRACK_PENALTY.value: args.off_track_penalty
        }
    }

    #! TODO each agent should have own s3 bucket
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value:
        rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value:
        rospy.get_param('METRICS_S3_OBJECT_KEY'),
        MetricsS3Keys.ENDPOINT_URL.value:
        rospy.get_param('S3_ENDPOINT_URL', None),
        MetricsS3Keys.REGION.value:
        rospy.get_param('AWS_REGION'),
        MetricsS3Keys.STEP_BUCKET.value:
        rospy.get_param('SAGEMAKER_SHARED_S3_BUCKET'),
        MetricsS3Keys.STEP_KEY.value:
        os.path.join(rospy.get_param('SAGEMAKER_SHARED_S3_PREFIX'),
                     TRAINING_SIMTRACE_DATA_S3_OBJECT_KEY)
    }
    metrics_s3_model_cfg = {
        MetricsS3Keys.METRICS_BUCKET.value:
        args.s3_bucket,
        MetricsS3Keys.METRICS_KEY.value:
        os.path.join(args.s3_prefix, DEEPRACER_CHKPNT_KEY_SUFFIX),
        MetricsS3Keys.REGION.value:
        args.aws_region
    }
    run_phase_subject = RunPhaseSubject()

    agent_list = list()
    agent_list.append(
        create_rollout_agent(
            agent_config,
            TrainingMetrics('agent', metrics_s3_config, metrics_s3_model_cfg,
                            args.checkpoint_dir, run_phase_subject),
            run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())

    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)

    aws_region = rospy.get_param('AWS_REGION', args.aws_region)
    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET', None)
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')

    # Upload jobs for simulation traces and the three camera MP4 streams,
    # only for the buckets that were actually configured.
    s3_writer_job_info = []
    if simtrace_s3_bucket:
        s3_writer_job_info.append(
            IterationData(
                'simtrace', simtrace_s3_bucket, simtrace_s3_object_prefix,
                aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.SIM_TRACE_TRAINING_LOCAL_FILE.
                    value)))
    if mp4_s3_bucket:
        s3_writer_job_info.extend([
            IterationData(
                'pip', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.
                    CAMERA_PIP_MP4_VALIDATION_LOCAL_PATH.value)),
            IterationData(
                '45degree', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.
                    CAMERA_45DEGREE_MP4_VALIDATION_LOCAL_PATH.value)),
            IterationData(
                'topview', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.
                    CAMERA_TOPVIEW_MP4_VALIDATION_LOCAL_PATH.value))
        ])
    s3_writer = S3Writer(job_info=s3_writer_job_info,
                         s3_endpoint_url=args.s3_endpoint_url)

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s", redis_ip)

    # Download hyperparameters from SageMaker
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")

    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as filepointer:
            sm_hyperparams_dict = json.load(filepointer)
    else:
        logger.info("SageMaker hyperparameters not found.")

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=run_phase_subject)

    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix)
    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_names={'agent': args.s3_bucket},
        base_checkpoint_dir=args.checkpoint_dir,
        s3_folders={'agent': args.s3_prefix},
        s3_endpoint_url=args.s3_endpoint_url)
    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   num_workers=args.num_workers,
                   task_parameters=task_parameters,
                   s3_writer=s3_writer)
def main():
    """Run the simulation-side rollout worker (legacy variant without
    reset/penalty parameters or a configurable S3 endpoint).

    Fetches model metadata, the customer's reward function, hyperparameters
    and optional custom files from S3, builds the agent list and graph
    manager, connects the Redis experience channel, and enters
    ``rollout_worker``.
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX",
                                                "sagemaker"))
    parser.add_argument(
        '--num-workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=1)
    parser.add_argument('-r', '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp', '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY",
                                                None))
    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)
    logger.info("S3 bucket: %s" % args.s3_bucket)
    logger.info("S3 prefix: %s" % args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key,
                              model_metadata_local_path)

    # Download and import reward function
    if not args.reward_file_s3_key:
        utils.log_and_exit(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket, args.s3_prefix),
            utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
            utils.SIMAPP_EVENT_ERROR_CODE_500)
    download_customer_reward_function(s3_client, args.reward_file_s3_key)
    try:
        from custom_files.customer_reward_function import reward_function
    except Exception as e:
        utils.log_and_exit(
            "Failed to import user's reward_function: {}".format(e),
            utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
            utils.SIMAPP_EVENT_ERROR_CODE_400)

    # Instantiate Cameras
    configure_camera()

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s" % redis_ip)

    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        logger.info("SageMaker hyperparameters not found.")

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    #! TODO each agent should have own config
    _, _, version = utils_parse_model_metadata.parse_model_metadata(
        model_metadata_local_path)
    # NOTE(review): 'car_ctrl_cnfig' is a misspelled literal key ("cnfig");
    # newer code uses ConfigParams.CAR_CTRL_CONFIG.value here. Left as-is
    # because the consumer of agent_config may read this exact string —
    # verify before renaming.
    agent_config = {
        'model_metadata': model_metadata_local_path,
        'car_ctrl_cnfig': {
            ConfigParams.LINK_NAME_LIST.value: LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value: VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value: STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
            utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
            utils.str2bool(
                rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.ACTION_SPACE_PATH.value:
            'custom_files/model_metadata.json',
            ConfigParams.REWARD.value: reward_function,
            ConfigParams.AGENT_NAME.value: 'racecar',
            ConfigParams.VERSION.value: version
        }
    }

    #! TODO each agent should have own s3 bucket
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value:
        rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value:
        rospy.get_param('METRICS_S3_OBJECT_KEY'),
        MetricsS3Keys.REGION.value:
        rospy.get_param('AWS_REGION'),
        MetricsS3Keys.STEP_BUCKET.value:
        rospy.get_param('SAGEMAKER_SHARED_S3_BUCKET'),
        MetricsS3Keys.STEP_KEY.value:
        os.path.join(rospy.get_param('SAGEMAKER_SHARED_S3_PREFIX'),
                     TRAINING_SIMTRACE_DATA_S3_OBJECT_KEY)
    }
    agent_list = list()
    agent_list.append(
        create_rollout_agent(agent_config, TrainingMetrics(metrics_s3_config)))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(sm_hyperparams_dict, agent_list)

    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix)
    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        s3_folder=args.s3_prefix)
    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   data_store=data_store,
                   num_workers=args.num_workers,
                   task_parameters=task_parameters)