def evaluation_worker(graph_manager, data_store, number_of_trials, task_parameters):
    """ Evaluation worker function

    Waits for a checkpoint, builds the graph with gazebo pause/unpause hooks,
    runs the requested number of evaluations and then cancels the simulation job.

    Arguments:
        graph_manager -- Graph manager whose graph is created and evaluated
        data_store -- Data store handed to wait_for_checkpoint
        number_of_trials {int} -- Number of times graph_manager.evaluate is called
        task_parameters -- Task parameters; checkpoint_restore_path is read from it
    """
    restore_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(restore_dir, data_store)

    # Both physics services have to be available before their proxies are built
    pause_service_name = '/gazebo/pause_physics'
    unpause_service_name = '/gazebo/unpause_physics'
    rospy.wait_for_service(pause_service_name)
    rospy.wait_for_service(unpause_service_name)
    stop_physics_proxy = ServiceProxyWrapper(pause_service_name, Empty)
    start_physics_proxy = ServiceProxyWrapper(unpause_service_name, Empty)

    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=stop_physics_proxy,
                               start_physics=start_physics_proxy,
                               empty_service_call=EmptyRequest)

    # Instantiate Cameras
    configure_camera()

    # Physics is resumed before the internal state is reset
    start_physics_proxy(EmptyRequest())
    graph_manager.reset_internal_state(True)

    for _trial in range(number_of_trials):
        graph_manager.evaluate(EnvironmentSteps(1))

    # Close down the job
    job_arn = os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN')
    aws_region = rospy.get_param('AWS_REGION')
    utils.cancel_simulation_job(job_arn, aws_region)
def main():
    """ Main function for tournament

    Reads the S3 region, bucket, prefix and yaml file name from sys.argv[1:5],
    restores the candidate queue and tournament report from S3 when they exist
    (otherwise initializes them from the yaml values), runs a race between the
    two cars at the front of the queue and persists the updated queue and
    report back to S3.

    The race loop body runs at most once per invocation: when more than one
    candidate is left afterwards the simulation job is restarted, otherwise the
    final report is uploaded as json and the simulation job is canceled.

    Failures are reported through log_and_exit: a ValueError as a 400 error,
    any other exception as a 500 error.
    """
    try:
        # parse argument
        s3_region = sys.argv[1]
        s3_bucket = sys.argv[2]
        s3_prefix = sys.argv[3]
        s3_yaml_name = sys.argv[4]

        # create the S3 client used for every download/upload below
        s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", None)
        s3_client = S3Client(region_name=s3_region,
                             s3_endpoint_url=s3_endpoint_url)

        # Intermediate tournament files
        queue_pickle_name = 'tournament_candidate_queue.pkl'
        queue_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, queue_pickle_name))
        local_queue_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), queue_pickle_name))

        report_pickle_name = 'tournament_report.pkl'
        report_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, report_pickle_name))
        local_report_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), report_pickle_name))

        final_report_name = 'tournament_report.json'
        final_report_s3_key = os.path.normpath(
            os.path.join(s3_prefix, final_report_name))

        # Best effort restore of the state persisted by a previous run. A failed
        # download is not fatal: the queue is rebuilt from the yaml values below
        # when the local queue pickle does not exist.
        try:
            s3_client.download_file(bucket=s3_bucket,
                                    s3_key=queue_pickle_s3_key,
                                    local_path=local_queue_pickle_path)
            s3_client.download_file(bucket=s3_bucket,
                                    s3_key=report_pickle_s3_key,
                                    local_path=local_report_pickle_path)
        except Exception as ex:
            logger.info(
                "Previous tournament state not restored from S3: {}".format(ex))

        # download yaml file
        yaml_file = YamlFile(
            agent_type=AgentType.TOURNAMENT.value,
            bucket=s3_bucket,
            s3_key=get_s3_key(s3_prefix, s3_yaml_name),
            region_name=s3_region,
            s3_endpoint_url=s3_endpoint_url,
            local_path=YAML_LOCAL_PATH_FORMAT.format(s3_yaml_name))

        yaml_dict = yaml_file.get_yaml_values()

        if os.path.exists(local_queue_pickle_path):
            # NOTE(review): pickle.load can execute arbitrary code; these files
            # come from the job's own S3 prefix and are treated as trusted here.
            with open(local_queue_pickle_path, 'rb') as f:
                tournament_candidate_queue = pickle.load(f)
            with open(local_report_pickle_path, 'rb') as f:
                tournament_report = pickle.load(f)
            logger.info('tournament_candidate_queue loaded from existing file')
        else:
            logger.info('tournament_candidate_queue initialized')
            tournament_candidate_queue = deque()
            # TODO: Deprecate the DISPLAY_NAME and use only the RACER_NAME without if else check
            racer_names = yaml_dict.get(YamlKey.RACER_NAME_YAML_KEY.value, [None])
            has_racer_names = None not in racer_names
            for agent_idx, _ in enumerate(
                    yaml_dict[YamlKey.MODEL_S3_BUCKET_YAML_KEY.value]):
                # The order of this tuple must match the unpacking in the race loop
                tournament_candidate_queue.append((
                    yaml_dict[YamlKey.MODEL_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MODEL_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MODEL_METADATA_FILE_S3_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.METRICS_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.METRICS_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.SIMTRACE_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.SIMTRACE_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MP4_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MP4_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.DISPLAY_NAME_YAML_KEY.value][agent_idx],
                    racer_names[agent_idx] if has_racer_names else "",
                    yaml_dict[YamlKey.BODY_SHELL_TYPE_YAML_KEY.value][agent_idx]
                ))
            tournament_report = {"race_results": []}

        # Races already recorded in the report determine the next race index
        race_idx = len(tournament_report["race_results"])
        while len(tournament_candidate_queue) > 1:
            car1 = tournament_candidate_queue.popleft()
            car2 = tournament_candidate_queue.popleft()
            (car1_model_s3_bucket, car1_s3_prefix, car1_model_metadata,
             car1_metrics_bucket, car1_metrics_s3_key, car1_simtrace_bucket,
             car1_simtrace_prefix, car1_mp4_bucket, car1_mp4_prefix,
             car1_display_name, car1_racer_name, car1_body_shell_type) = car1
            (car2_model_s3_bucket, car2_s3_prefix, car2_model_metadata,
             car2_metrics_bucket, car2_metrics_s3_key, car2_simtrace_bucket,
             car2_simtrace_prefix, car2_mp4_bucket, car2_mp4_prefix,
             car2_display_name, car2_racer_name, car2_body_shell_type) = car2

            race_yaml_dict = generate_race_yaml(yaml_dict=yaml_dict,
                                                car1=car1,
                                                car2=car2,
                                                race_idx=race_idx)
            if s3_endpoint_url is not None:
                race_yaml_dict["S3_ENDPOINT_URL"] = s3_endpoint_url

            race_model_s3_buckets = [car1_model_s3_bucket, car2_model_s3_bucket]
            race_model_metadatas = [car1_model_metadata, car2_model_metadata]
            body_shell_types = [car1_body_shell_type, car2_body_shell_type]

            # List of directories created
            dirs_to_delete = list()
            yaml_dir = os.path.abspath(os.path.join(os.getcwd(), str(race_idx)))
            os.makedirs(yaml_dir)

            dirs_to_delete.append(yaml_dir)
            race_yaml_path = os.path.abspath(
                os.path.join(yaml_dir, 'evaluation_params.yaml'))
            with open(race_yaml_path, 'w') as race_yaml_file:
                yaml.dump(race_yaml_dict, race_yaml_file)

            # List of racecar names that should include second camera while launching
            racecars_with_stereo_cameras = list()
            # List of racecar names that should include lidar while launching
            racecars_with_lidars = list()
            # List of SimApp versions
            simapp_versions = list()
            for agent_index, (model_s3_bucket, json_key) in enumerate(
                    zip(race_model_s3_buckets, race_model_metadatas)):
                racecar_name = 'racecar_' + str(agent_index)
                # download model metadata
                try:
                    model_metadata = ModelMetadata(
                        bucket=model_s3_bucket,
                        s3_key=json_key,
                        region_name=s3_region,
                        s3_endpoint_url=s3_endpoint_url,
                        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format(
                            racecar_name))
                    dirs_to_delete.append(model_metadata.local_dir)
                except Exception as e:
                    log_and_exit(
                        "Failed to download model_metadata file: s3_bucket: {}, s3_key: {}, {}"
                        .format(model_s3_bucket, json_key, e),
                        SIMAPP_SIMULATION_WORKER_EXCEPTION,
                        SIMAPP_EVENT_ERROR_CODE_500)
                sensors, _, simapp_version = model_metadata.get_model_metadata_info()
                # str() because the versions are joined into a command line argument
                simapp_versions.append(str(simapp_version))
                if Input.STEREO.value in sensors:
                    racecars_with_stereo_cameras.append(racecar_name)
                if Input.LIDAR.value in sensors or Input.SECTOR_LIDAR.value in sensors:
                    racecars_with_lidars.append(racecar_name)

            cmd = [
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "tournament_race_node.py"),
                str(race_idx), race_yaml_path,
                ','.join(racecars_with_stereo_cameras),
                ','.join(racecars_with_lidars),
                ','.join(simapp_versions),
                ','.join(body_shell_types)
            ]
            try:
                # The values returned by run_cmd are not used; the race result
                # is read from race_report.pkl below.
                run_cmd(cmd_args=cmd, shell=False, stdout=None, stderr=None)
            except KeyboardInterrupt:
                logger.info(
                    "KeyboardInterrupt raised, SimApp must be faulted! exiting..."
                )
                return

            # Retrieve winner and append tournament report
            with open('race_report.pkl', 'rb') as f:
                race_report = pickle.load(f)
            race_report['race_idx'] = race_idx
            # Any winner other than car1's display name is attributed to car2
            winner = car1 if race_report['winner'] == car1_display_name else car2
            logger.info("race {}'s winner: {}".format(race_idx,
                                                      race_report['winner']))

            tournament_candidate_queue.append(winner)
            tournament_report["race_results"].append(race_report)

            # Clean up directories created
            for dir_to_delete in dirs_to_delete:
                shutil.rmtree(dir_to_delete, ignore_errors=True)
            race_idx += 1

            s3_extra_args = get_s3_kms_extra_args()
            # Persist latest queue and report to use after job restarts.
            with open(local_queue_pickle_path, 'wb') as f:
                pickle.dump(tournament_candidate_queue, f, protocol=2)
            s3_client.upload_file(bucket=s3_bucket,
                                  s3_key=queue_pickle_s3_key,
                                  local_path=local_queue_pickle_path,
                                  s3_kms_extra_args=s3_extra_args)

            with open(local_report_pickle_path, 'wb') as f:
                pickle.dump(tournament_report, f, protocol=2)
            s3_client.upload_file(bucket=s3_bucket,
                                  s3_key=report_pickle_s3_key,
                                  local_path=local_report_pickle_path,
                                  s3_kms_extra_args=s3_extra_args)

            # If there is more than 1 candidates then restart the simulation job otherwise
            # tournament is finished, persists final report and ends the job.
            if len(tournament_candidate_queue) > 1:
                restart_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
                break
            else:
                # Persist final tournament report in json format
                # and terminate the job by canceling it
                s3_client.put_object(bucket=s3_bucket,
                                     s3_key=final_report_s3_key,
                                     body=json.dumps(tournament_report),
                                     s3_kms_extra_args=s3_extra_args)

                cancel_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
    except ValueError as ex:
        log_and_exit("User modified model_metadata.json: {}".format(ex),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)
    except Exception as e:
        log_and_exit("Tournament node failed: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
def main():
    """ Main function of the rollout worker

    Parses the command line arguments (defaults come from ROS parameters),
    downloads the model metadata, the customer reward function and optional
    custom files from S3, assembles the agent list and the S3 writer job info,
    builds the graph manager (custom preset or get_graph_manager) and finally
    hands control to rollout_worker.
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--checkpoint_dir',
        help='(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX", "sagemaker"))
    parser.add_argument('--num_workers',
                        help="(int) The number of workers started in this pool",
                        type=int,
                        default=int(rospy.get_param("NUM_WORKERS", 1)))
    parser.add_argument('--rollout_idx',
                        help="(int) The index of current rollout worker",
                        type=int,
                        default=0)
    # NOTE(review): --redis_ip and --redis_port are parsed but not read below;
    # the redis address comes from s3_client.get_ip() and the port is fixed.
    parser.add_argument('-r', '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp', '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY", None))
    # For training job, reset is not allowed. penalty_seconds, off_track_penalty, and
    # collision_penalty will all be 0 by default
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 0.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "TRAINING"))
    # type=bool would turn every non-empty command line value (including
    # "False") into True, so the value is converted with the same helper
    # that is applied to the ROS parameter default.
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=utils.str2bool,
                        default=utils.str2bool(rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY", 0.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY", 0.0)))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)
    logger.info("S3 bucket: %s", args.s3_bucket)
    logger.info("S3 prefix: %s", args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH, 'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key, model_metadata_local_path)

    # Download and import reward function
    if not args.reward_file_s3_key:
        log_and_exit(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket, args.s3_prefix),
            SIMAPP_SIMULATION_WORKER_EXCEPTION,
            SIMAPP_EVENT_ERROR_CODE_500)
    download_customer_reward_function(s3_client, args.reward_file_s3_key)

    try:
        from custom_files.customer_reward_function import reward_function
    except Exception as e:
        log_and_exit("Failed to import user's reward_function: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)

    # Instantiate Cameras
    configure_camera(namespaces=['racecar'])

    preset_file_success, _ = download_custom_files_if_present(s3_client, args.s3_prefix)

    #! TODO each agent should have own config
    _, _, version = utils_parse_model_metadata.parse_model_metadata(model_metadata_local_path)
    agent_config = {
        'model_metadata': model_metadata_local_path,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value: LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value: VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value: STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
                utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
                utils.str2bool(rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.ACTION_SPACE_PATH.value: 'custom_files/model_metadata.json',
            ConfigParams.REWARD.value: reward_function,
            ConfigParams.AGENT_NAME.value: 'racecar',
            ConfigParams.VERSION.value: version,
            ConfigParams.NUMBER_OF_RESETS.value: args.number_of_resets,
            ConfigParams.PENALTY_SECONDS.value: args.penalty_seconds,
            ConfigParams.NUMBER_OF_TRIALS.value: None,
            ConfigParams.IS_CONTINUOUS.value: args.is_continuous,
            ConfigParams.RACE_TYPE.value: args.race_type,
            ConfigParams.COLLISION_PENALTY.value: args.collision_penalty,
            ConfigParams.OFF_TRACK_PENALTY.value: args.off_track_penalty
        }
    }

    #! TODO each agent should have own s3 bucket
    # NOTE(review): step_metrics_prefix is computed but not read anywhere below.
    step_metrics_prefix = rospy.get_param('SAGEMAKER_SHARED_S3_PREFIX')
    if args.num_workers > 1:
        step_metrics_prefix = os.path.join(step_metrics_prefix, str(args.rollout_idx))
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value: rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value: rospy.get_param('METRICS_S3_OBJECT_KEY'),
        MetricsS3Keys.REGION.value: rospy.get_param('AWS_REGION')
    }
    metrics_s3_model_cfg = {
        MetricsS3Keys.METRICS_BUCKET.value: args.s3_bucket,
        MetricsS3Keys.METRICS_KEY.value:
            os.path.join(args.s3_prefix, DEEPRACER_CHKPNT_KEY_SUFFIX),
        MetricsS3Keys.REGION.value: args.aws_region
    }
    run_phase_subject = RunPhaseSubject()

    agent_list = list()
    agent_list.append(
        create_rollout_agent(
            agent_config,
            TrainingMetrics(agent_name='agent',
                            s3_dict_metrics=metrics_s3_config,
                            s3_dict_model=metrics_s3_model_cfg,
                            ckpnt_dir=args.checkpoint_dir,
                            run_phase_sink=run_phase_subject,
                            use_model_picker=(args.rollout_idx == 0)),
            run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())
    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)

    aws_region = rospy.get_param('AWS_REGION', args.aws_region)
    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    # The mp4 bucket is only looked up for the rollout worker with index 0
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET', None) if args.rollout_idx == 0 else None
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        if args.num_workers > 1:
            simtrace_s3_object_prefix = os.path.join(simtrace_s3_object_prefix,
                                                     str(args.rollout_idx))
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')

    s3_writer_job_info = []
    if simtrace_s3_bucket:
        s3_writer_job_info.append(
            IterationData(
                'simtrace', simtrace_s3_bucket, simtrace_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.SIM_TRACE_TRAINING_LOCAL_FILE.value)))
    if mp4_s3_bucket:
        s3_writer_job_info.extend([
            IterationData(
                'pip', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.CAMERA_PIP_MP4_VALIDATION_LOCAL_PATH.value)),
            IterationData(
                '45degree', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.CAMERA_45DEGREE_MP4_VALIDATION_LOCAL_PATH.value)),
            IterationData(
                'topview', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.CAMERA_TOPVIEW_MP4_VALIDATION_LOCAL_PATH.value))
        ])

    s3_writer = S3Writer(job_info=s3_writer_job_info)

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s", redis_ip)

    # Download hyperparameters from SageMaker
    hyperparams_s3_key = os.path.normpath(args.s3_prefix + "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as filepointer:
            sm_hyperparams_dict = json.load(filepointer)
    else:
        logger.info("SageMaker hyperparameters not found.")

    enable_domain_randomization = utils.str2bool(
        rospy.get_param('ENABLE_DOMAIN_RANDOMIZATION', False))
    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location, ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=run_phase_subject,
            enable_domain_randomization=enable_domain_randomization)

    # If num_episodes_between_training is smaller than num_workers then cancel worker early.
    episode_steps_per_rollout = \
        graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    # Reduce number of workers if allocated more than num_episodes_between_training
    if args.num_workers > episode_steps_per_rollout:
        logger.info(
            "Excess worker allocated. Reducing from {} to {}...".format(
                args.num_workers, episode_steps_per_rollout))
        args.num_workers = episode_steps_per_rollout
    if args.rollout_idx >= episode_steps_per_rollout or args.rollout_idx >= args.num_workers:
        err_msg_format = "Exiting excess worker..."
        err_msg_format += "(rollout_idx[{}] >= num_workers[{}] or num_episodes_between_training[{}])"
        logger.info(
            err_msg_format.format(args.rollout_idx, args.num_workers,
                                  episode_steps_per_rollout))
        # Close down the job
        # NOTE(review): execution continues after this call; presumably the
        # cancellation ends the process - confirm against cancel_simulation_job.
        utils.cancel_simulation_job(
            os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
            rospy.get_param('AWS_REGION'))

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix,
        num_workers=args.num_workers,
        rollout_idx=args.rollout_idx)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_names={'agent': args.s3_bucket},
        base_checkpoint_dir=args.checkpoint_dir,
        s3_folders={'agent': args.s3_prefix})

    graph_manager.data_store = S3BotoDataStore(ds_params_instance, graph_manager)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   num_workers=args.num_workers,
                   rollout_idx=args.rollout_idx,
                   task_parameters=task_parameters,
                   s3_writer=s3_writer)
def evaluation_worker(graph_manager, number_of_trials, task_parameters, s3_writers, is_continuous):
    """ Evaluation worker function

    Arguments:
        graph_manager {[MultiAgentGraphManager]} -- [Graph manager of multiagent graph manager]
        number_of_trials {[int]} -- [Number of trials to evaluate when is_continuous is false]
        task_parameters {[TaskParameters]} -- [Task parameters; checkpoint_restore_path is read from it]
        s3_writers {[S3Writer]} -- [Writers whose upload_to_s3 is called once the evaluation is done]
        is_continuous {bool} -- [When true a single evaluate call is made instead of number_of_trials calls]
    """
    checkpoint_dirs = []
    agent_names = []
    sub_topics = []
    unsub_topics = []
    subscribe_to_save_mp4 = []
    unsubscribe_from_save_mp4 = []

    for agent_param in graph_manager.agents_params:
        # A single agent restores straight from the restore path, several
        # agents each restore from a sub folder named after the agent
        if len(graph_manager.agents_params) == 1:
            agent_checkpoint_dir = task_parameters.checkpoint_restore_path
        else:
            agent_checkpoint_dir = os.path.join(
                task_parameters.checkpoint_restore_path, agent_param.name)
        agent_names.append(agent_param.name)
        checkpoint_dirs.append(agent_checkpoint_dir)

        # Agent names without "_" map to 'racecar', otherwise the second
        # "_"-separated piece of the name becomes the racecar suffix
        name_pieces = agent_param.name.split("_")
        if len(name_pieces) == 1:
            racecar_name = 'racecar'
        else:
            racecar_name = "racecar_{}".format(name_pieces[1])
        sub_topics.append(
            "/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name))
        unsub_topics.append(
            "/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name))

    wait_for_checkpoints(checkpoint_dirs, graph_manager.data_store)
    modify_checkpoint_variables(checkpoint_dirs, agent_names)

    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)

    # Every mp4 service is awaited first, the proxies are created afterwards
    for sub_topic, unsub_topic in zip(sub_topics, unsub_topics):
        rospy.wait_for_service(sub_topic)
        rospy.wait_for_service(unsub_topic)
    for sub_topic, unsub_topic in zip(sub_topics, unsub_topics):
        subscribe_to_save_mp4.append(ServiceProxyWrapper(sub_topic, Empty))
        unsubscribe_from_save_mp4.append(ServiceProxyWrapper(unsub_topic, Empty))

    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)
    logger.info(
        "Graph manager successfully created the graph: Unpausing physics")
    unpause_physics(EmptyRequest())

    graph_manager.reset_internal_state(True)

    # mp4 recording is toggled only when the MP4_S3_BUCKET parameter is set
    is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None)
    if is_save_mp4_enabled:
        for start_recording in subscribe_to_save_mp4:
            start_recording(EmptyRequest())

    # Continuous mode makes exactly one evaluate call
    evaluation_count = 1 if is_continuous else number_of_trials
    for _trial in range(evaluation_count):
        graph_manager.evaluate(EnvironmentSteps(1))

    if is_save_mp4_enabled:
        for stop_recording in unsubscribe_from_save_mp4:
            stop_recording(EmptyRequest())

    for writer in s3_writers:
        writer.upload_to_s3()
    time.sleep(1)
    pause_physics(EmptyRequest())

    # Close down the job
    job_arn = os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN')
    aws_region = rospy.get_param('AWS_REGION')
    utils.cancel_simulation_job(job_arn, aws_region)
def main():
    """ Main function for tournament

    Reads the S3 region, bucket, prefix and yaml file name from sys.argv[1:5],
    downloads the yaml file, restores the candidate queue and tournament report
    from S3 when they exist (otherwise initializes them from the yaml values),
    runs a race between the two cars at the front of the queue and persists the
    updated queue and report back to S3.

    The race loop body runs at most once per invocation: when more than one
    candidate is left afterwards the simulation job is restarted, otherwise the
    final report is uploaded as json and the simulation job is canceled.

    Any exception is reported through log_and_exit as a 500 error.
    """
    # Bound up front so the error report below never fails on an unbound name
    # when the failure happens before these values are computed.
    s3_bucket = None
    yaml_key = None
    try:
        # parse argument
        s3_region = sys.argv[1]
        s3_bucket = sys.argv[2]
        s3_prefix = sys.argv[3]
        s3_yaml_name = sys.argv[4]

        # create boto3 session/client and download yaml/json file
        session = boto3.session.Session()
        s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", None)
        s3_client = session.client('s3',
                                   region_name=s3_region,
                                   endpoint_url=s3_endpoint_url,
                                   config=get_boto_config())

        yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name))
        local_yaml_path = os.path.abspath(
            os.path.join(os.getcwd(), s3_yaml_name))
        s3_client.download_file(Bucket=s3_bucket,
                                Key=yaml_key,
                                Filename=local_yaml_path)

        # Intermediate tournament files
        queue_pickle_name = 'tournament_candidate_queue.pkl'
        queue_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, queue_pickle_name))
        local_queue_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), queue_pickle_name))

        report_pickle_name = 'tournament_report.pkl'
        report_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, report_pickle_name))
        local_report_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), report_pickle_name))

        final_report_name = 'tournament_report.json'
        final_report_s3_key = os.path.normpath(
            os.path.join(s3_prefix, final_report_name))

        # Best effort restore of the state persisted by a previous run. A failed
        # download is not fatal: the queue is rebuilt from the yaml values below
        # when the local queue pickle does not exist.
        try:
            s3_client.download_file(Bucket=s3_bucket,
                                    Key=queue_pickle_s3_key,
                                    Filename=local_queue_pickle_path)
            s3_client.download_file(Bucket=s3_bucket,
                                    Key=report_pickle_s3_key,
                                    Filename=local_report_pickle_path)
        except Exception as ex:
            logger.info(
                "Previous tournament state not restored from S3: {}".format(ex))

        # Get values passed in yaml files. Default values are for backward compatibility
        # and for single racecar racing
        yaml_dict = get_yaml_dict(local_yaml_path)

        # Forcing the yaml parameter to list
        force_list_params = [
            MODEL_S3_BUCKET_YAML_KEY, MODEL_S3_PREFIX_YAML_KEY,
            MODEL_METADATA_FILE_S3_YAML_KEY, METRICS_S3_BUCKET_YAML_KEY,
            METRICS_S3_PREFIX_YAML_KEY, SIMTRACE_S3_BUCKET_YAML_KEY,
            SIMTRACE_S3_PREFIX_YAML_KEY, MP4_S3_BUCKET_YAML_KEY,
            MP4_S3_PREFIX_YAML_KEY, DISPLAY_NAME_YAML_KEY
        ]
        for params in force_list_params:
            yaml_dict[params] = force_list(yaml_dict.get(params, None))

        # Populate the model_metadata_s3_key values to handle both training and evaluation
        # for all race_formats
        if None in yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY]:
            # MODEL_METADATA_FILE_S3_KEY not passed as part of yaml file ==> This happens
            # during evaluation
            # Assume model_metadata.json is present in the s3_prefix/model/ folder
            yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY] = [
                os.path.join(model_s3_prefix, 'model/model_metadata.json')
                for model_s3_prefix in yaml_dict[MODEL_S3_PREFIX_YAML_KEY]
            ]

        # Validate the yaml values
        validate_yaml_values(yaml_dict)

        if os.path.exists(local_queue_pickle_path):
            # NOTE(review): pickle.load can execute arbitrary code; these files
            # come from the job's own S3 prefix and are treated as trusted here.
            with open(local_queue_pickle_path, 'rb') as f:
                tournament_candidate_queue = pickle.load(f)
            with open(local_report_pickle_path, 'rb') as f:
                tournament_report = pickle.load(f)
            logger.info('tournament_candidate_queue loaded from existing file')
        else:
            logger.info('tournament_candidate_queue initialized')
            tournament_candidate_queue = deque()
            for agent_idx, _ in enumerate(yaml_dict[MODEL_S3_BUCKET_YAML_KEY]):
                # The order of this tuple must match the unpacking in the race loop
                tournament_candidate_queue.append(
                    (yaml_dict[MODEL_S3_BUCKET_YAML_KEY][agent_idx],
                     yaml_dict[MODEL_S3_PREFIX_YAML_KEY][agent_idx],
                     yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY][agent_idx],
                     yaml_dict[METRICS_S3_BUCKET_YAML_KEY][agent_idx],
                     yaml_dict[METRICS_S3_PREFIX_YAML_KEY][agent_idx],
                     yaml_dict[SIMTRACE_S3_BUCKET_YAML_KEY][agent_idx],
                     yaml_dict[SIMTRACE_S3_PREFIX_YAML_KEY][agent_idx],
                     yaml_dict[MP4_S3_BUCKET_YAML_KEY][agent_idx],
                     yaml_dict[MP4_S3_PREFIX_YAML_KEY][agent_idx],
                     yaml_dict[DISPLAY_NAME_YAML_KEY][agent_idx]))
            tournament_report = []

        # Races already recorded in the report determine the next race index
        race_idx = len(tournament_report)
        while len(tournament_candidate_queue) > 1:
            car1 = tournament_candidate_queue.popleft()
            car2 = tournament_candidate_queue.popleft()
            (car1_model_s3_bucket, car1_s3_prefix, car1_model_metadata,
             car1_metrics_bucket, car1_metrics_s3_key, car1_simtrace_bucket,
             car1_simtrace_prefix, car1_mp4_bucket, car1_mp4_prefix,
             car1_display_name) = car1
            (car2_model_s3_bucket, car2_s3_prefix, car2_model_metadata,
             car2_metrics_bucket, car2_metrics_s3_key, car2_simtrace_bucket,
             car2_simtrace_prefix, car2_mp4_bucket, car2_mp4_prefix,
             car2_display_name) = car2

            race_yaml_dict = generate_race_yaml(yaml_dict=yaml_dict,
                                                car1=car1,
                                                car2=car2,
                                                race_idx=race_idx)

            race_car_colors = ["Orange", "Purple"]
            race_model_s3_buckets = [car1_model_s3_bucket, car2_model_s3_bucket]
            race_model_metadatas = [car1_model_metadata, car2_model_metadata]

            # List of directories created
            dirs_to_delete = list()
            yaml_dir = os.path.abspath(os.path.join(os.getcwd(), str(race_idx)))
            os.makedirs(yaml_dir)

            dirs_to_delete.append(yaml_dir)
            race_yaml_path = os.path.abspath(
                os.path.join(yaml_dir, 'evaluation_params.yaml'))
            with open(race_yaml_path, 'w') as race_yaml_file:
                yaml.dump(race_yaml_dict, race_yaml_file)

            # List of racecar names that should include second camera while launching
            racecars_with_stereo_cameras = list()
            # List of racecar names that should include lidar while launching
            racecars_with_lidars = list()
            # List of SimApp versions
            simapp_versions = list()
            for agent_index, model_s3_bucket in enumerate(race_model_s3_buckets):
                racecar_name = 'racecar_' + str(agent_index)
                # Make a local folder with the racecar name to download the model_metadata.json
                racecar_dir = os.path.join(os.getcwd(), racecar_name)
                os.makedirs(racecar_dir)
                dirs_to_delete.append(racecar_dir)
                local_model_metadata_path = os.path.abspath(
                    os.path.join(racecar_dir, 'model_metadata.json'))
                # Strip a leading s3://<bucket>/ so only the object key remains
                json_key = race_model_metadatas[agent_index]
                json_key = json_key.replace(
                    's3://{}/'.format(model_s3_bucket), '')
                s3_client.download_file(Bucket=model_s3_bucket,
                                        Key=json_key,
                                        Filename=local_model_metadata_path)
                sensors, _, simapp_version = utils_parse_model_metadata.parse_model_metadata(
                    local_model_metadata_path)
                # str() because ','.join below only accepts strings
                simapp_versions.append(str(simapp_version))
                if Input.STEREO.value in sensors:
                    racecars_with_stereo_cameras.append(racecar_name)
                if Input.LIDAR.value in sensors or Input.SECTOR_LIDAR.value in sensors:
                    racecars_with_lidars.append(racecar_name)

            cmd = [
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "tournament_race_node.py"),
                str(race_idx), race_yaml_path,
                ','.join(racecars_with_stereo_cameras),
                ','.join(racecars_with_lidars),
                ','.join(race_car_colors),
                ','.join(simapp_versions)
            ]
            try:
                # The values returned by run_cmd are not used; the race result
                # is read from race_report.pkl below.
                run_cmd(cmd_args=cmd, shell=False, stdout=None, stderr=None)
            except KeyboardInterrupt:
                logger.info(
                    "KeyboardInterrupt raised, SimApp must be faulted! exiting..."
                )
                return

            # Retrieve winner and append tournament report
            with open('race_report.pkl', 'rb') as f:
                race_report = pickle.load(f)
            race_report['race_idx'] = race_idx
            # Any winner other than car1's display name is attributed to car2
            winner = car1 if race_report['winner'] == car1_display_name else car2
            logger.info("race {}'s winner: {}".format(race_idx,
                                                      race_report['winner']))

            tournament_candidate_queue.append(winner)
            tournament_report.append(race_report)

            # Clean up directories created
            for dir_to_delete in dirs_to_delete:
                shutil.rmtree(dir_to_delete, ignore_errors=True)
            race_idx += 1

            # Persist latest queue and report to use after job restarts.
            with open(local_queue_pickle_path, 'wb') as f:
                pickle.dump(tournament_candidate_queue, f, protocol=2)
            s3_client.upload_file(Filename=local_queue_pickle_path,
                                  Bucket=s3_bucket,
                                  Key=queue_pickle_s3_key)

            with open(local_report_pickle_path, 'wb') as f:
                pickle.dump(tournament_report, f, protocol=2)
            s3_client.upload_file(Filename=local_report_pickle_path,
                                  Bucket=s3_bucket,
                                  Key=report_pickle_s3_key)

            # If there is more than 1 candidates then restart the simulation job otherwise
            # tournament is finished, persists final report and ends the job.
            if len(tournament_candidate_queue) > 1:
                restart_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
                break
            else:
                # Persist final tournament report in json format
                # and terminate the job by canceling it
                s3_client.put_object(Bucket=s3_bucket,
                                     Key=final_report_s3_key,
                                     Body=json.dumps(tournament_report))

                cancel_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
    except Exception as e:
        log_and_exit(
            "Tournament node failed: s3_bucket: {}, yaml_key: {}, {}".format(
                s3_bucket, yaml_key, e),
            SIMAPP_SIMULATION_WORKER_EXCEPTION,
            SIMAPP_EVENT_ERROR_CODE_500)
def main():
    """ Entry point of the rollout worker.

    Parses the command line arguments (most defaults are read from ROS parameters),
    fetches the reward function, model metadata, ip config and hyperparameters through
    their S3 helper classes, builds the agent list and the graph manager, and finally
    hands control over to rollout_worker().
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--checkpoint_dir',
                        help='(string) Path to a folder containing a checkpoint to restore the model from.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX", "sagemaker"))
    parser.add_argument('--num_workers',
                        help="(int) The number of workers started in this pool",
                        type=int,
                        default=int(rospy.get_param("NUM_WORKERS", 1)))
    parser.add_argument('--rollout_idx',
                        help="(int) The index of current rollout worker",
                        type=int,
                        default=0)
    # NOTE: --redis_ip is parsed but the address actually used below comes from IpConfig.
    parser.add_argument('-r', '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp', '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY", None))
    # For training job, reset is not allowed. penalty_seconds, off_track_penalty, and
    # collision_penalty will all be 0 by default
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 0.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "TRAINING"))
    # argparse's type=bool turns every non-empty string (even "False") into True,
    # so the value is parsed with the same helper that is used for the default.
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=utils.str2bool,
                        default=utils.str2bool(rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY", 0.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY", 0.0)))

    args = parser.parse_args()

    logger.info("S3 bucket: %s", args.s3_bucket)
    logger.info("S3 prefix: %s", args.s3_prefix)

    # Download and import reward function
    # TODO: replace 'agent' with name of each agent for multi-agent training
    reward_function_file = RewardFunction(
        bucket=args.s3_bucket,
        s3_key=args.reward_file_s3_key,
        region_name=args.aws_region,
        local_path=REWARD_FUCTION_LOCAL_PATH_FORMAT.format('agent'))
    reward_function = reward_function_file.get_reward_function()

    # Instantiate Cameras
    configure_camera(namespaces=['racecar'])

    preset_file_success, _ = download_custom_files_if_present(s3_bucket=args.s3_bucket,
                                                              s3_prefix=args.s3_prefix,
                                                              aws_region=args.aws_region)

    # download model metadata
    # TODO: replace 'agent' with name of each agent
    model_metadata = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=args.model_metadata_s3_key,
        region_name=args.aws_region,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_info = model_metadata.get_model_metadata_info()
    version = model_metadata_info[ModelMetadataKeys.VERSION.value]

    agent_config = {
        'model_metadata': model_metadata,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value: LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value: VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value: STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
                utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
                utils.str2bool(rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.MODEL_METADATA.value: model_metadata,
            ConfigParams.REWARD.value: reward_function,
            ConfigParams.AGENT_NAME.value: 'racecar',
            ConfigParams.VERSION.value: version,
            ConfigParams.NUMBER_OF_RESETS.value: args.number_of_resets,
            ConfigParams.PENALTY_SECONDS.value: args.penalty_seconds,
            ConfigParams.NUMBER_OF_TRIALS.value: None,
            ConfigParams.IS_CONTINUOUS.value: args.is_continuous,
            ConfigParams.RACE_TYPE.value: args.race_type,
            ConfigParams.COLLISION_PENALTY.value: args.collision_penalty,
            ConfigParams.OFF_TRACK_PENALTY.value: args.off_track_penalty,
        },
    }

    #! TODO each agent should have own s3 bucket
    metrics_key = rospy.get_param('METRICS_S3_OBJECT_KEY')
    if args.num_workers > 1 and args.rollout_idx > 0:
        # Workers other than index 0 get "_<rollout_idx>" inserted before the extension.
        key_tuple = os.path.splitext(metrics_key)
        metrics_key = "{}_{}{}".format(key_tuple[0], str(args.rollout_idx), key_tuple[1])
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value: rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value: metrics_key,
        MetricsS3Keys.REGION.value: rospy.get_param('AWS_REGION'),
    }

    run_phase_subject = RunPhaseSubject()

    agent_list = list()

    # TODO: replace agent for multi agent training
    # checkpoint s3 instance
    # TODO replace agent with agent_0 and so on for multiagent case
    checkpoint = Checkpoint(bucket=args.s3_bucket,
                            s3_prefix=args.s3_prefix,
                            region_name=args.aws_region,
                            agent_name='agent',
                            checkpoint_dir=args.checkpoint_dir)

    agent_list.append(
        create_rollout_agent(
            agent_config,
            TrainingMetrics(agent_name='agent',
                            s3_dict_metrics=metrics_s3_config,
                            deepracer_checkpoint_json=checkpoint.deepracer_checkpoint_json,
                            ckpnt_dir=os.path.join(args.checkpoint_dir, 'agent'),
                            run_phase_sink=run_phase_subject,
                            use_model_picker=(args.rollout_idx == 0)),
            run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())

    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)

    aws_region = rospy.get_param('AWS_REGION', args.aws_region)
    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    # Only the worker with rollout_idx 0 reads the MP4 bucket; all others get None.
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET', None) if args.rollout_idx == 0 else None
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        if args.num_workers > 1:
            simtrace_s3_object_prefix = os.path.join(simtrace_s3_object_prefix,
                                                     str(args.rollout_idx))
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')

    simtrace_video_s3_writers = []
    # TODO: replace 'agent' with 'agent_0' for multi agent training and
    # mp4_s3_object_prefix, mp4_s3_bucket will be a list, so need to access with index
    if simtrace_s3_bucket:
        simtrace_video_s3_writers.append(
            SimtraceVideo(upload_type=SimtraceVideoNames.SIMTRACE_TRAINING.value,
                          bucket=simtrace_s3_bucket,
                          s3_prefix=simtrace_s3_object_prefix,
                          region_name=aws_region,
                          local_path=SIMTRACE_TRAINING_LOCAL_PATH_FORMAT.format('agent')))
    if mp4_s3_bucket:
        simtrace_video_s3_writers.extend([
            SimtraceVideo(upload_type=SimtraceVideoNames.PIP.value,
                          bucket=mp4_s3_bucket,
                          s3_prefix=mp4_s3_object_prefix,
                          region_name=aws_region,
                          local_path=CAMERA_PIP_MP4_LOCAL_PATH_FORMAT.format('agent')),
            SimtraceVideo(upload_type=SimtraceVideoNames.DEGREE45.value,
                          bucket=mp4_s3_bucket,
                          s3_prefix=mp4_s3_object_prefix,
                          region_name=aws_region,
                          local_path=CAMERA_45DEGREE_LOCAL_PATH_FORMAT.format('agent')),
            SimtraceVideo(upload_type=SimtraceVideoNames.TOPVIEW.value,
                          bucket=mp4_s3_bucket,
                          s3_prefix=mp4_s3_object_prefix,
                          region_name=aws_region,
                          local_path=CAMERA_TOPVIEW_LOCAL_PATH_FORMAT.format('agent')),
        ])

    # TODO: replace 'agent' with specific agent name for multi agent training
    ip_config = IpConfig(bucket=args.s3_bucket,
                         s3_prefix=args.s3_prefix,
                         region_name=args.aws_region,
                         local_path=IP_ADDRESS_LOCAL_PATH.format('agent'))
    redis_ip = ip_config.get_ip_config()

    # Download hyperparameters from SageMaker shared s3 bucket
    # TODO: replace 'agent' with name of each agent
    hyperparameters = Hyperparameters(
        bucket=args.s3_bucket,
        s3_key=get_s3_key(args.s3_prefix, HYPERPARAMETER_S3_POSTFIX),
        region_name=args.aws_region,
        local_path=HYPERPARAMETER_LOCAL_PATH_FORMAT.format('agent'))
    sm_hyperparams_dict = hyperparameters.get_hyperparameters_dict()

    enable_domain_randomization = utils.str2bool(
        rospy.get_param('ENABLE_DOMAIN_RANDOMIZATION', False))

    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location, ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=run_phase_subject,
            enable_domain_randomization=enable_domain_randomization,
            pause_physics=pause_physics,
            unpause_physics=unpause_physics)

    # If num_episodes_between_training is smaller than num_workers then cancel worker early.
    episode_steps_per_rollout = \
        graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    # Reduce number of workers if allocated more than num_episodes_between_training
    if args.num_workers > episode_steps_per_rollout:
        logger.info("Excess worker allocated. Reducing from {} to {}...".format(
            args.num_workers, episode_steps_per_rollout))
        args.num_workers = episode_steps_per_rollout
    if args.rollout_idx >= episode_steps_per_rollout or args.rollout_idx >= args.num_workers:
        err_msg_format = "Exiting excess worker..."
        err_msg_format += "(rollout_idx[{}] >= num_workers[{}] or num_episodes_between_training[{}])"
        logger.info(err_msg_format.format(args.rollout_idx,
                                          args.num_workers,
                                          episode_steps_per_rollout))
        # Close the down the job
        # NOTE(review): execution continues past this call; presumably cancelling the
        # simulation job tears this process down - confirm, otherwise an early return is needed.
        utils.cancel_simulation_job()

    # The --redis_port argument is honoured here (its default is 6379, the value that
    # used to be hard-coded); the address still comes from the downloaded ip config.
    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=args.redis_port,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix,
        num_workers=args.num_workers,
        rollout_idx=args.rollout_idx)

    graph_manager.memory_backend_params = memory_backend_params

    checkpoint_dict = {'agent': checkpoint}
    ds_params_instance = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict)

    graph_manager.data_store = S3BotoDataStore(ds_params_instance, graph_manager)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   num_workers=args.num_workers,
                   rollout_idx=args.rollout_idx,
                   task_parameters=task_parameters,
                   simtrace_video_s3_writers=simtrace_video_s3_writers,
                   pause_physics=pause_physics,
                   unpause_physics=unpause_physics)
def evaluation_worker(graph_manager, number_of_trials, task_parameters, simtrace_video_s3_writers,
                      is_continuous, park_positions, race_type, pause_physics, unpause_physics):
    """ Evaluation worker function

    Arguments:
        graph_manager(MultiAgentGraphManager): Graph manager of multiagent graph manager
        number_of_trials(int): Number of trials you want to run the evaluation
        task_parameters(TaskParameters): Information of the checkpoint, gpu/cpu,
            framework etc of rlcoach
        simtrace_video_s3_writers(list): Information to upload to the S3 bucket all the
            simtrace and mp4
        is_continuous(bool): The termination condition for the car
        park_positions(list of tuple): list of (x, y) for cars to park at
        race_type (str): race type
        pause_physics (callable or None): service proxy called with an EmptyRequest to
            pause the physics. When None, a proxy for '/gazebo/pause_physics_dr' is created.
        unpause_physics (callable or None): service proxy called with an EmptyRequest to
            unpause the physics. When None, a proxy for '/gazebo/unpause_physics_dr' is created.
    """
    # Collect profiler information only IS_PROFILER_ON is true
    with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                        s3_prefix=PROFILER_S3_PREFIX,
                        output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                        enable_profiling=IS_PROFILER_ON):
        subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(), list()
        subscribe_to_save_mp4, unsubscribe_from_save_mp4 = list(), list()
        for agent_param in graph_manager.agents_params:
            # An agent name without "_" maps to 'racecar'; otherwise the second
            # "_"-separated token is used as the racecar suffix.
            name_parts = agent_param.name.split("_")
            racecar_name = 'racecar' if len(name_parts) == 1 \
                else "racecar_{}".format(name_parts[1])
            subscribe_to_save_mp4_topic.append(
                "/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name))
            unsubscribe_from_save_mp4_topic.append(
                "/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name))
        graph_manager.data_store.wait_for_checkpoints()
        graph_manager.data_store.modify_checkpoint_variables()

        # wait for the required cancel services to become available
        if race_type != RaceType.F1.value:
            # TODO: Since we are not running Grand Prix in RoboMaker,
            # we are opting out from waiting for RoboMaker's cancel job service
            # in case of Grand Prix execution.
            # Otherwise, SimApp will hang as service will never come alive.
            #
            # If we don't depend on RoboMaker anymore in the future,
            # we need to remove below line, or do a better job to figure out
            # whether we are running on RoboMaker or not to decide whether
            # we should wait for below service or not.
            rospy.wait_for_service('/robomaker/job/cancel')

        # Make the clients that will allow us to pause and unpause the physics
        rospy.wait_for_service('/gazebo/pause_physics_dr')
        rospy.wait_for_service('/gazebo/unpause_physics_dr')
        # Use the proxies handed in by the caller; previously they were always
        # overwritten, which made the two parameters dead.
        if pause_physics is None:
            pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
        if unpause_physics is None:
            unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)

        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            rospy.wait_for_service(mp4_sub)
            rospy.wait_for_service(mp4_unsub)
        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            subscribe_to_save_mp4.append(ServiceProxyWrapper(mp4_sub, Empty))
            # Unsubscribe calls are wrapped in threads so they can be issued
            # concurrently and joined once the evaluation is over.
            unsubscribe_from_save_mp4.append(
                Thread(target=ServiceProxyWrapper(mp4_unsub, Empty),
                       args=(EmptyRequest(), )))

        graph_manager.create_graph(task_parameters=task_parameters,
                                   stop_physics=pause_physics,
                                   start_physics=unpause_physics,
                                   empty_service_call=EmptyRequest)
        logger.info("Graph manager successfully created the graph: Unpausing physics")
        unpause_physics(EmptyRequest())

        # The MP4 bucket parameter doubles as the "save mp4" switch.
        is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None)
        if is_save_mp4_enabled:
            for subscribe_mp4 in subscribe_to_save_mp4:
                subscribe_mp4(EmptyRequest())

        configure_environment_randomizer()
        track_data = TrackData.get_instance()

        # Before each evaluation episode (single lap for non-continuous race and complete race for
        # continuous race), a new copy of park_positions needs to be loaded into track_data because
        # a park position will be pop from park_positions when a racer car need to be parked.
        # NOTE(review): no copy is made here; presumably TrackData's park_positions
        # setter copies the value - confirm.
        if is_continuous:
            track_data.park_positions = park_positions
            graph_manager.evaluate(EnvironmentSteps(1))
        else:
            for _ in range(number_of_trials):
                track_data.park_positions = park_positions
                graph_manager.evaluate(EnvironmentSteps(1))
        if is_save_mp4_enabled:
            for unsubscribe_mp4 in unsubscribe_from_save_mp4:
                unsubscribe_mp4.start()
            for unsubscribe_mp4 in unsubscribe_from_save_mp4:
                unsubscribe_mp4.join()
        # upload simtrace and mp4 into s3 bucket
        for s3_writer in simtrace_video_s3_writers:
            s3_writer.persist(utils.get_s3_kms_extra_args())
        time.sleep(1)
        pause_physics(EmptyRequest())

    # NOTE(review): placed after the profiler context has exited - confirm against
    # the original layout.
    if race_type != RaceType.F1.value:
        # Close the down the job
        utils.cancel_simulation_job()
def evaluation_worker(graph_manager, number_of_trials, task_parameters, simtrace_video_s3_writers,
                      is_continuous, park_positions):
    """ Run the evaluation episodes and then cancel the simulation job.

    Arguments:
        graph_manager(MultiAgentGraphManager): graph manager whose graph is created and evaluated
        number_of_trials(int): number of evaluation episodes for a non-continuous race
        task_parameters(TaskParameters): task parameters handed to graph_manager.create_graph
        simtrace_video_s3_writers(list): writers whose persist() is called once the
            evaluation is done (simtrace and mp4 upload)
        is_continuous(bool): when True a single evaluation call is made instead of
            number_of_trials calls
        park_positions(list of tuple): list of (x, y) for cars to park at
    """
    # Profiling is only active when IS_PROFILER_ON is set
    with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                        s3_prefix=PROFILER_S3_PREFIX,
                        output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                        enable_profiling=IS_PROFILER_ON):
        # Derive the per-racecar save_mp4 service names from the agent names.
        sub_topics = []
        unsub_topics = []
        for params in graph_manager.agents_params:
            tokens = params.name.split("_")
            if len(tokens) == 1:
                car = 'racecar'
            else:
                car = "racecar_{}".format(tokens[1])
            sub_topics.append("/{}/save_mp4/subscribe_to_save_mp4".format(car))
            unsub_topics.append("/{}/save_mp4/unsubscribe_from_save_mp4".format(car))

        data_store = graph_manager.data_store
        data_store.wait_for_checkpoints()
        data_store.modify_checkpoint_variables()

        # Physics pause / unpause clients
        rospy.wait_for_service('/gazebo/pause_physics_dr')
        rospy.wait_for_service('/gazebo/unpause_physics_dr')
        pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
        unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)

        # First make sure every save_mp4 service is up, then build the proxies.
        topic_pairs = list(zip(sub_topics, unsub_topics))
        for sub_topic, unsub_topic in topic_pairs:
            rospy.wait_for_service(sub_topic)
            rospy.wait_for_service(unsub_topic)
        mp4_subscribers = []
        mp4_unsubscribers = []
        for sub_topic, unsub_topic in topic_pairs:
            mp4_subscribers.append(ServiceProxyWrapper(sub_topic, Empty))
            mp4_unsubscribers.append(ServiceProxyWrapper(unsub_topic, Empty))

        graph_manager.create_graph(task_parameters=task_parameters,
                                   stop_physics=pause_physics,
                                   start_physics=unpause_physics,
                                   empty_service_call=EmptyRequest)
        logger.info("Graph manager successfully created the graph: Unpausing physics")
        unpause_physics(EmptyRequest())

        # A configured MP4 bucket is what turns mp4 saving on.
        is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None)
        if is_save_mp4_enabled:
            for subscribe in mp4_subscribers:
                subscribe(EmptyRequest())

        configure_environment_randomizer()
        track_data = TrackData.get_instance()

        # park_positions is re-assigned to track_data before every evaluation episode
        # (one episode for a continuous race, number_of_trials episodes otherwise)
        # because park positions get popped while cars are being parked.
        episode_count = 1 if is_continuous else number_of_trials
        for _ in range(episode_count):
            track_data.park_positions = park_positions
            graph_manager.evaluate(EnvironmentSteps(1))

        if is_save_mp4_enabled:
            for unsubscribe in mp4_unsubscribers:
                unsubscribe(EmptyRequest())

        # upload simtrace and mp4 into s3 bucket
        for writer in simtrace_video_s3_writers:
            writer.persist(utils.get_s3_kms_extra_args())
        time.sleep(1)
        pause_physics(EmptyRequest())

    # Close the down the job
    # NOTE(review): placed after the profiler context has exited - confirm against
    # the original layout.
    job_arn = os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN')
    utils.cancel_simulation_job(job_arn, rospy.get_param('AWS_REGION'))
def main():
    """ Main function for virtual event manager

    Parses the command line arguments (defaults are read from ROS parameters), builds a
    VirtualEventManager and keeps polling for / running races until the manager reports
    the end of the event, then cancels the simulation job.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--queue_url',
                        help='the sqs queue url to receive next racer information',
                        type=str,
                        default=str(rospy.get_param("SQS_QUEUE_URL", "sqs_queue_url")))
    parser.add_argument('--race_duration',
                        help='the length of the race in seconds.',
                        type=int,
                        default=int(rospy.get_param("RACE_DURATION", DEFAULT_RACE_DURATION)))
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--number_of_trials',
                        help='(integer) Number of trials',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_TRIALS", 3)))
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 2.0)))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY", 2.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY", 5.0)))
    # argparse's type=bool turns every non-empty string (even "False") into True,
    # so the value is parsed with the same helper that is used for the default.
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=utils.str2bool,
                        default=utils.str2bool(rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--body_shell_type',
                        help='(string) body shell type',
                        type=str,
                        default=rospy.get_param("BODY_SHELL_TYPE", "deepracer"))
    args = parser.parse_args()

    manager = VirtualEventManager(queue_url=args.queue_url,
                                  aws_region=args.aws_region,
                                  race_duration=args.race_duration,
                                  number_of_trials=args.number_of_trials,
                                  number_of_resets=args.number_of_resets,
                                  penalty_seconds=args.penalty_seconds,
                                  off_track_penalty=args.off_track_penalty,
                                  collision_penalty=args.collision_penalty,
                                  is_continuous=args.is_continuous,
                                  race_type=args.race_type,
                                  body_shell_type=args.body_shell_type)
    while True:
        # poll for next racer
        if not manager.is_event_end and manager.current_racer is None:
            LOG.info("[virtual event worker] polling for next racer.")
            manager.poll_next_racer()

        # if event end signal received, break out loop and finish the job
        if manager.is_event_end:
            LOG.info("[virtual event worker] received event end.")
            break

        # Setting up the race environment
        if manager.setup_race():
            # proceed with start and finish race only if setup is successful.
            # Start race
            manager.start_race()
            # Finish race
            manager.finish_race()

    utils.cancel_simulation_job(os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                                rospy.get_param('AWS_REGION'))