def _save_tf_model_to_store(self, checkpoint):
    # rl coach .coach_checkpoint state file
    state_file = checkpoint.rl_coach_checkpoint.coach_checkpoint_state_file
    # upload tensorflow models
    checkpoint.tensorflow_model.persist(
        coach_checkpoint_state_file=state_file,
        s3_kms_extra_args=get_s3_kms_extra_args())
    # persist rl coach checkpoint
    checkpoint.rl_coach_checkpoint.persist(
        s3_kms_extra_args=get_s3_kms_extra_args())
    # Upload the frozen graph which is used for deployment
    if self.graph_manager:
        checkpoint.tensorflow_model.persist_tensorflow_frozen_graph(
            agent_name=checkpoint.agent_name,
            graph_manager=self.graph_manager,
            coach_checkpoint_state_file=state_file,
            best_checkpoint_number=checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint_number(),
            last_checkpoint_number=checkpoint.deepracer_checkpoint_json.get_deepracer_last_checkpoint_number(),
            s3_kms_extra_args=get_s3_kms_extra_args())
    # Clean up old checkpoints
    checkpoint.tensorflow_model.delete(
        coach_checkpoint_state_file=state_file,
        best_checkpoint=checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint())
def flush_finished(self):
    """Upload rl coach .finished file."""
    try:
        # remove lock file if it exists
        self.syncfile_lock.delete()
        # acquire lock
        self.syncfile_lock.persist(s3_kms_extra_args=get_s3_kms_extra_args())
        for _, checkpoint in self.params.checkpoint_dict.items():
            # upload .finished
            checkpoint.syncfile_finished.persist(s3_kms_extra_args=get_s3_kms_extra_args())
        # release lock by deleting it
        self.syncfile_lock.delete()
    except botocore.exceptions.ClientError:
        log_and_exit("Unable to upload .finished",
                     SIMAPP_S3_DATA_STORE_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)
    except Exception as ex:
        log_and_exit("Exception in uploading .finished file: {}".format(ex),
                     SIMAPP_S3_DATA_STORE_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
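# The delete -> persist -> delete dance around syncfile_lock recurs in
# flush_finished, signal_ready, and save_to_store in this class. A minimal
# sketch of how that S3 lock-file protocol could be factored into a context
# manager; s3_syncfile_lock is a hypothetical helper, not part of the original
# code:
from contextlib import contextmanager

@contextmanager
def s3_syncfile_lock(syncfile_lock, s3_kms_extra_args):
    """Acquire the S3 lock file, yield to the caller, then release the lock."""
    # remove a stale lock if one exists, then write ours to acquire it
    syncfile_lock.delete()
    syncfile_lock.persist(s3_kms_extra_args=s3_kms_extra_args)
    try:
        yield
    finally:
        # release the lock by deleting the file
        syncfile_lock.delete()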
def _validate(graph_manager, task_parameters, transitions, s3_bucket, s3_prefix, aws_region):
    checkpoint = graph_manager.data_store.params.checkpoint_dict['agent']
    checkpoint_dir = task_parameters.checkpoint_restore_path
    graph_manager.data_store.wait_for_checkpoints()

    # validate last checkpoint
    last_model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_last_checkpoint()
    if checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=last_model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args()):
        screen.log_title(" Validating Last Checkpoint: {}".format(last_model_checkpoint_name))
        # load the last rl coach checkpoint from store
        graph_manager.data_store.load_from_store()
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions)
        screen.log_title(" emulate_act_on_trainer on Last Checkpoint completed!")

        # validate best checkpoint: best checkpoint might not exist
        best_model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()
        if checkpoint.rl_coach_checkpoint.update(
                model_checkpoint_name=best_model_checkpoint_name,
                s3_kms_extra_args=utils.get_s3_kms_extra_args()):
            screen.log_title(" Validating Best Checkpoint: {}".format(best_model_checkpoint_name))
            # load the best rl coach checkpoint from store
            graph_manager.data_store.load_from_store()
            graph_manager.restore_checkpoint()
            screen.log_title(" Start emulate_act_on_trainer on Best Checkpoint")
            graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions)
            screen.log_title(" emulate_act_on_trainer on Best Checkpoint completed!")
        else:
            screen.log_title(" No Best Checkpoint to validate.")
    else:
        screen.log_title(" Validating Last Checkpoint")
        # load the last rl coach checkpoint from store
        graph_manager.data_store.load_from_store()
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions)
        screen.log_title(" emulate_act_on_trainer on Last Checkpoint completed!")
    screen.log_title(" Validation completed!")
def __init__(self, params: S3BotoDataStoreParameters,
             graph_manager: MultiAgentGraphManager,
             ignore_lock: bool = False):
    self.params = params
    self.key_prefixes = dict()
    self.ip_data_keys = dict()
    self.ip_done_keys = dict()
    self.preset_data_keys = dict()
    self.delete_queues = dict()
    for agent_key, s3_folder in self.params.s3_folders.items():
        self.key_prefixes[agent_key] = os.path.join(s3_folder, "model")
        self.ip_data_keys[agent_key] = os.path.join(s3_folder, "ip/ip.json")
        self.ip_done_keys[agent_key] = os.path.join(s3_folder, "ip/done")
        self.preset_data_keys[agent_key] = os.path.join(s3_folder, "presets/preset.py")
        self.delete_queues[agent_key] = queue.Queue()
    if not graph_manager:
        log_and_exit("None type for graph manager",
                     SIMAPP_S3_DATA_STORE_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    self.graph_manager = graph_manager
    self.ignore_lock = ignore_lock
    self.s3_extra_args = get_s3_kms_extra_args()
def write_metrics_to_s3(bucket, key, region, metrics):
    '''Helper method that uploads the desired metrics to s3

    bucket - String with S3 bucket where metrics should be written
    key - String with S3 object key where metrics should be written
    region - String with aws region
    metrics - Dictionary with metrics to write to s3
    '''
    try:
        s3_extra_args = get_s3_kms_extra_args()
        session = boto3.session.Session()
        s3_client = session.client('s3', region_name=region, config=get_boto_config())
        s3_client.put_object(Bucket=bucket, Key=key,
                             Body=bytes(json.dumps(metrics), encoding='utf-8'),
                             **s3_extra_args)
    except botocore.exceptions.ClientError as err:
        log_and_exit("Unable to write metrics to s3: bucket: {}, error: {}"
                     .format(bucket, err.response['Error']['Code']),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)
    except Exception as ex:
        log_and_exit("Unable to write metrics to s3, exception: {}".format(ex),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
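# A minimal usage sketch for write_metrics_to_s3; the bucket, key, and payload
# below are hypothetical placeholders, not values from the original code:
if __name__ == '__main__':
    sample_metrics = {'metrics': [{'episode': 1, 'completion_percentage': 42.0}]}
    write_metrics_to_s3(bucket='my-deepracer-bucket',
                        key='metrics/training/training-metrics.json',
                        region='us-east-1',
                        metrics=sample_metrics)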
def make_compatible(self, syncfile_ready):
    """Update the coach checkpoint file to make it compatible.

    Args:
        syncfile_ready (RlCoachSyncFile): RlCoachSyncFile class instance for .ready file
    """
    try:
        # download old coach checkpoint
        self._s3_client.download_file(bucket=self._bucket,
                                      s3_key=self._old_s3_key,
                                      local_path=self._old_local_path)
        # parse old coach checkpoint
        with open(self._old_local_path) as old_coach_checkpoint_file:
            coach_checkpoint_value = re.findall(
                r'"(.*?)"', old_coach_checkpoint_file.readline())
        if len(coach_checkpoint_value) != 1:
            log_and_exit("No checkpoint file found",
                         SIMAPP_SIMULATION_WORKER_EXCEPTION,
                         SIMAPP_EVENT_ERROR_CODE_400)
        # remove old local coach checkpoint
        os.remove(self._old_local_path)
        # upload ready file so that the system can grab the checkpoints
        syncfile_ready.persist(s3_kms_extra_args=get_s3_kms_extra_args())
        # write new temp coach checkpoint file
        with open(self._temp_local_path, "w+") as new_coach_checkpoint_file:
            new_coach_checkpoint_file.write(coach_checkpoint_value[0])
        # upload new temp coach checkpoint file
        self._persist_temp_coach_checkpoint(s3_kms_extra_args=get_s3_kms_extra_args())
        # remove new temp local coach checkpoint
        os.remove(self._temp_local_path)
    except botocore.exceptions.ClientError as e:
        log_and_exit("Unable to make model compatible: {}, {}"
                     .format(self._bucket, e.response["Error"]["Code"]),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)
    except Exception as e:
        log_and_exit("Exception in making model compatible: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
def update(self, data):
    self._is_eval_ = data != RunPhase.TRAIN

    if not self._is_eval_ and self._use_model_picker:
        if self._eval_stats_dict_['chkpnt_name'] is None:
            self._eval_stats_dict_['chkpnt_name'] = self._checkpoint_state_.read().name

        self._eval_trials_ = 0
        mean_metric = statistics.mean(self._current_eval_best_model_metric_list_) \
            if self._current_eval_best_model_metric_list_ else None
        msg_format = '[BestModelSelection] Number of evaluations: {} Evaluation episode {}: {}'
        LOGGER.info(msg_format.format(len(self._current_eval_best_model_metric_list_),
                                      self._best_model_metric_type.value,
                                      self._current_eval_best_model_metric_list_))
        LOGGER.info('[BestModelSelection] Evaluation episode {} mean: {}'.format(
            self._best_model_metric_type.value, mean_metric))
        self._current_eval_best_model_metric_list_.clear()

        time_stamp = self._current_sim_time
        if self._eval_stats_dict_['avg_eval_metric'] is None or \
                mean_metric >= self._eval_stats_dict_['avg_eval_metric']:
            msg_format = '[BestModelSelection] current {0} mean: {1} >= best {0} mean: {2}'
            LOGGER.info(msg_format.format(self._best_model_metric_type.value,
                                          mean_metric,
                                          self._eval_stats_dict_['avg_eval_metric']))
            msg_format = '[BestModelSelection] Updating the best checkpoint to "{}" from "{}".'
            LOGGER.info(msg_format.format(self._eval_stats_dict_['chkpnt_name'],
                                          self._best_chkpnt_stats['name']))
            self._eval_stats_dict_['avg_eval_metric'] = mean_metric
            self._best_chkpnt_stats = {'name': self._eval_stats_dict_['chkpnt_name'],
                                       'avg_eval_metric': mean_metric,
                                       'time_stamp': time_stamp}
        last_chkpnt_stats = {'name': self._eval_stats_dict_['chkpnt_name'],
                             'avg_eval_metric': mean_metric,
                             'time_stamp': time_stamp}
        self._deepracer_checkpoint_json.persist(
            body=json.dumps({BEST_CHECKPOINT: self._best_chkpnt_stats,
                             LAST_CHECKPOINT: last_chkpnt_stats}),
            s3_kms_extra_args=get_s3_kms_extra_args())
        # Update the checkpoint name to the new checkpoint being used for training that will
        # then be evaluated. Note this class gets notified when the system is put into a
        # training phase and assumes that a training phase only starts when a new checkpoint
        # is available.
        self._eval_stats_dict_['chkpnt_name'] = self._checkpoint_state_.read().name
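# For reference, the deepracer checkpoint json persisted above has the shape
# sketched below (values are illustrative; this assumes the BEST_CHECKPOINT and
# LAST_CHECKPOINT constants resolve to "best_checkpoint" and "last_checkpoint"):
#
# {
#     "best_checkpoint": {"name": "55_Step-8775.ckpt",
#                         "avg_eval_metric": 100.0,
#                         "time_stamp": 1612345678.9},
#     "last_checkpoint": {"name": "60_Step-9500.ckpt",
#                         "avg_eval_metric": 87.5,
#                         "time_stamp": 1612345799.1}
# }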
def __init__(self, bucket=None, s3_prefix=None, aws_region=None):
    self.aws_region = aws_region
    self.bucket = bucket
    self.s3_prefix = s3_prefix
    self.config_key = os.path.normpath(s3_prefix + "/ip/ip.json")
    self.hyperparameters_key = os.path.normpath(s3_prefix + "/ip/hyperparameters.json")
    self.done_file_key = os.path.normpath(s3_prefix + "/ip/done")
    self.model_checkpoints_prefix = os.path.normpath(s3_prefix + "/model/") + "/"
    self.s3_extra_args = get_s3_kms_extra_args()
    LOG.info("Initializing SageS3Client...")
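# A quick illustration of the key construction above: os.path.normpath strips a
# trailing slash (on POSIX paths), which is why "/" is re-appended for the
# checkpoints prefix. With the hypothetical default prefix "sagemaker":
#
#   os.path.normpath("sagemaker" + "/ip/ip.json")    -> "sagemaker/ip/ip.json"
#   os.path.normpath("sagemaker" + "/model/")        -> "sagemaker/model"
#   os.path.normpath("sagemaker" + "/model/") + "/"  -> "sagemaker/model/"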
def __init__(self, s3_bucket, s3_key, s3_endpoint_url=None):
    logger.info("simtrace_data init")
    DeepRacerRacetrackSimTraceData.__instance = self
    self.data_state = SIMTRACE_DATA_UPLOAD_UNKNOWN_STATE
    self.s3_bucket = s3_bucket
    self.s3_object_key = s3_key
    self.s3_endpoint_url = s3_endpoint_url
    if s3_key != "None":
        self.setup_mutipart_upload()
    self.s3_extra_args = utils.get_s3_kms_extra_args()
def upload_episode_metrics(self):
    # TODO: The service team can't handle a "version" key in evaluation metrics because
    # unknown keys in the json are rejected. The training metrics change works fine since
    # metrics.json is loaded directly in the front-end console, while the evaluation
    # metrics file is loaded through the service API, which can't handle keys that are
    # not defined in the service. Keeping evaluation metrics as-is (without the version
    # key) since there is no change in the format anyway, but we should change this in
    # the future to match the training metrics format.
    json_metrics = json.dumps({'metrics': self._metrics_})
    self._s3_metrics.persist(body=json_metrics,
                             s3_kms_extra_args=get_s3_kms_extra_args())
def _update_sector_times(self, info_dict, sector_idx):
    """Update the current personal, best personal, and best session sector times. If
    there is a new best session sector time, upload it to s3 in a separate thread.

    Args:
        info_dict(dict): information dictionary containing all necessary info
                         to update sector times
        sector_idx(int): sector index; for sector1 the index is 0, and so on

    Returns:
        dict: updated info_dict
    """
    sector = SECTOR_X_FORMAT.format(sector_idx + 1)
    curr_eval_time = info_dict[VirtualEventMP4Params.TOTAL_EVAL_SECONDS.value]
    last_eval_time = info_dict[VirtualEventMP4Params.LAST_EVAL_SECONDS.value]
    # get sector_time_dict
    sector_time_dict = info_dict[VirtualEventMP4Params.SECTOR_TIMES.value]
    sector_time_dict[self._current_personal_format.format(sector)] = \
        curr_eval_time - last_eval_time
    info_dict[VirtualEventMP4Params.LAST_EVAL_SECONDS.value] = curr_eval_time
    if sector_time_dict[self._best_session_format.format(sector)] is not None:
        # update sector best personal time
        if sector_time_dict[self._current_personal_format.format(sector)] <= \
                sector_time_dict[self._best_personal_format.format(sector)]:
            sector_time_dict[self._best_personal_format.format(sector)] = \
                sector_time_dict[self._current_personal_format.format(sector)]
        # update sector best session time
        if sector_time_dict[self._current_personal_format.format(sector)] <= \
                sector_time_dict[self._best_session_format.format(sector)]:
            sector_time_dict[self._best_session_format.format(sector)] = \
                sector_time_dict[self._current_personal_format.format(sector)]
            # persist the updated best session sector time together with the other
            # sectors into s3 for robomaker crash backup, in a new thread
            Thread(target=self._virtual_event_best_sector_time.persist,
                   args=(json.dumps({
                       SECTOR_X_FORMAT.format(idx + 1):
                       sector_time_dict[self._best_session_format.format(
                           SECTOR_X_FORMAT.format(idx + 1))]
                       for idx in range(self._total_sectors)}),
                       get_s3_kms_extra_args())).start()
    # update sector_time_dict to the latest
    info_dict[VirtualEventMP4Params.SECTOR_TIMES.value].update(sector_time_dict)
    return info_dict
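# A small worked example of the sector-time arithmetic above, with hypothetical
# numbers: if TOTAL_EVAL_SECONDS is 23.4 when the car crosses the end of
# sector2 and LAST_EVAL_SECONDS was 15.1 (recorded when it finished sector1),
# the current personal time stored for sector2 is 23.4 - 15.1 = 8.3 seconds;
# it then overwrites the best personal and/or best session entries only when
# it is <= them, and only a new best session time triggers the S3 upload.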
def signal_ready(self):
    '''upload rl coach .ready file'''
    try:
        # remove lock file if it exists
        self.syncfile_lock.delete()
        # acquire lock
        self.syncfile_lock.persist(s3_kms_extra_args=get_s3_kms_extra_args())
        for _, checkpoint in self.params.checkpoint_dict.items():
            # upload .ready
            checkpoint.syncfile_ready.persist(s3_kms_extra_args=get_s3_kms_extra_args())
        # release lock by deleting it
        self.syncfile_lock.delete()
    except botocore.exceptions.ClientError:
        log_and_exit("Unable to upload .ready",
                     SIMAPP_S3_DATA_STORE_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)
    except Exception as ex:
        log_and_exit("Exception in uploading .ready file: {}".format(ex),
                     SIMAPP_S3_DATA_STORE_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
def update(self, data):
    self._is_eval_ = data != RunPhase.TRAIN

    if not self._is_eval_ and self._use_model_picker:
        if self._eval_stats_dict_['chkpnt_name'] is None:
            self._eval_stats_dict_['chkpnt_name'] = self._checkpoint_state_.read().name

        self._eval_trials_ = 0
        mean_pct = statistics.mean(self._current_eval_pct_list_
                                   if self._current_eval_pct_list_ else [0.0])
        LOGGER.info('Number of evaluations: {} Evaluation progresses: {}'.format(
            len(self._current_eval_pct_list_), self._current_eval_pct_list_))
        LOGGER.info('Evaluation progresses mean: {}'.format(mean_pct))
        self._current_eval_pct_list_.clear()

        time_stamp = self._current_sim_time
        if mean_pct >= self._eval_stats_dict_['avg_comp_pct']:
            LOGGER.info('Current mean: {} >= Current best mean: {}'.format(
                mean_pct, self._eval_stats_dict_['avg_comp_pct']))
            LOGGER.info('Updating the best checkpoint to "{}" from "{}".'.format(
                self._eval_stats_dict_['chkpnt_name'], self._best_chkpnt_stats['name']))
            self._eval_stats_dict_['avg_comp_pct'] = mean_pct
            self._best_chkpnt_stats = {'name': self._eval_stats_dict_['chkpnt_name'],
                                       'avg_comp_pct': mean_pct,
                                       'time_stamp': time_stamp}
        last_chkpnt_stats = {'name': self._eval_stats_dict_['chkpnt_name'],
                             'avg_comp_pct': mean_pct,
                             'time_stamp': time_stamp}
        self._deepracer_checkpoint_json.persist(
            body=json.dumps({BEST_CHECKPOINT: self._best_chkpnt_stats,
                             LAST_CHECKPOINT: last_chkpnt_stats}),
            s3_kms_extra_args=get_s3_kms_extra_args())
        # Update the checkpoint name to the new checkpoint being used for training that will
        # then be evaluated. Note this class gets notified when the system is put into a
        # training phase and assumes that a training phase only starts when a new checkpoint
        # is available.
        self._eval_stats_dict_['chkpnt_name'] = self._checkpoint_state_.read().name
def exit_if_trainer_done(checkpoint_dir, simtrace_video_s3_writers, rollout_idx):
    """Helper method that shuts down the sim app if the trainer is done

    checkpoint_dir - directory to which the done file would be downloaded
    """
    if should_stop(checkpoint_dir):
        is_save_mp4_enabled = rospy.get_param("MP4_S3_BUCKET", None) and rollout_idx == 0
        if is_save_mp4_enabled:
            unsubscribe_from_save_mp4 = ServiceProxyWrapper(
                "/racecar/save_mp4/unsubscribe_from_save_mp4", Empty)
            unsubscribe_from_save_mp4(EmptyRequest())
        # upload simtrace and mp4 into s3 bucket
        for s3_writer in simtrace_video_s3_writers:
            s3_writer.persist(utils.get_s3_kms_extra_args())
        logger.info("Received termination signal from trainer. Goodbye.")
        simapp_exit_gracefully()
def upload_episode_metrics(self):
    json_metrics = json.dumps({'metrics': self._metrics_,
                               'version': METRICS_VERSION,
                               'best_model_metric': self._best_model_metric_type.value})
    self._s3_metrics.persist(body=json_metrics,
                             s3_kms_extra_args=get_s3_kms_extra_args())

    if self._is_eval_:
        if self._best_model_metric_type == BestModelMetricType.REWARD:
            self._current_eval_best_model_metric_list_.append(self._episode_reward_)
        else:
            self._current_eval_best_model_metric_list_.append(self._progress_)
def save_to_store(self):
    try:
        # remove lock file if it exists
        self.syncfile_lock.delete()
        # acquire lock
        self.syncfile_lock.persist(s3_kms_extra_args=get_s3_kms_extra_args())
        for _, checkpoint in self.params.checkpoint_dict.items():
            # upload tensorflow models, tensorflow frozen graph, and rl coach checkpoint
            self._save_tf_model_to_store(checkpoint)
        # release lock by deleting it
        self.syncfile_lock.delete()
    except botocore.exceptions.ClientError:
        log_and_exit("Unable to upload checkpoint",
                     SIMAPP_S3_DATA_STORE_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)
    except Exception as ex:
        log_and_exit("Exception in uploading checkpoint: {}".format(ex),
                     SIMAPP_S3_DATA_STORE_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
def main():
    """ Main function for tournament """
    try:
        # parse arguments
        s3_region = sys.argv[1]
        s3_bucket = sys.argv[2]
        s3_prefix = sys.argv[3]
        s3_yaml_name = sys.argv[4]

        # create boto3 session/client and download yaml/json file
        session = boto3.session.Session()
        s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", None)
        s3_client = session.client('s3', region_name=s3_region,
                                   endpoint_url=s3_endpoint_url,
                                   config=get_boto_config())

        yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name))
        local_yaml_path = os.path.abspath(os.path.join(os.getcwd(), s3_yaml_name))
        try:
            s3_client.download_file(Bucket=s3_bucket, Key=yaml_key,
                                    Filename=local_yaml_path)
        except Exception as e:
            log_and_exit("Failed to download yaml file: s3_bucket: {}, yaml_key: {}, {}"
                         .format(s3_bucket, yaml_key, e),
                         SIMAPP_SIMULATION_WORKER_EXCEPTION,
                         SIMAPP_EVENT_ERROR_CODE_500)

        # intermediate tournament files
        queue_pickle_name = 'tournament_candidate_queue.pkl'
        queue_pickle_s3_key = os.path.normpath(os.path.join(s3_prefix, queue_pickle_name))
        local_queue_pickle_path = os.path.abspath(os.path.join(os.getcwd(), queue_pickle_name))

        report_pickle_name = 'tournament_report.pkl'
        report_pickle_s3_key = os.path.normpath(os.path.join(s3_prefix, report_pickle_name))
        local_report_pickle_path = os.path.abspath(os.path.join(os.getcwd(), report_pickle_name))

        final_report_name = 'tournament_report.json'
        final_report_s3_key = os.path.normpath(os.path.join(s3_prefix, final_report_name))

        try:
            s3_client.download_file(Bucket=s3_bucket, Key=queue_pickle_s3_key,
                                    Filename=local_queue_pickle_path)
            s3_client.download_file(Bucket=s3_bucket, Key=report_pickle_s3_key,
                                    Filename=local_report_pickle_path)
        except Exception:
            pass

        # Get values passed in yaml files. Default values are for backward compatibility
        # and for single racecar racing.
        yaml_dict = get_yaml_dict(local_yaml_path)

        # Force the yaml parameters to lists.
        # TODO: Deprecate DISPLAY_NAME and use only RACER_NAME after cloud pushes this YAML parameter
        force_list_params = [MODEL_S3_BUCKET_YAML_KEY, MODEL_S3_PREFIX_YAML_KEY,
                             MODEL_METADATA_FILE_S3_YAML_KEY, METRICS_S3_BUCKET_YAML_KEY,
                             METRICS_S3_PREFIX_YAML_KEY, SIMTRACE_S3_BUCKET_YAML_KEY,
                             SIMTRACE_S3_PREFIX_YAML_KEY, MP4_S3_BUCKET_YAML_KEY,
                             MP4_S3_PREFIX_YAML_KEY, DISPLAY_NAME_YAML_KEY, RACER_NAME_YAML_KEY]
        for params in force_list_params:
            yaml_dict[params] = force_list(yaml_dict.get(params, None))

        # Populate the model_metadata_s3_key values to handle both training and
        # evaluation for all race formats.
        if None in yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY]:
            # MODEL_METADATA_FILE_S3_KEY not passed as part of the yaml file ==> this
            # happens during evaluation. Assume model_metadata.json is present in the
            # s3_prefix/model/ folder.
            yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY] = list()
            for s3_prefix in yaml_dict[MODEL_S3_PREFIX_YAML_KEY]:
                yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY].append(
                    os.path.join(s3_prefix, 'model/model_metadata.json'))

        # validate the yaml values
        validate_yaml_values(yaml_dict)

        if os.path.exists(local_queue_pickle_path):
            with open(local_queue_pickle_path, 'rb') as f:
                tournament_candidate_queue = pickle.load(f)
            with open(local_report_pickle_path, 'rb') as f:
                tournament_report = pickle.load(f)
            logger.info('tournament_candidate_queue loaded from existing file')
        else:
            logger.info('tournament_candidate_queue initialized')
            tournament_candidate_queue = deque()
            for agent_idx, _ in enumerate(yaml_dict[MODEL_S3_BUCKET_YAML_KEY]):
                tournament_candidate_queue.append((
                    yaml_dict[MODEL_S3_BUCKET_YAML_KEY][agent_idx],
                    yaml_dict[MODEL_S3_PREFIX_YAML_KEY][agent_idx],
                    yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY][agent_idx],
                    yaml_dict[METRICS_S3_BUCKET_YAML_KEY][agent_idx],
                    yaml_dict[METRICS_S3_PREFIX_YAML_KEY][agent_idx],
                    yaml_dict[SIMTRACE_S3_BUCKET_YAML_KEY][agent_idx],
                    yaml_dict[SIMTRACE_S3_PREFIX_YAML_KEY][agent_idx],
                    yaml_dict[MP4_S3_BUCKET_YAML_KEY][agent_idx],
                    yaml_dict[MP4_S3_PREFIX_YAML_KEY][agent_idx],
                    yaml_dict[DISPLAY_NAME_YAML_KEY][agent_idx],
                    # TODO: Deprecate DISPLAY_NAME and use only RACER_NAME without the if/else check
                    "" if None in yaml_dict[RACER_NAME_YAML_KEY]
                    else yaml_dict[RACER_NAME_YAML_KEY][agent_idx]))
            tournament_report = {"race_results": []}

        race_idx = len(tournament_report["race_results"])
        while len(tournament_candidate_queue) > 1:
            car1 = tournament_candidate_queue.popleft()
            car2 = tournament_candidate_queue.popleft()
            (car1_model_s3_bucket, car1_s3_prefix, car1_model_metadata,
             car1_metrics_bucket, car1_metrics_s3_key, car1_simtrace_bucket,
             car1_simtrace_prefix, car1_mp4_bucket, car1_mp4_prefix,
             car1_display_name, car1_racer_name) = car1
            (car2_model_s3_bucket, car2_s3_prefix, car2_model_metadata,
             car2_metrics_bucket, car2_metrics_s3_key, car2_simtrace_bucket,
             car2_simtrace_prefix, car2_mp4_bucket, car2_mp4_prefix,
             car2_display_name, car2_racer_name) = car2

            race_yaml_dict = generate_race_yaml(yaml_dict=yaml_dict, car1=car1,
                                                car2=car2, race_idx=race_idx)

            race_car_colors = RACE_CAR_COLORS
            race_model_s3_buckets = [car1_model_s3_bucket, car2_model_s3_bucket]
            race_model_metadatas = [car1_model_metadata, car2_model_metadata]

            # list of directories created
            dirs_to_delete = list()
            yaml_dir = os.path.abspath(os.path.join(os.getcwd(), str(race_idx)))
            os.makedirs(yaml_dir)
            dirs_to_delete.append(yaml_dir)

            race_yaml_path = os.path.abspath(os.path.join(yaml_dir, 'evaluation_params.yaml'))
            with open(race_yaml_path, 'w') as race_yaml_file:
                yaml.dump(race_yaml_dict, race_yaml_file)

            # list of racecar names that should include a second camera while launching
            racecars_with_stereo_cameras = list()
            # list of racecar names that should include lidar while launching
            racecars_with_lidars = list()
            # list of SimApp versions
            simapp_versions = list()
            for agent_index, model_s3_bucket in enumerate(race_model_s3_buckets):
                racecar_name = 'racecar_' + str(agent_index)
                # make a local folder with the racecar name to download the model_metadata.json
                os.makedirs(os.path.join(os.getcwd(), racecar_name))
                dirs_to_delete.append(os.path.join(os.getcwd(), racecar_name))
                local_model_metadata_path = os.path.abspath(
                    os.path.join(os.path.join(os.getcwd(), racecar_name),
                                 'model_metadata.json'))
                json_key = race_model_metadatas[agent_index]
                json_key = json_key.replace('s3://{}/'.format(model_s3_bucket), '')
                try:
                    s3_client.download_file(Bucket=model_s3_bucket, Key=json_key,
                                            Filename=local_model_metadata_path)
                except Exception as e:
                    log_and_exit("Failed to download model_metadata file: s3_bucket: {}, yaml_key: {}, {}"
                                 .format(model_s3_bucket, json_key, e),
                                 SIMAPP_SIMULATION_WORKER_EXCEPTION,
                                 SIMAPP_EVENT_ERROR_CODE_500)
                sensors, _, simapp_version = utils_parse_model_metadata.parse_model_metadata(
                    local_model_metadata_path)
                simapp_versions.append(simapp_version)
                if Input.STEREO.value in sensors:
                    racecars_with_stereo_cameras.append(racecar_name)
                if Input.LIDAR.value in sensors or Input.SECTOR_LIDAR.value in sensors:
                    racecars_with_lidars.append(racecar_name)

            cmd = [os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "tournament_race_node.py"),
                   str(race_idx),
                   race_yaml_path,
                   ','.join(racecars_with_stereo_cameras),
                   ','.join(racecars_with_lidars),
                   ','.join(race_car_colors),
                   ','.join(simapp_versions)]
            try:
                return_code, _, stderr = run_cmd(cmd_args=cmd, shell=False,
                                                 stdout=None, stderr=None)
            except KeyboardInterrupt:
                logger.info("KeyboardInterrupt raised, SimApp must be faulted! exiting...")
                return

            # retrieve the winner and append to the tournament report
            with open('race_report.pkl', 'rb') as f:
                race_report = pickle.load(f)
            race_report['race_idx'] = race_idx
            winner = car1 if race_report['winner'] == car1_display_name else car2
            logger.info("race {}'s winner: {}".format(race_idx, race_report['winner']))

            tournament_candidate_queue.append(winner)
            tournament_report["race_results"].append(race_report)

            # clean up directories created
            for dir_to_delete in dirs_to_delete:
                shutil.rmtree(dir_to_delete, ignore_errors=True)
            race_idx += 1

            s3_extra_args = get_s3_kms_extra_args()
            # persist the latest queue and report to use after job restarts
            with open(local_queue_pickle_path, 'wb') as f:
                pickle.dump(tournament_candidate_queue, f, protocol=2)
            s3_client.upload_file(Filename=local_queue_pickle_path,
                                  Bucket=s3_bucket,
                                  Key=queue_pickle_s3_key,
                                  ExtraArgs=s3_extra_args)
            with open(local_report_pickle_path, 'wb') as f:
                pickle.dump(tournament_report, f, protocol=2)
            s3_client.upload_file(Filename=local_report_pickle_path,
                                  Bucket=s3_bucket,
                                  Key=report_pickle_s3_key,
                                  ExtraArgs=s3_extra_args)

            # If there is more than one candidate left, restart the simulation job;
            # otherwise the tournament is finished, so persist the final report and
            # end the job.
            if len(tournament_candidate_queue) > 1:
                restart_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'), s3_region)
                break
            else:
                # persist the final tournament report in json format
                # and terminate the job by canceling it
                s3_client.put_object(Bucket=s3_bucket,
                                     Key=final_report_s3_key,
                                     Body=json.dumps(tournament_report),
                                     **s3_extra_args)
                cancel_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'), s3_region)
    except Exception as e:
        log_and_exit("Tournament node failed: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
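# A minimal sketch of the single-elimination flow driven by
# tournament_candidate_queue above: two racers are popped per race and only the
# winner is re-appended, so N entrants yield N - 1 races. decide_winner is a
# hypothetical stand-in for running a race (the real loop restarts the
# simulation job between races instead of looping in-process):
from collections import deque

def run_bracket(racers, decide_winner):
    bracket = deque(racers)
    while len(bracket) > 1:
        car1, car2 = bracket.popleft(), bracket.popleft()
        bracket.append(decide_winner(car1, car2))
    return bracket[0]  # tournament champion

# e.g. run_bracket(['a', 'b', 'c', 'd'], lambda x, y: x) returns 'a' after 3 races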
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk', '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('-ek', '--environment_s3_key',
                        help="(string) Name of an environment file to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument('-c', '--checkpoint_dir',
                        help='(string) Path to a folder containing a checkpoint to write the model to.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--pretrained_checkpoint_dir',
                        help='(string) Path to a folder for downloading a pre-trained model',
                        type=str,
                        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    args, _ = parser.parse_known_args()

    s3_client = S3Client(region_name=args.aws_region, max_retry_attempts=0)

    # download model metadata
    # TODO: replace 'agent' with the name of each agent
    model_metadata_download = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=args.model_metadata_s3_key,
        region_name=args.aws_region,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_info = model_metadata_download.get_model_metadata_info()
    network_type = model_metadata_info[ModelMetadataKeys.NEURAL_NETWORK.value]
    version = model_metadata_info[ModelMetadataKeys.VERSION.value]

    # upload model metadata
    model_metadata_upload = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=get_s3_key(args.s3_prefix, MODEL_METADATA_S3_POSTFIX),
        region_name=args.aws_region,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_upload.persist(s3_kms_extra_args=utils.get_s3_kms_extra_args())

    shutil.copy2(model_metadata_download.local_path, SM_MODEL_OUTPUT_DIR)

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        try:
            s3_client.download_file(bucket=args.s3_bucket,
                                    s3_key=args.preset_s3_key,
                                    local_path=preset_local_path)
            success_custom_preset = True
        except botocore.exceptions.ClientError:
            pass
        if not success_custom_preset:
            logger.info("Could not download the preset file. Using the default DeepRacer preset.")
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location, ignore_module_case=True)
            s3_client.upload_file(
                bucket=args.s3_bucket,
                s3_key=os.path.normpath("%s/presets/preset.py" % args.s3_prefix),
                local_path=preset_local_path,
                s3_kms_extra_args=utils.get_s3_kms_extra_args())
            if success_custom_preset:
                logger.info("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}

        #! TODO each agent should have its own config
        agent_config = {
            'model_metadata': model_metadata_download,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [],
                ConfigParams.VELOCITY_LIST.value: {},
                ConfigParams.STEERING_LIST.value: {},
                ConfigParams.CHANGE_START.value: None,
                ConfigParams.ALT_DIR.value: None,
                ConfigParams.MODEL_METADATA.value: model_metadata_download,
                ConfigParams.REWARD.value: None,
                ConfigParams.AGENT_NAME.value: 'racecar'
            }
        }

        agent_list = list()
        agent_list.append(create_training_agent(agent_config))

        graph_manager, robomaker_hyperparams_json = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=None,
            run_type=str(RunType.TRAINER))

        # upload hyperparameters to the SageMaker shared s3 bucket
        hyperparameters = Hyperparameters(
            bucket=args.s3_bucket,
            s3_key=get_s3_key(args.s3_prefix, HYPERPARAMETER_S3_POSTFIX),
            region_name=args.aws_region)
        hyperparameters.persist(hyperparams_json=robomaker_hyperparams_json,
                                s3_kms_extra_args=utils.get_s3_kms_extra_args())

        # attach a sample collector to graph_manager only if sample count > 0
        max_sample_count = int(sm_hyperparams_dict.get("max_sample_count", 0))
        if max_sample_count > 0:
            sample_collector = SampleCollector(
                bucket=args.s3_bucket,
                s3_prefix=args.s3_prefix,
                region_name=args.aws_region,
                max_sample_count=max_sample_count,
                sampling_frequency=int(sm_hyperparams_dict.get("sampling_frequency", 1)))
            graph_manager.sample_collector = sample_collector

    # persist IP config from sagemaker to s3
    ip_config = IpConfig(bucket=args.s3_bucket,
                         s3_prefix=args.s3_prefix,
                         region_name=args.aws_region)
    ip_config.persist(s3_kms_extra_args=utils.get_s3_kms_extra_args())

    training_algorithm = model_metadata_download.training_algorithm
    output_head_format = FROZEN_HEAD_OUTPUT_GRAPH_FORMAT_MAPPING[training_algorithm]

    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    # handle backward compatibility
    if use_pretrained_model:
        # checkpoint s3 instance for the pretrained model
        # TODO: replace 'agent' for multi-agent training
        checkpoint = Checkpoint(bucket=args.pretrained_s3_bucket,
                                s3_prefix=args.pretrained_s3_prefix,
                                region_name=args.aws_region,
                                agent_name='agent',
                                checkpoint_dir=args.pretrained_checkpoint_dir,
                                output_head_format=output_head_format)
        # make coach checkpoint compatible
        if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible():
            checkpoint.rl_coach_checkpoint.make_compatible(checkpoint.syncfile_ready)
        # get best model checkpoint string
        model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()
        # select the best checkpoint model by uploading the rl coach .coach_checkpoint file
        checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args())

        # add checkpoint into checkpoint_dict
        checkpoint_dict = {'agent': checkpoint}

        # load the pretrained model
        ds_params_instance_pretrained = S3BotoDataStoreParameters(
            checkpoint_dict=checkpoint_dict)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained,
                                                graph_manager, True)
        data_store_pretrained.load_from_store()

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type=str(RunType.TRAINER),
        channel=args.s3_prefix,
        network_type=network_type)

    graph_manager.memory_backend_params = memory_backend_params

    # checkpoint s3 instance for the training model
    checkpoint = Checkpoint(bucket=args.s3_bucket,
                            s3_prefix=args.s3_prefix,
                            region_name=args.aws_region,
                            agent_name='agent',
                            checkpoint_dir=args.checkpoint_dir,
                            output_head_format=output_head_format)
    checkpoint_dict = {'agent': checkpoint}

    ds_params_instance = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict)

    graph_manager.data_store_params = ds_params_instance
    graph_manager.data_store = S3BotoDataStore(ds_params_instance, graph_manager)

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=json.loads(robomaker_hyperparams_json)["batch_size"],
        user_episode_per_rollout=json.loads(robomaker_hyperparams_json)["num_episodes_between_training"],
        training_algorithm=training_algorithm)
def upload_finished_file(self):
    for _, checkpoint in self.params.checkpoint_dict.items():
        checkpoint.syncfile_finished.persist(
            s3_kms_extra_args=get_s3_kms_extra_args())
def rollout_worker(graph_manager, num_workers, rollout_idx, task_parameters,
                   simtrace_video_s3_writers, pause_physics, unpause_physics):
    """ Wait for the first checkpoint, then perform rollouts using the model """
    if not graph_manager.data_store:
        raise AttributeError("None type for data_store object")

    data_store = graph_manager.data_store

    # TODO: change 'agent' to the specific agent name for the multi-agent case
    checkpoint_dir = os.path.join(task_parameters.checkpoint_restore_path, "agent")
    graph_manager.data_store.wait_for_checkpoints()
    graph_manager.data_store.wait_for_trainer_ready()
    # wait for the required cancel services to become available
    rospy.wait_for_service('/robomaker/job/cancel')

    # make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')

    rospy.wait_for_service('/racecar/save_mp4/subscribe_to_save_mp4')
    rospy.wait_for_service('/racecar/save_mp4/unsubscribe_from_save_mp4')
    subscribe_to_save_mp4 = ServiceProxyWrapper(
        '/racecar/save_mp4/subscribe_to_save_mp4', Empty)
    unsubscribe_from_save_mp4 = ServiceProxyWrapper(
        '/racecar/save_mp4/unsubscribe_from_save_mp4', Empty)

    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    chkpt_state_reader = CheckpointStateReader(checkpoint_dir,
                                               checkpoint_state_optional=False)
    last_checkpoint = chkpt_state_reader.get_latest().num

    # this worker should play a fraction of the total playing steps per rollout
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    act_steps = int(episode_steps_per_rollout / num_workers)
    if rollout_idx < episode_steps_per_rollout % num_workers:
        act_steps += 1
    act_steps = EnvironmentEpisodes(act_steps)

    configure_environment_randomizer()

    for _ in range((graph_manager.improve_steps / act_steps.num_steps).num_steps):
        # collect profiler information only if IS_PROFILER_ON is true
        with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                            s3_prefix=PROFILER_S3_PREFIX,
                            output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                            enable_profiling=IS_PROFILER_ON):
            graph_manager.phase = RunPhase.TRAIN
            exit_if_trainer_done(checkpoint_dir, simtrace_video_s3_writers, rollout_idx)
            unpause_physics(EmptyRequest())
            graph_manager.reset_internal_state(True)
            graph_manager.act(
                act_steps,
                wait_for_full_episodes=graph_manager.agent_params.algorithm.act_for_full_episodes)
            graph_manager.reset_internal_state(True)
            time.sleep(1)
            pause_physics(EmptyRequest())

            graph_manager.phase = RunPhase.UNDEFINED
            new_checkpoint = -1
            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type \
                    == DistributedCoachSynchronizationType.SYNC:
                unpause_physics(EmptyRequest())
                is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None) and rollout_idx == 0
                if is_save_mp4_enabled:
                    subscribe_to_save_mp4(EmptyRequest())
                if rollout_idx == 0:
                    for _ in range(MIN_EVAL_TRIALS):
                        graph_manager.evaluate(EnvironmentSteps(1))
                while new_checkpoint < last_checkpoint + 1:
                    exit_if_trainer_done(checkpoint_dir, simtrace_video_s3_writers, rollout_idx)
                    if rollout_idx == 0:
                        graph_manager.evaluate(EnvironmentSteps(1))
                    new_checkpoint = data_store.get_coach_checkpoint_number('agent')
                if is_save_mp4_enabled:
                    unsubscribe_from_save_mp4(EmptyRequest())
                # upload simtrace and mp4 into s3 bucket
                for s3_writer in simtrace_video_s3_writers:
                    s3_writer.persist(utils.get_s3_kms_extra_args())
                pause_physics(EmptyRequest())
                data_store.load_from_store(expected_checkpoint_number=last_checkpoint + 1)
                graph_manager.restore_checkpoint()

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type \
                    == DistributedCoachSynchronizationType.ASYNC:
                if new_checkpoint > last_checkpoint:
                    graph_manager.restore_checkpoint()

            last_checkpoint = new_checkpoint
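# The SYNC branch above effectively blocks on checkpoint publication. A minimal
# sketch of that wait loop in isolation (the evaluation calls and trainer-done
# checks interleaved in the real loop are omitted); wait_for_new_checkpoint is
# a hypothetical helper, not part of the original code:
def wait_for_new_checkpoint(data_store, last_checkpoint, agent_name='agent'):
    """Poll the data store until a checkpoint newer than last_checkpoint appears."""
    new_checkpoint = -1
    while new_checkpoint < last_checkpoint + 1:
        new_checkpoint = data_store.get_coach_checkpoint_number(agent_name)
    return new_checkpoint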
def evaluation_worker(graph_manager, number_of_trials, task_parameters,
                      simtrace_video_s3_writers, is_continuous, park_positions):
    """ Evaluation worker function

    Arguments:
        graph_manager(MultiAgentGraphManager): Multi-agent graph manager
        number_of_trials(int): Number of trials you want to run the evaluation for
        task_parameters(TaskParameters): Information about the checkpoint, gpu/cpu,
            framework etc. of rlcoach
        simtrace_video_s3_writers(list): Writers to upload all the simtrace and mp4
            data to the S3 bucket
        is_continuous(bool): The termination condition for the car
        park_positions(list of tuple): list of (x, y) for cars to park at
    """
    # collect profiler information only if IS_PROFILER_ON is true
    with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                        s3_prefix=PROFILER_S3_PREFIX,
                        output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                        enable_profiling=IS_PROFILER_ON):
        subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(), list()
        subscribe_to_save_mp4, unsubscribe_from_save_mp4 = list(), list()
        for agent_param in graph_manager.agents_params:
            racecar_name = 'racecar' if len(agent_param.name.split("_")) == 1 \
                else "racecar_{}".format(agent_param.name.split("_")[1])
            subscribe_to_save_mp4_topic.append(
                "/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name))
            unsubscribe_from_save_mp4_topic.append(
                "/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name))
        graph_manager.data_store.wait_for_checkpoints()
        graph_manager.data_store.modify_checkpoint_variables()

        # make the clients that will allow us to pause and unpause the physics
        rospy.wait_for_service('/gazebo/pause_physics_dr')
        rospy.wait_for_service('/gazebo/unpause_physics_dr')
        pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
        unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)
        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            rospy.wait_for_service(mp4_sub)
            rospy.wait_for_service(mp4_unsub)
        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            subscribe_to_save_mp4.append(ServiceProxyWrapper(mp4_sub, Empty))
            unsubscribe_from_save_mp4.append(ServiceProxyWrapper(mp4_unsub, Empty))

        graph_manager.create_graph(task_parameters=task_parameters,
                                   stop_physics=pause_physics,
                                   start_physics=unpause_physics,
                                   empty_service_call=EmptyRequest)
        logger.info("Graph manager successfully created the graph: Unpausing physics")
        unpause_physics(EmptyRequest())

        is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None)
        if is_save_mp4_enabled:
            for subscribe_mp4 in subscribe_to_save_mp4:
                subscribe_mp4(EmptyRequest())

        configure_environment_randomizer()
        track_data = TrackData.get_instance()
        # Before each evaluation episode (single lap for a non-continuous race and the
        # complete race for a continuous race), a new copy of park_positions needs to be
        # loaded into track_data because a park position is popped from park_positions
        # whenever a race car needs to be parked.
        if is_continuous:
            track_data.park_positions = park_positions
            graph_manager.evaluate(EnvironmentSteps(1))
        else:
            for _ in range(number_of_trials):
                track_data.park_positions = park_positions
                graph_manager.evaluate(EnvironmentSteps(1))
        if is_save_mp4_enabled:
            for unsubscribe_mp4 in unsubscribe_from_save_mp4:
                unsubscribe_mp4(EmptyRequest())
        # upload simtrace and mp4 into s3 bucket
        for s3_writer in simtrace_video_s3_writers:
            s3_writer.persist(utils.get_s3_kms_extra_args())
        time.sleep(1)
        pause_physics(EmptyRequest())

        # close down the job
        utils.cancel_simulation_job(
            os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
            rospy.get_param('AWS_REGION'))
def main():
    """ Main function for evaluation worker """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--preset',
                        help="(string) Name of a preset to run (class name from the 'presets' directory.)",
                        type=str,
                        required=False)
    parser.add_argument('--s3_bucket',
                        help='list(string) S3 bucket',
                        type=str,
                        nargs='+',
                        default=rospy.get_param("MODEL_S3_BUCKET", ["gsaur-test"]))
    parser.add_argument('--s3_prefix',
                        help='list(string) S3 prefix',
                        type=str,
                        nargs='+',
                        default=rospy.get_param("MODEL_S3_PREFIX", ["sagemaker"]))
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--number_of_trials',
                        help='(integer) Number of trials',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_TRIALS", 10)))
    parser.add_argument('-c', '--local_model_directory',
                        help='(string) Path to a folder containing a checkpoint to restore the model from.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty seconds',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 2.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "EVALUATION"))
    parser.add_argument('--is_continuous',
                        help='(boolean) is continuous after lap completion',
                        type=bool,
                        default=utils.str2bool(rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty seconds',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY", 2.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty seconds',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY", 5.0)))

    args = parser.parse_args()
    arg_s3_bucket = args.s3_bucket
    arg_s3_prefix = args.s3_prefix
    logger.info("S3 bucket: %s \n S3 prefix: %s", arg_s3_bucket, arg_s3_prefix)

    metrics_s3_buckets = rospy.get_param('METRICS_S3_BUCKET')
    metrics_s3_object_keys = rospy.get_param('METRICS_S3_OBJECT_KEY')

    arg_s3_bucket, arg_s3_prefix = utils.force_list(arg_s3_bucket), utils.force_list(arg_s3_prefix)
    metrics_s3_buckets = utils.force_list(metrics_s3_buckets)
    metrics_s3_object_keys = utils.force_list(metrics_s3_object_keys)

    validate_list = [arg_s3_bucket, arg_s3_prefix,
                     metrics_s3_buckets, metrics_s3_object_keys]

    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET', None)
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        simtrace_s3_bucket = utils.force_list(simtrace_s3_bucket)
        simtrace_s3_object_prefix = utils.force_list(simtrace_s3_object_prefix)
        validate_list.extend([simtrace_s3_bucket, simtrace_s3_object_prefix])
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')
        mp4_s3_bucket = utils.force_list(mp4_s3_bucket)
        mp4_s3_object_prefix = utils.force_list(mp4_s3_object_prefix)
        validate_list.extend([mp4_s3_bucket, mp4_s3_object_prefix])

    if not all(len(x) == len(validate_list[0]) for x in validate_list):
        log_and_exit("Eval worker error: Incorrect arguments passed: {}".format(validate_list),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    if args.number_of_resets != 0 and args.number_of_resets < MIN_RESET_COUNT:
        raise GenericRolloutException(
            "number of resets is less than {}".format(MIN_RESET_COUNT))

    # instantiate cameras
    if len(arg_s3_bucket) == 1:
        configure_camera(namespaces=['racecar'])
    else:
        configure_camera(namespaces=[
            'racecar_{}'.format(str(agent_index))
            for agent_index in range(len(arg_s3_bucket))])

    agent_list = list()
    s3_bucket_dict = dict()
    s3_prefix_dict = dict()
    checkpoint_dict = dict()
    simtrace_video_s3_writers = []
    start_positions = get_start_positions(len(arg_s3_bucket))
    done_condition = utils.str_to_done_condition(rospy.get_param("DONE_CONDITION", any))
    park_positions = utils.pos_2d_str_to_list(rospy.get_param("PARK_POSITIONS", []))
    # if park positions are not passed in for the all-done condition case, use the default
    if not park_positions:
        park_positions = [DEFAULT_PARK_POSITION for _ in arg_s3_bucket]

    for agent_index, _ in enumerate(arg_s3_bucket):
        agent_name = 'agent' if len(arg_s3_bucket) == 1 \
            else 'agent_{}'.format(str(agent_index))
        racecar_name = 'racecar' if len(arg_s3_bucket) == 1 \
            else 'racecar_{}'.format(str(agent_index))
        s3_bucket_dict[agent_name] = arg_s3_bucket[agent_index]
        s3_prefix_dict[agent_name] = arg_s3_prefix[agent_index]

        # download model metadata
        model_metadata = ModelMetadata(
            bucket=arg_s3_bucket[agent_index],
            s3_key=get_s3_key(arg_s3_prefix[agent_index], MODEL_METADATA_S3_POSTFIX),
            region_name=args.aws_region,
            local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format(agent_name))
        model_metadata_info = model_metadata.get_model_metadata_info()
        version = model_metadata_info[ModelMetadataKeys.VERSION.value]

        # checkpoint s3 instance
        checkpoint = Checkpoint(bucket=arg_s3_bucket[agent_index],
                                s3_prefix=arg_s3_prefix[agent_index],
                                region_name=args.aws_region,
                                agent_name=agent_name,
                                checkpoint_dir=args.local_model_directory)
        # make coach checkpoint compatible
        if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible():
            checkpoint.rl_coach_checkpoint.make_compatible(checkpoint.syncfile_ready)
        # get best model checkpoint string
        model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()
        # select the best checkpoint model by uploading the rl coach .coach_checkpoint file
        checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args())

        checkpoint_dict[agent_name] = checkpoint

        agent_config = {
            'model_metadata': model_metadata,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [
                    link_name.replace('racecar', racecar_name)
                    for link_name in LINK_NAMES],
                ConfigParams.VELOCITY_LIST.value: [
                    velocity_topic.replace('racecar', racecar_name)
                    for velocity_topic in VELOCITY_TOPICS],
                ConfigParams.STEERING_LIST.value: [
                    steering_topic.replace('racecar', racecar_name)
                    for steering_topic in STEERING_TOPICS],
                ConfigParams.CHANGE_START.value:
                    utils.str2bool(rospy.get_param('CHANGE_START_POSITION', False)),
                ConfigParams.ALT_DIR.value:
                    utils.str2bool(rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
                ConfigParams.MODEL_METADATA.value: model_metadata,
                ConfigParams.REWARD.value: reward_function,
                ConfigParams.AGENT_NAME.value: racecar_name,
                ConfigParams.VERSION.value: version,
                ConfigParams.NUMBER_OF_RESETS.value: args.number_of_resets,
                ConfigParams.PENALTY_SECONDS.value: args.penalty_seconds,
                ConfigParams.NUMBER_OF_TRIALS.value: args.number_of_trials,
                ConfigParams.IS_CONTINUOUS.value: args.is_continuous,
                ConfigParams.RACE_TYPE.value: args.race_type,
                ConfigParams.COLLISION_PENALTY.value: args.collision_penalty,
                ConfigParams.OFF_TRACK_PENALTY.value: args.off_track_penalty,
                ConfigParams.START_POSITION.value: start_positions[agent_index],
                ConfigParams.DONE_CONDITION.value: done_condition
            }
        }

        metrics_s3_config = {
            MetricsS3Keys.METRICS_BUCKET.value: metrics_s3_buckets[agent_index],
            MetricsS3Keys.METRICS_KEY.value: metrics_s3_object_keys[agent_index],
            # Replaced rospy.get_param('AWS_REGION') to be equal to the argument
            # being passed or the default argument set
            MetricsS3Keys.REGION.value: args.aws_region
        }
        aws_region = rospy.get_param('AWS_REGION', args.aws_region)

        if simtrace_s3_bucket:
            simtrace_video_s3_writers.append(
                SimtraceVideo(upload_type=SimtraceVideoNames.SIMTRACE_EVAL.value,
                              bucket=simtrace_s3_bucket[agent_index],
                              s3_prefix=simtrace_s3_object_prefix[agent_index],
                              region_name=aws_region,
                              local_path=SIMTRACE_EVAL_LOCAL_PATH_FORMAT.format(agent_name)))
        if mp4_s3_bucket:
            simtrace_video_s3_writers.extend([
                SimtraceVideo(upload_type=SimtraceVideoNames.PIP.value,
                              bucket=mp4_s3_bucket[agent_index],
                              s3_prefix=mp4_s3_object_prefix[agent_index],
                              region_name=aws_region,
                              local_path=CAMERA_PIP_MP4_LOCAL_PATH_FORMAT.format(agent_name)),
                SimtraceVideo(upload_type=SimtraceVideoNames.DEGREE45.value,
                              bucket=mp4_s3_bucket[agent_index],
                              s3_prefix=mp4_s3_object_prefix[agent_index],
                              region_name=aws_region,
                              local_path=CAMERA_45DEGREE_LOCAL_PATH_FORMAT.format(agent_name)),
                SimtraceVideo(upload_type=SimtraceVideoNames.TOPVIEW.value,
                              bucket=mp4_s3_bucket[agent_index],
                              s3_prefix=mp4_s3_object_prefix[agent_index],
                              region_name=aws_region,
                              local_path=CAMERA_TOPVIEW_LOCAL_PATH_FORMAT.format(agent_name))])

        run_phase_subject = RunPhaseSubject()
        agent_list.append(create_rollout_agent(
            agent_config,
            EvalMetrics(agent_name, metrics_s3_config, args.is_continuous),
            run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())

    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)

    enable_domain_randomization = utils.str2bool(
        rospy.get_param('ENABLE_DOMAIN_RANDOMIZATION', False))

    sm_hyperparams_dict = {}

    # make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)

    graph_manager, _ = get_graph_manager(
        hp_dict=sm_hyperparams_dict,
        agent_list=agent_list,
        run_phase_subject=run_phase_subject,
        enable_domain_randomization=enable_domain_randomization,
        done_condition=done_condition,
        pause_physics=pause_physics,
        unpause_physics=unpause_physics)

    ds_params_instance = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict)

    graph_manager.data_store = S3BotoDataStore(params=ds_params_instance,
                                               graph_manager=graph_manager,
                                               ignore_lock=True)
    graph_manager.env_params.seed = 0

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.local_model_directory

    evaluation_worker(graph_manager=graph_manager,
                      number_of_trials=args.number_of_trials,
                      task_parameters=task_parameters,
                      simtrace_video_s3_writers=simtrace_video_s3_writers,
                      is_continuous=args.is_continuous,
                      park_positions=park_positions,
                      race_type=args.race_type,
                      pause_physics=pause_physics,
                      unpause_physics=unpause_physics)
def upload_episode_metrics(self):
    json_metrics = json.dumps({'metrics': self._metrics_})
    self._s3_metrics.persist(body=json_metrics,
                             s3_kms_extra_args=get_s3_kms_extra_args())
    if self._is_eval_:
        self._current_eval_pct_list_.append(self._progress_)
def __init__(self,
             queue_url,
             aws_region='us-east-1',
             race_duration=180,
             number_of_trials=3,
             number_of_resets=10000,
             penalty_seconds=2.0,
             off_track_penalty=2.0,
             collision_penalty=5.0,
             is_continuous=False,
             race_type="TIME_TRIAL"):
    # constructor arguments
    self._model_updater = ModelUpdater.get_instance()
    self._deepracer_path = rospkg.RosPack().get_path(
        DeepRacerPackages.DEEPRACER_SIMULATION_ENVIRONMENT)
    body_shell_path = os.path.join(self._deepracer_path, "meshes", "f1")
    self._valid_body_shells = set(
        ".".join(f.split(".")[:-1]) for f in os.listdir(body_shell_path)
        if os.path.isfile(os.path.join(body_shell_path, f)))
    self._valid_body_shells.add(const.BodyShellType.DEFAULT.value)
    self._valid_car_colors = set(e.value for e in const.CarColorType
                                 if "f1" not in e.value)
    self._num_sectors = int(rospy.get_param("NUM_SECTORS", "3"))
    self._queue_url = queue_url
    self._region = aws_region
    self._number_of_trials = number_of_trials
    self._number_of_resets = number_of_resets
    self._penalty_seconds = penalty_seconds
    self._off_track_penalty = off_track_penalty
    self._collision_penalty = collision_penalty
    self._is_continuous = is_continuous
    self._race_type = race_type
    self._is_save_simtrace_enabled = False
    self._is_save_mp4_enabled = False
    self._is_event_end = False
    self._done_condition = any
    self._race_duration = race_duration
    self._enable_domain_randomization = False

    # sqs client
    # the boto client errors out after polling for 1 hour
    self._sqs_client = SQSClient(queue_url=self._queue_url,
                                 region_name=self._region,
                                 max_num_of_msg=MAX_NUM_OF_SQS_MESSAGE,
                                 wait_time_sec=SQS_WAIT_TIME_SEC,
                                 session=refreshed_session(self._region))
    self._s3_client = S3Client(region_name=self._region)

    # tracking current state information
    self._track_data = TrackData.get_instance()
    self._start_lane = self._track_data.center_line
    # keep track of the racer-specific info, e.g. s3 locations, alias, car color etc.
    self._current_racer = None
    # keep track of the current race car we are using; it is always "racecar"
    car_model_state = ModelState()
    car_model_state.model_name = "racecar"
    self._current_car_model_state = car_model_state
    self._last_body_shell_type = None
    self._last_sensors = None
    self._racecar_model = AgentModel()
    # keep track of the current control agent we are using
    self._current_agent = None
    # keep track of the current control graph manager
    self._current_graph_manager = None
    # keep track of the previous model's name
    self._prev_model_name = None
    self._hide_position_idx = 0
    self._hide_positions = get_hide_positions(race_car_num=1)
    self._run_phase_subject = RunPhaseSubject()
    self._simtrace_video_s3_writers = []
    self._local_model_directory = './checkpoint'

    # virtual event has only a single agent, so set agent_name to "agent"
    self._agent_name = "agent"

    # camera manager
    self._camera_manager = CameraManager.get_instance()

    # Set up the virtual event top and follow cameras in CameraManager.
    # Virtual event camera configuration does not need to wait for the car to
    # spawn because the follow-car camera is not tracking any car initially.
    self._main_cameras, self._sub_camera = configure_camera(
        namespaces=[VIRTUAL_EVENT], is_wait_for_model=False)
    self._spawn_cameras()
    # pop out all cameras after configuration to prevent the camera from moving
    self._camera_manager.pop(namespace=VIRTUAL_EVENT)

    dummy_metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value: "dummy-bucket",
        MetricsS3Keys.METRICS_KEY.value: "dummy-key",
        MetricsS3Keys.REGION.value: self._region
    }
    self._eval_metrics = EvalMetrics(
        agent_name=self._agent_name,
        s3_dict_metrics=dummy_metrics_s3_config,
        is_continuous=self._is_continuous,
        pause_time_before_start=PAUSE_TIME_BEFORE_START)

    # Upload a default best sector time with time inf for each sector if no best
    # sector time exists in s3 yet. Use the s3 bucket and prefix stored in the yaml
    # environment variables because this is for SimApp use only: for virtual events
    # no s3 bucket and prefix are passed through the yaml file (everything is passed
    # through sqs), so for simplicity we reuse the yaml s3 bucket and prefix
    # environment variables.
    virtual_event_best_sector_time = VirtualEventBestSectorTime(
        bucket=os.environ.get("YAML_S3_BUCKET", ''),
        s3_key=get_s3_key(os.environ.get("YAML_S3_PREFIX", ''), SECTOR_TIME_S3_POSTFIX),
        region_name=os.environ.get("APP_REGION", "us-east-1"),
        local_path=SECTOR_TIME_LOCAL_PATH)
    response = virtual_event_best_sector_time.list()
    # This handles situations such as a robomaker job crash, so the next robomaker
    # job can pick up the best sector times left over from the crashed job.
    if "Contents" not in response:
        virtual_event_best_sector_time.persist(
            body=json.dumps({SECTOR_X_FORMAT.format(idx + 1): float("inf")
                             for idx in range(self._num_sectors)}),
            s3_kms_extra_args=utils.get_s3_kms_extra_args())

    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', self._run_phase_subject)

    # setup mp4 services
    self._setup_mp4_services()
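# A note on the default body persisted above: Python's json.dumps serializes
# float("inf") as the bare token Infinity (allow_nan defaults to True), which
# is not strict JSON. Assuming SECTOR_X_FORMAT resolves to "sector{}" and the
# default of three sectors, the body is:
#
#   json.dumps({"sector1": float("inf"), "sector2": float("inf"),
#               "sector3": float("inf")})
#   -> '{"sector1": Infinity, "sector2": Infinity, "sector3": Infinity}'
#
# so readers of this object need a parser that accepts Infinity (Python's
# json.loads does by default).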
def upload_episode_metrics(self):
    json_metrics = json.dumps({'metrics': self._metrics_})
    self._s3_metrics.persist(body=json_metrics,
                             s3_kms_extra_args=get_s3_kms_extra_args())
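# --- Illustrative payload (not part of the original source) ---
# upload_episode_metrics serializes the accumulated per-episode metrics list
# under a top-level "metrics" key. A sketch of the persisted JSON body; the
# field names inside each entry are assumptions for illustration:
#
# {
#     "metrics": [
#         {"trial": 1, "elapsed_time_in_milliseconds": 12345, "completion_percentage": 100},
#         {"trial": 2, "elapsed_time_in_milliseconds": 13012, "completion_percentage": 100}
#     ]
# }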
def evaluation_worker(graph_manager, number_of_trials, task_parameters,
                      simtrace_video_s3_writers, is_continuous, park_positions,
                      race_type, pause_physics, unpause_physics):
    """ Evaluation worker function

    Arguments:
        graph_manager(MultiAgentGraphManager): Multi-agent graph manager
        number_of_trials(int): Number of trials you want to run the evaluation for
        task_parameters(TaskParameters): Information about the checkpoint, gpu/cpu,
            framework etc. of rlcoach
        simtrace_video_s3_writers(list): Writers to upload all the simtrace and mp4
            data to the S3 bucket
        is_continuous(bool): The termination condition for the car
        park_positions(list of tuple): list of (x, y) for cars to park at
        race_type(str): race type
    """
    # Collect profiler information only if IS_PROFILER_ON is true
    with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                        s3_prefix=PROFILER_S3_PREFIX,
                        output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                        enable_profiling=IS_PROFILER_ON):
        subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(), list()
        subscribe_to_save_mp4, unsubscribe_from_save_mp4 = list(), list()
        for agent_param in graph_manager.agents_params:
            racecar_name = 'racecar' if len(agent_param.name.split("_")) == 1 \
                else "racecar_{}".format(agent_param.name.split("_")[1])
            subscribe_to_save_mp4_topic.append(
                "/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name))
            unsubscribe_from_save_mp4_topic.append(
                "/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name))
        graph_manager.data_store.wait_for_checkpoints()
        graph_manager.data_store.modify_checkpoint_variables()
        # wait for the required cancel services to become available
        if race_type != RaceType.F1.value:
            # TODO: Since we are not running Grand Prix in RoboMaker,
            # we are opting out from waiting for RoboMaker's cancel job service
            # in case of Grand Prix execution.
            # Otherwise, SimApp will hang as the service will never come alive.
            #
            # If we don't depend on RoboMaker anymore in the future,
            # we need to remove the line below, or do a better job of figuring out
            # whether we are running on RoboMaker or not to decide whether
            # we should wait for the service below or not.
            rospy.wait_for_service('/robomaker/job/cancel')
        # Make the clients that will allow us to pause and unpause the physics
        rospy.wait_for_service('/gazebo/pause_physics_dr')
        rospy.wait_for_service('/gazebo/unpause_physics_dr')
        pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
        unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)
        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            rospy.wait_for_service(mp4_sub)
            rospy.wait_for_service(mp4_unsub)
        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            subscribe_to_save_mp4.append(ServiceProxyWrapper(mp4_sub, Empty))
            unsubscribe_from_save_mp4.append(
                Thread(target=ServiceProxyWrapper(mp4_unsub, Empty),
                       args=(EmptyRequest(), )))
        graph_manager.create_graph(task_parameters=task_parameters,
                                   stop_physics=pause_physics,
                                   start_physics=unpause_physics,
                                   empty_service_call=EmptyRequest)
        logger.info("Graph manager successfully created the graph: Unpausing physics")
        unpause_physics(EmptyRequest())
        is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None)
        if is_save_mp4_enabled:
            for subscribe_mp4 in subscribe_to_save_mp4:
                subscribe_mp4(EmptyRequest())
        configure_environment_randomizer()
        track_data = TrackData.get_instance()
        # Before each evaluation episode (a single lap for a non-continuous race and
        # the complete race for a continuous race), a fresh copy of park_positions
        # needs to be loaded into track_data because a park position is popped from
        # park_positions whenever a race car needs to be parked.
        if is_continuous:
            track_data.park_positions = park_positions
            graph_manager.evaluate(EnvironmentSteps(1))
        else:
            for _ in range(number_of_trials):
                track_data.park_positions = park_positions
                graph_manager.evaluate(EnvironmentSteps(1))
        if is_save_mp4_enabled:
            for unsubscribe_mp4 in unsubscribe_from_save_mp4:
                unsubscribe_mp4.start()
            for unsubscribe_mp4 in unsubscribe_from_save_mp4:
                unsubscribe_mp4.join()
        # upload simtrace and mp4 into the S3 bucket
        for s3_writer in simtrace_video_s3_writers:
            s3_writer.persist(utils.get_s3_kms_extra_args())
        time.sleep(1)
        pause_physics(EmptyRequest())
        if race_type != RaceType.F1.value:
            # Close down the job
            utils.cancel_simulation_job()
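# --- Illustrative invocation (not part of the original source) ---
# A minimal sketch of wiring up evaluation_worker; the graph manager, task
# parameters, writers, and park positions are assumed to come from the
# surrounding rollout setup, and the physics proxies are rebound internally
# to the Gazebo service wrappers, so None placeholders suffice here:
#
# evaluation_worker(
#     graph_manager=graph_manager,
#     number_of_trials=3,
#     task_parameters=task_parameters,
#     simtrace_video_s3_writers=simtrace_video_s3_writers,
#     is_continuous=False,
#     park_positions=[(0.0, 0.0), (1.0, 1.0)],
#     race_type="TIME_TRIAL",
#     pause_physics=None,
#     unpause_physics=None)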
def main():
    """ Main function for tournament """
    try:
        # parse arguments
        s3_region = sys.argv[1]
        s3_bucket = sys.argv[2]
        s3_prefix = sys.argv[3]
        s3_yaml_name = sys.argv[4]

        # create boto3 session/client and download yaml/json file
        session = boto3.session.Session()
        s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", None)
        s3_client = S3Client(region_name=s3_region, s3_endpoint_url=s3_endpoint_url)

        # Intermediate tournament files
        queue_pickle_name = 'tournament_candidate_queue.pkl'
        queue_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, queue_pickle_name))
        local_queue_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), queue_pickle_name))

        report_pickle_name = 'tournament_report.pkl'
        report_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, report_pickle_name))
        local_report_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), report_pickle_name))

        final_report_name = 'tournament_report.json'
        final_report_s3_key = os.path.normpath(
            os.path.join(s3_prefix, final_report_name))

        try:
            s3_client.download_file(bucket=s3_bucket,
                                    s3_key=queue_pickle_s3_key,
                                    local_path=local_queue_pickle_path)
            s3_client.download_file(bucket=s3_bucket,
                                    s3_key=report_pickle_s3_key,
                                    local_path=local_report_pickle_path)
        except Exception:
            # intermediate files may not exist yet on the first run
            pass

        # download yaml file
        yaml_file = YamlFile(
            agent_type=AgentType.TOURNAMENT.value,
            bucket=s3_bucket,
            s3_key=get_s3_key(s3_prefix, s3_yaml_name),
            region_name=s3_region,
            s3_endpoint_url=s3_endpoint_url,
            local_path=YAML_LOCAL_PATH_FORMAT.format(s3_yaml_name))
        yaml_dict = yaml_file.get_yaml_values()

        if os.path.exists(local_queue_pickle_path):
            with open(local_queue_pickle_path, 'rb') as f:
                tournament_candidate_queue = pickle.load(f)
            with open(local_report_pickle_path, 'rb') as f:
                tournament_report = pickle.load(f)
            logger.info('tournament_candidate_queue loaded from existing file')
        else:
            logger.info('tournament_candidate_queue initialized')
            tournament_candidate_queue = deque()
            for agent_idx, _ in enumerate(
                    yaml_dict[YamlKey.MODEL_S3_BUCKET_YAML_KEY.value]):
                tournament_candidate_queue.append((
                    yaml_dict[YamlKey.MODEL_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MODEL_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MODEL_METADATA_FILE_S3_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.METRICS_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.METRICS_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.SIMTRACE_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.SIMTRACE_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MP4_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MP4_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.DISPLAY_NAME_YAML_KEY.value][agent_idx],
                    # TODO: Deprecate DISPLAY_NAME and use only RACER_NAME without the if/else check
                    "" if None in yaml_dict.get(YamlKey.RACER_NAME_YAML_KEY.value, [None]) \
                    else yaml_dict[YamlKey.RACER_NAME_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.BODY_SHELL_TYPE_YAML_KEY.value][agent_idx]
                ))
            tournament_report = {"race_results": []}

        race_idx = len(tournament_report["race_results"])
        while len(tournament_candidate_queue) > 1:
            car1 = tournament_candidate_queue.popleft()
            car2 = tournament_candidate_queue.popleft()
            (car1_model_s3_bucket, car1_s3_prefix, car1_model_metadata,
             car1_metrics_bucket, car1_metrics_s3_key, car1_simtrace_bucket,
             car1_simtrace_prefix, car1_mp4_bucket, car1_mp4_prefix,
             car1_display_name, car1_racer_name, car1_body_shell_type) = car1
            (car2_model_s3_bucket, car2_s3_prefix, car2_model_metadata,
             car2_metrics_bucket, car2_metrics_s3_key, car2_simtrace_bucket,
             car2_simtrace_prefix, car2_mp4_bucket, car2_mp4_prefix,
             car2_display_name, car2_racer_name, car2_body_shell_type) = car2

            race_yaml_dict = generate_race_yaml(yaml_dict=yaml_dict,
                                                car1=car1, car2=car2,
                                                race_idx=race_idx)
            if s3_endpoint_url is not None:
                race_yaml_dict["S3_ENDPOINT_URL"] = s3_endpoint_url

            race_model_s3_buckets = [car1_model_s3_bucket, car2_model_s3_bucket]
            race_model_metadatas = [car1_model_metadata, car2_model_metadata]
            body_shell_types = [car1_body_shell_type, car2_body_shell_type]

            # List of directories created
            dirs_to_delete = list()
            yaml_dir = os.path.abspath(os.path.join(os.getcwd(), str(race_idx)))
            os.makedirs(yaml_dir)
            dirs_to_delete.append(yaml_dir)
            race_yaml_path = os.path.abspath(
                os.path.join(yaml_dir, 'evaluation_params.yaml'))
            with open(race_yaml_path, 'w') as race_yaml_file:
                yaml.dump(race_yaml_dict, race_yaml_file)

            # List of racecar names that should include the second camera while launching
            racecars_with_stereo_cameras = list()
            # List of racecar names that should include lidar while launching
            racecars_with_lidars = list()
            # List of SimApp versions
            simapp_versions = list()
            for agent_index, model_s3_bucket in enumerate(race_model_s3_buckets):
                racecar_name = 'racecar_' + str(agent_index)
                json_key = race_model_metadatas[agent_index]
                # download model metadata
                try:
                    model_metadata = ModelMetadata(
                        bucket=model_s3_bucket,
                        s3_key=json_key,
                        region_name=s3_region,
                        s3_endpoint_url=s3_endpoint_url,
                        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format(racecar_name))
                    dirs_to_delete.append(model_metadata.local_dir)
                except Exception as e:
                    log_and_exit(
                        "Failed to download model_metadata file: s3_bucket: {}, s3_key: {}, {}"
                        .format(model_s3_bucket, json_key, e),
                        SIMAPP_SIMULATION_WORKER_EXCEPTION,
                        SIMAPP_EVENT_ERROR_CODE_500)
                sensors, _, simapp_version = model_metadata.get_model_metadata_info()
                simapp_versions.append(str(simapp_version))
                if Input.STEREO.value in sensors:
                    racecars_with_stereo_cameras.append(racecar_name)
                if Input.LIDAR.value in sensors or Input.SECTOR_LIDAR.value in sensors:
                    racecars_with_lidars.append(racecar_name)

            cmd = [
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "tournament_race_node.py"),
                str(race_idx),
                race_yaml_path,
                ','.join(racecars_with_stereo_cameras),
                ','.join(racecars_with_lidars),
                ','.join(simapp_versions),
                ','.join(body_shell_types)
            ]
            try:
                return_code, _, stderr = run_cmd(cmd_args=cmd,
                                                 shell=False,
                                                 stdout=None,
                                                 stderr=None)
            except KeyboardInterrupt:
                logger.info("KeyboardInterrupt raised, SimApp must be faulted! exiting...")
                return

            # Retrieve the winner and append to the tournament report
            with open('race_report.pkl', 'rb') as f:
                race_report = pickle.load(f)
            race_report['race_idx'] = race_idx
            winner = car1 if race_report['winner'] == car1_display_name else car2
            logger.info("race {}'s winner: {}".format(race_idx, race_report['winner']))

            tournament_candidate_queue.append(winner)
            tournament_report["race_results"].append(race_report)

            # Clean up the directories created
            for dir_to_delete in dirs_to_delete:
                shutil.rmtree(dir_to_delete, ignore_errors=True)
            race_idx += 1

            s3_extra_args = get_s3_kms_extra_args()
            # Persist the latest queue and report for use after a job restart.
            with open(local_queue_pickle_path, 'wb') as f:
                pickle.dump(tournament_candidate_queue, f, protocol=2)
            s3_client.upload_file(bucket=s3_bucket,
                                  s3_key=queue_pickle_s3_key,
                                  local_path=local_queue_pickle_path,
                                  s3_kms_extra_args=s3_extra_args)
            with open(local_report_pickle_path, 'wb') as f:
                pickle.dump(tournament_report, f, protocol=2)
            s3_client.upload_file(bucket=s3_bucket,
                                  s3_key=report_pickle_s3_key,
                                  local_path=local_report_pickle_path,
                                  s3_kms_extra_args=s3_extra_args)

            # If there is more than one candidate left, restart the simulation job;
            # otherwise the tournament is finished, so persist the final report and
            # end the job by canceling it.
            if len(tournament_candidate_queue) > 1:
                restart_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
                break
            else:
                # Persist the final tournament report in json format
                # and terminate the job by canceling it
                s3_client.put_object(bucket=s3_bucket,
                                     s3_key=final_report_s3_key,
                                     body=json.dumps(tournament_report),
                                     s3_kms_extra_args=s3_extra_args)
                cancel_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
    except ValueError as ex:
        log_and_exit("User modified model_metadata.json: {}".format(ex),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)
    except Exception as e:
        log_and_exit("Tournament node failed: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
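# --- Illustrative sketch (not part of the original source) ---
# The tournament loop above is a single-elimination bracket over a deque: pop two
# candidates, race them, push the winner back, and stop when one candidate remains.
# A self-contained toy version with the race stubbed out as a callable:
def _illustrative_run_bracket(candidates, race):
    """Toy single-elimination bracket; `race` picks the winner of two entries."""
    from collections import deque
    queue = deque(candidates)
    while len(queue) > 1:
        car1, car2 = queue.popleft(), queue.popleft()
        queue.append(race(car1, car2))  # the winner re-enters the queue
    return queue[0]
# e.g. _illustrative_run_bracket(['racer_a', 'racer_b', 'racer_c'], race=max)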
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk', '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('-ek', '--environment_s3_key',
                        help="(string) Name of an environment file to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument('-c', '--checkpoint-dir',
                        help='(string) Path to a folder containing a checkpoint to write the model to.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--pretrained-checkpoint-dir',
                        help='(string) Path to a folder for downloading a pre-trained model',
                        type=str,
                        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--s3_endpoint_url',
                        help='(string) S3 endpoint URL',
                        type=str,
                        default=os.environ.get("S3_ENDPOINT_URL", None))
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    args, _ = parser.parse_known_args()
    logger.info("S3 bucket: %s \n S3 prefix: %s \n S3 endpoint URL: %s",
                args.s3_bucket, args.s3_prefix, args.s3_endpoint_url)

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region,
                             s3_endpoint_url=args.s3_endpoint_url)

    # download model metadata
    # TODO: replace 'agent' with the name of each agent
    model_metadata_download = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=args.model_metadata_s3_key,
        region_name=args.aws_region,
        s3_endpoint_url=args.s3_endpoint_url,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    _, network_type, version = model_metadata_download.get_model_metadata_info()

    # upload model metadata
    model_metadata_upload = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=get_s3_key(args.s3_prefix, MODEL_METADATA_S3_POSTFIX),
        region_name=args.aws_region,
        s3_endpoint_url=args.s3_endpoint_url,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_upload.persist(s3_kms_extra_args=utils.get_s3_kms_extra_args())

    shutil.copy2(model_metadata_download.local_path, SM_MODEL_OUTPUT_DIR)

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        success_custom_preset = s3_client.download_file(s3_key=args.preset_s3_key,
                                                        local_path=preset_local_path)
        if not success_custom_preset:
            logger.info("Could not download the preset file. Using the default DeepRacer preset.")
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location, ignore_module_case=True)
            success_custom_preset = s3_client.upload_file(
                s3_key=os.path.normpath("%s/presets/preset.py" % args.s3_prefix),
                local_path=preset_local_path)
            if success_custom_preset:
                logger.info("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}

        # TODO: each agent should have its own config
        agent_config = {
            'model_metadata': model_metadata_download,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [],
                ConfigParams.VELOCITY_LIST.value: {},
                ConfigParams.STEERING_LIST.value: {},
                ConfigParams.CHANGE_START.value: None,
                ConfigParams.ALT_DIR.value: None,
                ConfigParams.ACTION_SPACE_PATH.value: model_metadata_download.local_path,
                ConfigParams.REWARD.value: None,
                ConfigParams.AGENT_NAME.value: 'racecar'
            }
        }

        agent_list = list()
        agent_list.append(create_training_agent(agent_config))

        graph_manager, robomaker_hyperparams_json = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=None)

        # Upload hyperparameters to the SageMaker shared S3 bucket
        hyperparameters = Hyperparameters(
            bucket=args.s3_bucket,
            s3_key=get_s3_key(args.s3_prefix, HYPERPARAMETER_S3_POSTFIX),
            region_name=args.aws_region,
            s3_endpoint_url=args.s3_endpoint_url)
        hyperparameters.persist(hyperparams_json=robomaker_hyperparams_json,
                                s3_kms_extra_args=utils.get_s3_kms_extra_args())

        # Attach the sample collector to graph_manager only if sample count > 0
        max_sample_count = int(sm_hyperparams_dict.get("max_sample_count", 0))
        if max_sample_count > 0:
            sample_collector = SampleCollector(
                s3_client=s3_client,
                s3_prefix=args.s3_prefix,
                max_sample_count=max_sample_count,
                sampling_frequency=int(sm_hyperparams_dict.get("sampling_frequency", 1)))
            graph_manager.sample_collector = sample_collector

    host_ip_address = utils.get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    logger.info("Uploaded IP address information to S3: %s" % host_ip_address)

    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    # Handle backward compatibility
    if use_pretrained_model:
        if version < SIMAPP_VERSION_2 and \
                not utils.has_current_ckpnt_name(args.pretrained_s3_bucket,
                                                 args.pretrained_s3_prefix,
                                                 args.aws_region,
                                                 args.s3_endpoint_url):
            utils.make_compatible(args.pretrained_s3_bucket,
                                  args.pretrained_s3_prefix,
                                  args.aws_region,
                                  SyncFiles.TRAINER_READY.value)

        # Select the optimal model for the starting weights
        utils.do_model_selection(s3_bucket=args.s3_bucket,
                                 s3_prefix=args.s3_prefix,
                                 region=args.aws_region,
                                 s3_endpoint_url=args.s3_endpoint_url)

        ds_params_instance_pretrained = S3BotoDataStoreParameters(
            aws_region=args.aws_region,
            bucket_names={'agent': args.pretrained_s3_bucket},
            base_checkpoint_dir=args.pretrained_checkpoint_dir,
            s3_folders={'agent': args.pretrained_s3_prefix},
            s3_endpoint_url=args.s3_endpoint_url)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained,
                                                graph_manager, True)
        data_store_pretrained.load_from_store()

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type=str(RunType.TRAINER),
        channel=args.s3_prefix,
        network_type=network_type)
    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_names={'agent': args.s3_bucket},
        base_checkpoint_dir=args.checkpoint_dir,
        s3_folders={'agent': args.s3_prefix},
        s3_endpoint_url=args.s3_endpoint_url)
    graph_manager.data_store_params = ds_params_instance
    graph_manager.data_store = S3BotoDataStore(ds_params_instance, graph_manager)

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=json.loads(robomaker_hyperparams_json)["batch_size"],
        user_episode_per_rollout=json.loads(robomaker_hyperparams_json)["num_episodes_between_training"]
    )
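# --- Illustrative launch command (not part of the original source) ---
# Based on the argparse definitions above, the training worker could be started
# along these lines; the script filename and the bucket/prefix/key values are
# placeholders for illustration:
#
#   python training_worker.py \
#       --s3_bucket my-deepracer-bucket \
#       --s3_prefix sagemaker \
#       --aws_region us-east-1 \
#       --model_metadata_s3_key sagemaker/model/model_metadata.json \
#       --checkpoint-dir ./checkpoint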