def __init__(self, bucket, s3_prefix, region_name='us-east-1',
             s3_endpoint_url=None,
             local_dir='./checkpoint/agent',
             max_retry_attempts=5, backoff_time_sec=1.0):
    '''This class is for tensorflow model upload and download

    Args:
        bucket (str): S3 bucket string
        s3_prefix (str): S3 prefix string
        region_name (str): S3 region name
        s3_endpoint_url (str): S3 endpoint URL
        local_dir (str): local file directory
        max_retry_attempts (int): maximum number of retry attempts for S3 download/upload
        backoff_time_sec (float): backoff second between each retry
    '''
    if not bucket or not s3_prefix:
        log_and_exit("checkpoint S3 prefix or bucket not available for S3. \
                     bucket: {}, prefix {}".format(bucket, s3_prefix),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    self._bucket = bucket
    # tensorflow model local directory
    self._local_dir = os.path.normpath(
        CHECKPOINT_LOCAL_DIR_FORMAT.format(local_dir))
    # tensorflow model s3 key directory
    self._s3_key_dir = os.path.normpath(
        os.path.join(s3_prefix, CHECKPOINT_POSTFIX_DIR))
    self._delete_queue = queue.Queue()
    self._s3_client = S3Client(region_name,
                               s3_endpoint_url,
                               max_retry_attempts,
                               backoff_time_sec)
def __init__(self, bucket, s3_prefix, region_name='us-east-1',
             s3_endpoint_url=None,
             local_path="./custom_files/agent/ip.json",
             max_retry_attempts=5, backoff_time_sec=1.0):
    '''ip upload, download, and parse

    Args:
        bucket (str): s3 bucket
        s3_prefix (str): s3 prefix
        region_name (str): s3 region name
        s3_endpoint_url (str): s3 endpoint URL
        local_path (str): IP address json file local path
        max_retry_attempts (int): maximum retry attempts
        backoff_time_sec (float): retry backoff time in seconds
    '''
    if not s3_prefix or not bucket:
        log_and_exit("Ip config S3 prefix or bucket not available for S3. \
                     bucket: {}, prefix: {}".format(bucket, s3_prefix),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    self._bucket = bucket
    # ip done s3 key
    self._s3_ip_done_key = os.path.normpath(os.path.join(
        s3_prefix, IP_DONE_POSTFIX))
    # ip address s3 key
    self._s3_ip_address_key = os.path.normpath(os.path.join(
        s3_prefix, IP_ADDRESS_POSTFIX))
    self._local_path = local_path
    self._s3_client = S3Client(region_name,
                               s3_endpoint_url,
                               max_retry_attempts,
                               backoff_time_sec)
    self._ip_file = None
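# Usage sketch (illustrative, not part of the original file). It mirrors how the
# training worker later in this section wires this class up: construct it with the
# SageMaker shared bucket/prefix, then persist the Redis IP address for rollout
# workers to discover. The bucket/prefix values here are placeholders.
def _example_persist_ip_config():  # hypothetical helper name for illustration
    ip_config = IpConfig(bucket="my-deepracer-bucket",   # placeholder
                         s3_prefix="sagemaker",          # placeholder
                         region_name="us-east-1")
    ip_config.persist(s3_kms_extra_args=utils.get_s3_kms_extra_args())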
def __init__(self, upload_type, bucket, s3_prefix, region_name="us-east-1",
             local_path="./custom_files/iteration_data/agent/file",
             s3_endpoint_url=None,
             max_retry_attempts=5, backoff_time_sec=1.0):
    '''This class is for all s3 simtrace and video uploads

    Args:
        upload_type (str): upload simtrace or video type
        bucket (str): S3 bucket string
        s3_prefix (str): S3 prefix string
        region_name (str): S3 region name
        local_path (str): file local path
        s3_endpoint_url (str): S3 endpoint URL
        max_retry_attempts (int): maximum number of retry attempts for S3 download/upload
        backoff_time_sec (float): backoff second between each retry
    '''
    self._upload_type = upload_type
    self._bucket = bucket
    # simtrace or video s3 key
    self._s3_key = os.path.normpath(
        os.path.join(s3_prefix,
                     SIMTRACE_VIDEO_POSTFIX_DICT[self._upload_type]))
    self._local_path = local_path
    self._upload_num = 0
    self._s3_client = S3Client(region_name,
                               s3_endpoint_url,
                               max_retry_attempts,
                               backoff_time_sec)
def __init__(self, bucket, s3_key, region_name="us-east-1",
             s3_endpoint_url=None,
             local_path="./custom_files/agent/model_metadata.json",
             max_retry_attempts=5, backoff_time_sec=1.0):
    '''Model metadata upload, download, and parse

    Args:
        bucket (str): S3 bucket string
        s3_key (str): S3 key string
        region_name (str): S3 region name
        s3_endpoint_url (str): S3 endpoint URL
        local_path (str): file local path
        max_retry_attempts (int): maximum number of retry attempts for S3 download/upload
        backoff_time_sec (float): backoff second between each retry
    '''
    # check that the s3 key and s3 bucket exist
    if not bucket or not s3_key:
        log_and_exit("model_metadata S3 key or bucket not available for S3. \
                     bucket: {}, key {}".format(bucket, s3_key),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    self._bucket = bucket
    # Strip the s3://<bucket>/ from the key if s3_key is passed in as a URI
    self._s3_key = s3_key.replace('s3://{}/'.format(self._bucket), '')
    self._local_path = local_path
    self._local_dir = os.path.dirname(self._local_path)
    self._model_metadata = None
    self._s3_client = S3Client(region_name,
                               s3_endpoint_url,
                               max_retry_attempts,
                               backoff_time_sec)
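# Usage sketch (illustrative, not part of the original file). As in the workers
# later in this section, the typical flow is: point ModelMetadata at
# model_metadata.json in S3, then read the sensor list, network type, and SimApp
# version from it. The bucket and key values are placeholders.
def _example_read_model_metadata():  # hypothetical helper name for illustration
    model_metadata = ModelMetadata(
        bucket="my-deepracer-bucket",                     # placeholder
        s3_key="rl-deepracer/model/model_metadata.json",  # placeholder
        region_name="us-east-1",
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    sensors, network_type, simapp_version = model_metadata.get_model_metadata_info()
    return sensors, network_type, simapp_version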
def __init__(self, bucket, s3_prefix, region_name,
             s3_endpoint_url=None,
             max_sample_count=None, sampling_frequency=None,
             max_retry_attempts=5, backoff_time_sec=1.0):
    '''Sample collector class to collect samples and persist them to S3.

    Args:
        bucket (str): S3 bucket string
        s3_prefix (str): S3 prefix string
        region_name (str): S3 region name
        s3_endpoint_url (str): S3 endpoint URL
        max_sample_count (int): max sample count
        sampling_frequency (int): sampling frequency
        max_retry_attempts (int): maximum number of retry attempts for S3 download/upload
        backoff_time_sec (float): backoff second between each retry
    '''
    self.max_sample_count = max_sample_count or 0
    self.sampling_frequency = sampling_frequency or 1
    if self.sampling_frequency < 1:
        err_msg = "sampling_frequency must be greater than or equal to 1. (Given: {})".format(
            self.sampling_frequency)
        raise GenericTrainerException(err_msg)
    self.s3_prefix = s3_prefix
    self._cur_sample_count = 0
    self._cur_frequency = 0
    self._bucket = bucket
    self._s3_client = S3Client(region_name,
                               s3_endpoint_url,
                               max_retry_attempts,
                               backoff_time_sec)
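# Usage sketch (illustrative, not part of the original file). The training worker
# later in this section only attaches a collector when max_sample_count > 0; the
# bucket/prefix and count values here are placeholders.
def _example_attach_sample_collector(graph_manager):  # hypothetical helper name
    sample_collector = SampleCollector(bucket="my-deepracer-bucket",  # placeholder
                                       s3_prefix="sagemaker",         # placeholder
                                       region_name="us-east-1",
                                       max_sample_count=100,
                                       sampling_frequency=10)
    graph_manager.sample_collector = sample_collector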
def __init__(self, syncfile_type, bucket, s3_prefix, region_name="us-east-1",
             local_dir='./checkpoint',
             max_retry_attempts=5, backoff_time_sec=1.0):
    '''This class is for rl coach sync files: .finished, .lock, and .ready

    Args:
        syncfile_type (str): sync file type
        bucket (str): S3 bucket string
        s3_prefix (str): S3 prefix string
        region_name (str): S3 region name
        local_dir (str): local file directory
        max_retry_attempts (int): maximum number of retry attempts for S3 download/upload
        backoff_time_sec (float): backoff second between each retry
    '''
    if not bucket or not s3_prefix:
        log_and_exit("checkpoint S3 prefix or bucket not available for S3. \
                     bucket: {}, prefix {}".format(bucket, s3_prefix),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    self._syncfile_type = syncfile_type
    self._bucket = bucket
    # sync file s3 key
    self._s3_key = os.path.normpath(
        os.path.join(s3_prefix, SYNC_FILES_POSTFIX_DICT[syncfile_type]))
    # sync file local path
    self._local_path = os.path.normpath(
        SYNC_FILES_LOCAL_PATH_FORMAT_DICT[syncfile_type].format(local_dir))
    # pass keyword arguments so the S3Client default for s3_endpoint_url is preserved
    self._s3_client = S3Client(region_name=region_name,
                               max_retry_attempts=max_retry_attempts,
                               backoff_time_sec=backoff_time_sec)
def __init__(self, bucket, s3_key, region_name='us-east-1',
             max_retry_attempts=5, backoff_time_sec=1.0,
             s3_endpoint_url=None):
    '''metrics upload

    Args:
        bucket (str): s3 bucket
        s3_key (str): s3 key
        region_name (str): s3 region name
        max_retry_attempts (int): maximum retry attempts
        backoff_time_sec (float): retry backoff time in seconds
    '''
    if not s3_key or not bucket:
        log_and_exit("Metrics S3 key or bucket not available for S3. \
                     bucket: {}, key: {}".format(bucket, s3_key),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    self._bucket = bucket
    self._s3_key = s3_key
    self._s3_client = S3Client(region_name,
                               s3_endpoint_url,
                               max_retry_attempts,
                               backoff_time_sec)
def __init__(self, bucket, s3_prefix, region_name='us-east-1',
             local_dir='./checkpoint/agent',
             max_retry_attempts=0, backoff_time_sec=1.0):
    '''This class is for deepracer checkpoint json file upload and download

    Args:
        bucket (str): S3 bucket string
        s3_prefix (str): S3 prefix string
        region_name (str): S3 region name
        local_dir (str): local file directory
        max_retry_attempts (int): maximum number of retry attempts for S3 download/upload
        backoff_time_sec (float): backoff second between each retry
    '''
    if not bucket or not s3_prefix:
        log_and_exit("checkpoint S3 prefix or bucket not available for S3. \
                     bucket: {}, prefix {}".format(bucket, s3_prefix),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    self._bucket = bucket
    # deepracer checkpoint json s3 key
    self._s3_key = os.path.normpath(
        os.path.join(s3_prefix, DEEPRACER_CHECKPOINT_KEY_POSTFIX))
    # deepracer checkpoint json local path
    self._local_path = os.path.normpath(
        DEEPRACER_CHECKPOINT_LOCAL_PATH_FORMAT.format(local_dir))
    # pass keyword arguments so the S3Client default for s3_endpoint_url is preserved
    self._s3_client = S3Client(region_name=region_name,
                               max_retry_attempts=max_retry_attempts,
                               backoff_time_sec=backoff_time_sec)
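# Usage sketch (illustrative, not part of the original file). The class name is not
# shown in this snippet; in the training worker later in this section it is reached
# through the composite Checkpoint object as checkpoint.deepracer_checkpoint_json,
# which is queried for either the best or the last checkpoint name.
def _example_pick_checkpoint(checkpoint, use_best=True):  # hypothetical helper name
    if use_best:
        return checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()
    return checkpoint.deepracer_checkpoint_json.get_deepracer_last_checkpoint()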
def download_custom_files_if_present(s3_bucket, s3_prefix, aws_region):
    '''download custom environment and preset files

    Args:
        s3_bucket (str): s3 bucket string
        s3_prefix (str): s3 prefix string
        aws_region (str): aws region string

    Returns:
        tuple (bool, bool): whether the preset and the environment file
                            were downloaded successfully
    '''
    success_environment_download, success_preset_download = False, False
    s3_client = S3Client(region_name=aws_region, max_retry_attempts=0)
    try:
        environment_file_s3_key = os.path.normpath(
            s3_prefix + "/environments/deepracer_racetrack_env.py")
        environment_local_path = os.path.join(CUSTOM_FILES_PATH,
                                              "deepracer_racetrack_env.py")
        s3_client.download_file(bucket=s3_bucket,
                                s3_key=environment_file_s3_key,
                                local_path=environment_local_path)
        success_environment_download = True
    except botocore.exceptions.ClientError:
        pass

    try:
        preset_file_s3_key = os.path.normpath(s3_prefix + "/presets/preset.py")
        preset_local_path = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        s3_client.download_file(bucket=s3_bucket,
                                s3_key=preset_file_s3_key,
                                local_path=preset_local_path)
        success_preset_download = True
    except botocore.exceptions.ClientError:
        pass
    return success_preset_download, success_environment_download
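# Usage sketch (illustrative, not part of the original file). Note the return
# order: (preset_downloaded, environment_downloaded). Bucket and prefix values
# are placeholders.
def _example_fetch_custom_files():  # hypothetical helper name for illustration
    preset_ok, environment_ok = download_custom_files_if_present(
        s3_bucket="my-deepracer-bucket",  # placeholder
        s3_prefix="rl-deepracer",         # placeholder
        aws_region="us-east-1")
    return preset_ok, environment_ok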
def __init__(self, bucket, s3_key, region_name="us-east-1",
             s3_endpoint_url=None,
             local_path="./custom_files/agent/hyperparameters.json",
             max_retry_attempts=5, backoff_time_sec=1.0):
    '''Hyperparameters upload, download, and parse

    Args:
        bucket (str): S3 bucket string
        s3_key (str): S3 key string
        region_name (str): S3 region name
        s3_endpoint_url (str): S3 endpoint URL
        local_path (str): file local path
        max_retry_attempts (int): maximum number of retry attempts for S3 download/upload
        backoff_time_sec (float): backoff second between each retry
    '''
    # check that the s3 key and bucket exist for the hyperparameters
    if not s3_key or not bucket:
        log_and_exit("hyperparameters S3 key or bucket not available for S3. \
                     bucket: {}, key: {}".format(bucket, s3_key),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    self._bucket = bucket
    # Strip the s3://<bucket>/ from the key if s3_key is passed in as a URI
    self._s3_key = s3_key.replace('s3://{}/'.format(self._bucket), '')
    self._local_path = local_path
    self._hyperparameters = None
    self._s3_client = S3Client(region_name,
                               s3_endpoint_url,
                               max_retry_attempts,
                               backoff_time_sec)
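# Usage sketch (illustrative, not part of the original file). It mirrors the
# training worker later in this section: upload the hyperparameters JSON produced
# by the graph manager to the shared bucket. Bucket and prefix values are
# placeholders.
def _example_persist_hyperparameters(robomaker_hyperparams_json):  # hypothetical helper
    hyperparameters = Hyperparameters(
        bucket="my-deepracer-bucket",                               # placeholder
        s3_key=get_s3_key("sagemaker", HYPERPARAMETER_S3_POSTFIX),  # placeholder prefix
        region_name="us-east-1")
    hyperparameters.persist(hyperparams_json=robomaker_hyperparams_json,
                            s3_kms_extra_args=utils.get_s3_kms_extra_args())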
def __init__(self, agent_type, bucket, s3_key, region_name="us-east-1",
             s3_endpoint_url=None,
             local_path="params.yaml",
             max_retry_attempts=5, backoff_time_sec=1.0):
    '''yaml upload, download, and parse

    Args:
        agent_type (str): rollout for training, evaluation for eval,
                          tournament for tournament
        bucket (str): S3 bucket string
        s3_key (str): S3 key string
        region_name (str): S3 region name
        s3_endpoint_url (str): S3 endpoint URL
        local_path (str): file local path
        max_retry_attempts (int): maximum number of retry attempts for S3 download/upload
        backoff_time_sec (float): backoff second between each retry
    '''
    if not bucket or not s3_key:
        log_and_exit("yaml file S3 key or bucket not available for S3. \
                     bucket: {}, key: {}".format(bucket, s3_key),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    self._bucket = bucket
    # Strip the s3://<bucket>/ from the key if s3_key is passed in as a URI
    self._s3_key = s3_key.replace('s3://{}/'.format(self._bucket), '')
    self._local_path = local_path
    self._s3_client = S3Client(region_name,
                               s3_endpoint_url,
                               max_retry_attempts,
                               backoff_time_sec)
    self._agent_type = agent_type
    if self._agent_type == AgentType.ROLLOUT.value:
        self._model_s3_bucket_yaml_key = YamlKey.SAGEMAKER_SHARED_S3_BUCKET_YAML_KEY.value
        self._model_s3_prefix_yaml_key = YamlKey.SAGEMAKER_SHARED_S3_PREFIX_YAML_KEY.value
        self._mandatory_yaml_key = TRAINING_MANDATORY_YAML_KEY
    elif self._agent_type == AgentType.EVALUATION.value:
        self._model_s3_bucket_yaml_key = YamlKey.MODEL_S3_BUCKET_YAML_KEY.value
        self._model_s3_prefix_yaml_key = YamlKey.MODEL_S3_PREFIX_YAML_KEY.value
        self._mandatory_yaml_key = EVAL_MANDATORY_YAML_KEY
    elif self._agent_type == AgentType.TOURNAMENT.value:
        self._model_s3_bucket_yaml_key = YamlKey.MODEL_S3_BUCKET_YAML_KEY.value
        self._model_s3_prefix_yaml_key = YamlKey.MODEL_S3_PREFIX_YAML_KEY.value
        self._mandatory_yaml_key = TOUR_MANDATORY_YAML_KEY
    else:
        log_and_exit("Unknown agent type in launch file: {}".format(self._agent_type),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    self._yaml_values = None
    self._is_multicar = False
    self._is_f1 = False
    self._model_s3_buckets = list()
    self._model_metadata_s3_keys = list()
    self._body_shell_types = list()
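# Usage sketch (illustrative, not part of the original file). It mirrors the
# tournament worker later in this section: download the params yaml for the given
# run type and parse it into a dict. Bucket, prefix, and file names are placeholders.
def _example_load_yaml_values():  # hypothetical helper name for illustration
    yaml_file = YamlFile(
        agent_type=AgentType.TOURNAMENT.value,
        bucket="my-deepracer-bucket",                           # placeholder
        s3_key=get_s3_key("rl-deepracer", "eval_params.yaml"),  # placeholder
        region_name="us-east-1",
        local_path=YAML_LOCAL_PATH_FORMAT.format("eval_params.yaml"))
    return yaml_file.get_yaml_values()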
def __init__(self, bucket, s3_prefix, region_name='us-east-1',
             s3_endpoint_url=None,
             local_dir='./checkpoint/agent',
             max_retry_attempts=5, backoff_time_sec=1.0):
    '''This class is for RL coach checkpoint file

    Args:
        bucket (str): S3 bucket string
        s3_prefix (str): S3 prefix string
        region_name (str): S3 region name
        local_dir (str): local file directory
        max_retry_attempts (int): maximum number of retry attempts for S3 download/upload
        backoff_time_sec (float): backoff second between each retry
    '''
    if not bucket or not s3_prefix:
        log_and_exit("checkpoint S3 prefix or bucket not available for S3. \
                     bucket: {}, prefix {}".format(bucket, s3_prefix),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    self._bucket = bucket
    # coach checkpoint s3 key
    self._s3_key = os.path.normpath(
        os.path.join(s3_prefix, COACH_CHECKPOINT_POSTFIX))
    # coach checkpoint local path
    self._local_path = os.path.normpath(
        COACH_CHECKPOINT_LOCAL_PATH_FORMAT.format(local_dir))
    # coach checkpoint local temp path
    self._temp_local_path = os.path.normpath(
        TEMP_COACH_CHECKPOINT_LOCAL_PATH_FORMAT.format(local_dir))
    # old coach checkpoint s3 key to handle backward compatibility
    self._old_s3_key = os.path.normpath(
        os.path.join(s3_prefix, OLD_COACH_CHECKPOINT_POSTFIX))
    # old coach checkpoint local path to handle backward compatibility
    self._old_local_path = os.path.normpath(
        OLD_COACH_CHECKPOINT_LOCAL_PATH_FORMAT.format(local_dir))
    # coach checkpoint state file from rl coach
    self._coach_checkpoint_state_file = CheckpointStateFile(
        os.path.dirname(self._local_path))
    self._s3_client = S3Client(region_name,
                               s3_endpoint_url,
                               max_retry_attempts,
                               backoff_time_sec)
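# Usage sketch (illustrative, not part of the original file). In the training
# worker later in this section this object is reached through the composite
# Checkpoint as checkpoint.rl_coach_checkpoint: an old-format checkpoint is first
# made compatible (in the worker this is additionally gated on the SimApp version),
# then the .coach_checkpoint file is updated to point at the chosen model.
def _example_select_coach_checkpoint(checkpoint, model_checkpoint_name):  # hypothetical helper
    if not checkpoint.rl_coach_checkpoint.is_compatible():
        checkpoint.rl_coach_checkpoint.make_compatible(checkpoint.syncfile_ready)
    checkpoint.rl_coach_checkpoint.update(
        model_checkpoint_name=model_checkpoint_name,
        s3_kms_extra_args=utils.get_s3_kms_extra_args())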
def __init__(self, bucket, s3_key, region_name="us-east-1",
             s3_endpoint_url=None,
             local_path="./custom_files/agent/customer_reward_function.py",
             max_retry_attempts=5, backoff_time_sec=1.0):
    '''reward function upload, download, and parse

    Args:
        bucket (str): S3 bucket string
        s3_key (str): S3 key string
        region_name (str): S3 region name
        s3_endpoint_url (str): S3 endpoint URL
        local_path (str): file local path
        max_retry_attempts (int): maximum number of retry attempts for S3 download/upload
        backoff_time_sec (float): backoff second between each retry
    '''
    # check that the s3 key and bucket exist for the reward function
    if not s3_key or not bucket:
        log_and_exit("Reward function code S3 key or bucket not available for S3. \
                     bucket: {}, key: {}".format(bucket, s3_key),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    self._bucket = bucket
    # Strip the s3://<bucket>/ from the key if s3_key is passed in as a URI
    self._s3_key = s3_key.replace('s3://{}/'.format(self._bucket), '')
    self._local_path_processed = local_path
    # if _local_path_processed is test.py, then _local_path_preprocessed is
    # test_preprocessed.py
    self._local_path_preprocessed = ("_preprocessed.py").join(local_path.split(".py"))
    # if _local_path_processed is ./custom_files/agent/customer_reward_function.py,
    # then the import path should be custom_files.agent.customer_reward_function:
    # remove ".py", remove "./", and replace "/" with "."
    self._import_path = local_path.replace(".py", "") \
                                  .replace("./", "") \
                                  .replace("/", ".")
    self._reward_function = None
    self._s3_client = S3Client(region_name,
                               s3_endpoint_url,
                               max_retry_attempts,
                               backoff_time_sec)
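# Worked example (illustrative, not part of the original file) of the import path
# derivation described in the comments above; it uses only the standard string
# methods and can be run as-is.
def _example_reward_function_paths(local_path="./custom_files/agent/customer_reward_function.py"):
    # remove ".py", remove "./", and replace "/" with "."
    import_path = local_path.replace(".py", "").replace("./", "").replace("/", ".")
    preprocessed_path = ("_preprocessed.py").join(local_path.split(".py"))
    return import_path, preprocessed_path

# _example_reward_function_paths() returns
# ('custom_files.agent.customer_reward_function',
#  './custom_files/agent/customer_reward_function_preprocessed.py')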
def main():
    """ Main function for tournament """
    try:
        # parse arguments
        s3_region = sys.argv[1]
        s3_bucket = sys.argv[2]
        s3_prefix = sys.argv[3]
        s3_yaml_name = sys.argv[4]

        # create boto3 session/client and download yaml/json file
        session = boto3.session.Session()
        s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", None)
        s3_client = S3Client(region_name=s3_region, s3_endpoint_url=s3_endpoint_url)

        # Intermediate tournament files
        queue_pickle_name = 'tournament_candidate_queue.pkl'
        queue_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, queue_pickle_name))
        local_queue_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), queue_pickle_name))

        report_pickle_name = 'tournament_report.pkl'
        report_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, report_pickle_name))
        local_report_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), report_pickle_name))

        final_report_name = 'tournament_report.json'
        final_report_s3_key = os.path.normpath(
            os.path.join(s3_prefix, final_report_name))

        try:
            s3_client.download_file(bucket=s3_bucket,
                                    s3_key=queue_pickle_s3_key,
                                    local_path=local_queue_pickle_path)
            s3_client.download_file(bucket=s3_bucket,
                                    s3_key=report_pickle_s3_key,
                                    local_path=local_report_pickle_path)
        except Exception:
            # intermediate files do not exist on the first run
            pass

        # download yaml file
        yaml_file = YamlFile(
            agent_type=AgentType.TOURNAMENT.value,
            bucket=s3_bucket,
            s3_key=get_s3_key(s3_prefix, s3_yaml_name),
            region_name=s3_region,
            s3_endpoint_url=s3_endpoint_url,
            local_path=YAML_LOCAL_PATH_FORMAT.format(s3_yaml_name))
        yaml_dict = yaml_file.get_yaml_values()

        if os.path.exists(local_queue_pickle_path):
            with open(local_queue_pickle_path, 'rb') as f:
                tournament_candidate_queue = pickle.load(f)
            with open(local_report_pickle_path, 'rb') as f:
                tournament_report = pickle.load(f)
            logger.info('tournament_candidate_queue loaded from existing file')
        else:
            logger.info('tournament_candidate_queue initialized')
            tournament_candidate_queue = deque()
            for agent_idx, _ in enumerate(
                    yaml_dict[YamlKey.MODEL_S3_BUCKET_YAML_KEY.value]):
                tournament_candidate_queue.append((
                    yaml_dict[YamlKey.MODEL_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MODEL_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MODEL_METADATA_FILE_S3_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.METRICS_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.METRICS_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.SIMTRACE_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.SIMTRACE_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MP4_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MP4_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.DISPLAY_NAME_YAML_KEY.value][agent_idx],
                    # TODO: Deprecate DISPLAY_NAME and use only RACER_NAME,
                    # without the if-else check
                    "" if None in yaml_dict.get(YamlKey.RACER_NAME_YAML_KEY.value, [None])
                    else yaml_dict[YamlKey.RACER_NAME_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.BODY_SHELL_TYPE_YAML_KEY.value][agent_idx]
                ))
            tournament_report = {"race_results": []}

        race_idx = len(tournament_report["race_results"])
        while len(tournament_candidate_queue) > 1:
            car1 = tournament_candidate_queue.popleft()
            car2 = tournament_candidate_queue.popleft()
            (car1_model_s3_bucket, car1_s3_prefix, car1_model_metadata,
             car1_metrics_bucket, car1_metrics_s3_key, car1_simtrace_bucket,
             car1_simtrace_prefix, car1_mp4_bucket, car1_mp4_prefix,
             car1_display_name, car1_racer_name, car1_body_shell_type) = car1
            (car2_model_s3_bucket, car2_s3_prefix, car2_model_metadata,
             car2_metrics_bucket, car2_metrics_s3_key, car2_simtrace_bucket,
             car2_simtrace_prefix, car2_mp4_bucket, car2_mp4_prefix,
             car2_display_name, car2_racer_name, car2_body_shell_type) = car2

            race_yaml_dict = generate_race_yaml(yaml_dict=yaml_dict,
                                                car1=car1,
                                                car2=car2,
                                                race_idx=race_idx)
            if s3_endpoint_url is not None:
                race_yaml_dict["S3_ENDPOINT_URL"] = s3_endpoint_url

            race_model_s3_buckets = [car1_model_s3_bucket, car2_model_s3_bucket]
            race_model_metadatas = [car1_model_metadata, car2_model_metadata]
            body_shell_types = [car1_body_shell_type, car2_body_shell_type]

            # List of directories created
            dirs_to_delete = list()
            yaml_dir = os.path.abspath(os.path.join(os.getcwd(), str(race_idx)))
            os.makedirs(yaml_dir)
            dirs_to_delete.append(yaml_dir)

            race_yaml_path = os.path.abspath(
                os.path.join(yaml_dir, 'evaluation_params.yaml'))
            with open(race_yaml_path, 'w') as race_yaml_file:
                yaml.dump(race_yaml_dict, race_yaml_file)

            # List of racecar names that should include the second camera while launching
            racecars_with_stereo_cameras = list()
            # List of racecar names that should include lidar while launching
            racecars_with_lidars = list()
            # List of SimApp versions
            simapp_versions = list()
            for agent_index, model_s3_bucket in enumerate(race_model_s3_buckets):
                racecar_name = 'racecar_' + str(agent_index)
                json_key = race_model_metadatas[agent_index]
                # download model metadata
                try:
                    model_metadata = ModelMetadata(
                        bucket=model_s3_bucket,
                        s3_key=json_key,
                        region_name=s3_region,
                        s3_endpoint_url=s3_endpoint_url,
                        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format(racecar_name))
                    dirs_to_delete.append(model_metadata.local_dir)
                except Exception as e:
                    log_and_exit("Failed to download model_metadata file: s3_bucket: {}, s3_key: {}, {}"
                                 .format(model_s3_bucket, json_key, e),
                                 SIMAPP_SIMULATION_WORKER_EXCEPTION,
                                 SIMAPP_EVENT_ERROR_CODE_500)
                sensors, _, simapp_version = model_metadata.get_model_metadata_info()
                simapp_versions.append(str(simapp_version))
                if Input.STEREO.value in sensors:
                    racecars_with_stereo_cameras.append(racecar_name)
                if Input.LIDAR.value in sensors or Input.SECTOR_LIDAR.value in sensors:
                    racecars_with_lidars.append(racecar_name)

            cmd = [
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "tournament_race_node.py"),
                str(race_idx),
                race_yaml_path,
                ','.join(racecars_with_stereo_cameras),
                ','.join(racecars_with_lidars),
                ','.join(simapp_versions),
                ','.join(body_shell_types)
            ]
            try:
                return_code, _, stderr = run_cmd(cmd_args=cmd,
                                                 shell=False,
                                                 stdout=None,
                                                 stderr=None)
            except KeyboardInterrupt:
                logger.info("KeyboardInterrupt raised, SimApp must be faulted! exiting...")
                return

            # Retrieve the winner and append to the tournament report
            with open('race_report.pkl', 'rb') as f:
                race_report = pickle.load(f)
            race_report['race_idx'] = race_idx
            winner = car1 if race_report['winner'] == car1_display_name else car2
            logger.info("race {}'s winner: {}".format(race_idx, race_report['winner']))

            tournament_candidate_queue.append(winner)
            tournament_report["race_results"].append(race_report)

            # Clean up directories created
            for dir_to_delete in dirs_to_delete:
                shutil.rmtree(dir_to_delete, ignore_errors=True)
            race_idx += 1

            s3_extra_args = get_s3_kms_extra_args()
            # Persist the latest queue and report so they can be reused after a job restart.
            with open(local_queue_pickle_path, 'wb') as f:
                pickle.dump(tournament_candidate_queue, f, protocol=2)
            s3_client.upload_file(bucket=s3_bucket,
                                  s3_key=queue_pickle_s3_key,
                                  local_path=local_queue_pickle_path,
                                  s3_kms_extra_args=s3_extra_args)
            with open(local_report_pickle_path, 'wb') as f:
                pickle.dump(tournament_report, f, protocol=2)
            s3_client.upload_file(bucket=s3_bucket,
                                  s3_key=report_pickle_s3_key,
                                  local_path=local_report_pickle_path,
                                  s3_kms_extra_args=s3_extra_args)

            # If more than one candidate is left, restart the simulation job;
            # otherwise the tournament is finished: persist the final report and end the job.
            if len(tournament_candidate_queue) > 1:
                restart_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'), s3_region)
                break
            else:
                # Persist the final tournament report in json format
                # and terminate the job by canceling it
                s3_client.put_object(bucket=s3_bucket,
                                     s3_key=final_report_s3_key,
                                     body=json.dumps(tournament_report),
                                     s3_kms_extra_args=s3_extra_args)
                cancel_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'), s3_region)
    except ValueError as ex:
        log_and_exit("User modified model_metadata.json: {}".format(ex),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)
    except Exception as e:
        log_and_exit("Tournament node failed: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
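# Minimal sketch (illustrative, not part of the original file) of the
# single-elimination flow used above, isolated from S3 and RoboMaker: pop two
# candidates, race them, re-queue the winner. run_race is a hypothetical stand-in
# for shelling out to tournament_race_node.py and reading race_report.pkl; note
# that the real worker breaks out after persisting state when more than one
# candidate remains and relies on a job restart, whereas this sketch loops to
# completion. It uses only the standard library and can be run as-is.
from collections import deque

def run_bracket(candidates, run_race):
    queue = deque(candidates)
    results = []
    race_idx = 0
    while len(queue) > 1:
        car1 = queue.popleft()
        car2 = queue.popleft()
        winner = run_race(race_idx, car1, car2)  # must return either car1 or car2
        results.append({"race_idx": race_idx, "winner": winner})
        queue.append(winner)
        race_idx += 1
    return queue[0], results

# Example: each "car" is (name, lap_time) and the faster car always wins.
champion, report = run_bracket(
    [("carA", 11.2), ("carB", 10.8), ("carC", 12.1)],
    run_race=lambda idx, a, b: a if a[1] < b[1] else b)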
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk', '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('-ek', '--environment_s3_key',
                        help="(string) Name of an environment file to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument('-c', '--checkpoint_dir',
                        help='(string) Path to a folder containing a checkpoint to write the model to.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--pretrained_checkpoint_dir',
                        help='(string) Path to a folder for downloading a pre-trained model',
                        type=str,
                        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--s3_endpoint_url',
                        help='(string) S3 endpoint URL',
                        type=str,
                        default=os.environ.get("S3_ENDPOINT_URL", None))
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--pretrained_checkpoint',
                        help='(string) Choose which checkpoint to use (best | last)',
                        type=str,
                        default="best")
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))
    args, _ = parser.parse_known_args()
    logger.info("S3 bucket: %s \n S3 prefix: %s \n S3 endpoint URL: %s",
                args.s3_bucket, args.s3_prefix, args.s3_endpoint_url)

    s3_client = S3Client(region_name=args.aws_region,
                         s3_endpoint_url=args.s3_endpoint_url,
                         max_retry_attempts=0)

    # download model metadata
    # TODO: replace 'agent' with name of each agent
    model_metadata_download = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=args.model_metadata_s3_key,
        region_name=args.aws_region,
        s3_endpoint_url=args.s3_endpoint_url,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    _, network_type, version = model_metadata_download.get_model_metadata_info()

    # upload model metadata
    model_metadata_upload = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=get_s3_key(args.s3_prefix, MODEL_METADATA_S3_POSTFIX),
        region_name=args.aws_region,
        s3_endpoint_url=args.s3_endpoint_url,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_upload.persist(s3_kms_extra_args=utils.get_s3_kms_extra_args())

    shutil.copy2(model_metadata_download.local_path, SM_MODEL_OUTPUT_DIR)

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        try:
            s3_client.download_file(bucket=args.s3_bucket,
                                    s3_key=args.preset_s3_key,
                                    local_path=preset_local_path)
            success_custom_preset = True
        except botocore.exceptions.ClientError:
            pass
        if not success_custom_preset:
            logger.info("Could not download the preset file. Using the default DeepRacer preset.")
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location,
                                                 ignore_module_case=True)
            s3_client.upload_file(
                bucket=args.s3_bucket,
                s3_key=os.path.normpath("%s/presets/preset.py" % args.s3_prefix),
                local_path=preset_local_path,
                s3_kms_extra_args=utils.get_s3_kms_extra_args())
            if success_custom_preset:
                logger.info("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}

        #! TODO each agent should have its own config
        agent_config = {
            'model_metadata': model_metadata_download,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [],
                ConfigParams.VELOCITY_LIST.value: {},
                ConfigParams.STEERING_LIST.value: {},
                ConfigParams.CHANGE_START.value: None,
                ConfigParams.ALT_DIR.value: None,
                ConfigParams.ACTION_SPACE_PATH.value: model_metadata_download.local_path,
                ConfigParams.REWARD.value: None,
                ConfigParams.AGENT_NAME.value: 'racecar'
            }
        }

        agent_list = list()
        agent_list.append(create_training_agent(agent_config))

        graph_manager, robomaker_hyperparams_json = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=None)

        # Upload hyperparameters to the SageMaker shared s3 bucket
        hyperparameters = Hyperparameters(bucket=args.s3_bucket,
                                          s3_key=get_s3_key(args.s3_prefix,
                                                            HYPERPARAMETER_S3_POSTFIX),
                                          region_name=args.aws_region,
                                          s3_endpoint_url=args.s3_endpoint_url)
        hyperparameters.persist(hyperparams_json=robomaker_hyperparams_json,
                                s3_kms_extra_args=utils.get_s3_kms_extra_args())

        # Attach sample collector to graph_manager only if sample count > 0
        max_sample_count = int(sm_hyperparams_dict.get("max_sample_count", 0))
        if max_sample_count > 0:
            sample_collector = SampleCollector(
                bucket=args.s3_bucket,
                s3_prefix=args.s3_prefix,
                region_name=args.aws_region,
                s3_endpoint_url=args.s3_endpoint_url,
                max_sample_count=max_sample_count,
                sampling_frequency=int(sm_hyperparams_dict.get("sampling_frequency", 1)))
            graph_manager.sample_collector = sample_collector

    # persist IP config from sagemaker to s3
    ip_config = IpConfig(bucket=args.s3_bucket,
                         s3_prefix=args.s3_prefix,
                         region_name=args.aws_region,
                         s3_endpoint_url=args.s3_endpoint_url)
    ip_config.persist(s3_kms_extra_args=utils.get_s3_kms_extra_args())

    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    # Handle backward compatibility
    if use_pretrained_model:
        # checkpoint s3 instance for pretrained model
        # TODO: replace 'agent' for multiagent training
        checkpoint = Checkpoint(bucket=args.pretrained_s3_bucket,
                                s3_prefix=args.pretrained_s3_prefix,
                                region_name=args.aws_region,
                                s3_endpoint_url=args.s3_endpoint_url,
                                agent_name='agent',
                                checkpoint_dir=args.pretrained_checkpoint_dir)
        # make coach checkpoint compatible
        if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible():
            checkpoint.rl_coach_checkpoint.make_compatible(checkpoint.syncfile_ready)
        # Get the correct pre-trained checkpoint
        if args.pretrained_checkpoint.lower() == "best":
            # get the best model checkpoint string
            model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()
        else:
            # get the last model checkpoint string
            model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_last_checkpoint()
        # Select the chosen checkpoint model by uploading the rl coach .coach_checkpoint file
        checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args())
        # add checkpoint into checkpoint_dict
        checkpoint_dict = {'agent': checkpoint}

        # load pretrained model
        ds_params_instance_pretrained = S3BotoDataStoreParameters(
            checkpoint_dict=checkpoint_dict)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained,
                                                graph_manager, True)
        data_store_pretrained.load_from_store()

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type=str(RunType.TRAINER),
        channel=args.s3_prefix,
        network_type=network_type)

    graph_manager.memory_backend_params = memory_backend_params

    # checkpoint s3 instance for the training model
    checkpoint = Checkpoint(bucket=args.s3_bucket,
                            s3_prefix=args.s3_prefix,
                            region_name=args.aws_region,
                            s3_endpoint_url=args.s3_endpoint_url,
                            agent_name='agent',
                            checkpoint_dir=args.checkpoint_dir)
    checkpoint_dict = {'agent': checkpoint}

    ds_params_instance = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict)

    graph_manager.data_store_params = ds_params_instance
    graph_manager.data_store = S3BotoDataStore(ds_params_instance, graph_manager)

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=json.loads(robomaker_hyperparams_json)["batch_size"],
        user_episode_per_rollout=json.loads(robomaker_hyperparams_json)["num_episodes_between_training"])