def get_args():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk',
                        '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument(
        '-ek',
        '--environment_s3_key',
        help="(string) Name of an environment file to download from S3",
        type=str,
        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to write the model to.',
        type=str,
        default='./checkpoint')
    parser.add_argument(
        '--pretrained_checkpoint_dir',
        help='(string) Path to a folder for downloading a pre-trained model',
        type=str,
        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get(
                            "SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    args, _ = parser.parse_known_args()
    return args


def main():
    screen.set_use_colors(False)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX",
                                                "sagemaker"))
    parser.add_argument(
        '--num_workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=int(rospy.get_param("NUM_WORKERS", 1)))
    parser.add_argument('--rollout_idx',
                        help="(int) The index of current rollout worker",
                        type=int,
                        default=0)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY",
                                                None))
    # For training jobs, reset is not allowed. penalty_seconds, off_track_penalty, and
    # collision_penalty will all be 0 by default.
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty seconds',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 0.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "TRAINING"))
    parser.add_argument('--is_continuous',
                        help='(boolean) is continuous after lap completion',
                        type=utils.str2bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty seconds',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY",
                                                      0.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty seconds',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY",
                                                      0.0)))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)
    logger.info("S3 bucket: %s", args.s3_bucket)
    logger.info("S3 prefix: %s", args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key,
                              model_metadata_local_path)

    # Download and import reward function
    if not args.reward_file_s3_key:
        log_and_exit(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket, args.s3_prefix),
            SIMAPP_SIMULATION_WORKER_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500)
    download_customer_reward_function(s3_client, args.reward_file_s3_key)

    try:
        from custom_files.customer_reward_function import reward_function
    except Exception as e:
        log_and_exit("Failed to import user's reward_function: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)
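
    # The imported reward_function is expected to take a params dict describing
    # the car and track state and return a float reward; it is wired into the
    # agent config below via ConfigParams.REWARD.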

    # Instantiate Cameras
    configure_camera(namespaces=['racecar'])

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    #! TODO each agent should have own config
    _, _, version = utils_parse_model_metadata.parse_model_metadata(
        model_metadata_local_path)
    agent_config = {
        'model_metadata': model_metadata_local_path,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value: LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value: VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value: STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
                utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
                utils.str2bool(
                    rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.ACTION_SPACE_PATH.value: 'custom_files/model_metadata.json',
            ConfigParams.REWARD.value: reward_function,
            ConfigParams.AGENT_NAME.value: 'racecar',
            ConfigParams.VERSION.value: version,
            ConfigParams.NUMBER_OF_RESETS.value: args.number_of_resets,
            ConfigParams.PENALTY_SECONDS.value: args.penalty_seconds,
            ConfigParams.NUMBER_OF_TRIALS.value: None,
            ConfigParams.IS_CONTINUOUS.value: args.is_continuous,
            ConfigParams.RACE_TYPE.value: args.race_type,
            ConfigParams.COLLISION_PENALTY.value: args.collision_penalty,
            ConfigParams.OFF_TRACK_PENALTY.value: args.off_track_penalty
        }
    }

    #! TODO each agent should have own s3 bucket
    step_metrics_prefix = rospy.get_param('SAGEMAKER_SHARED_S3_PREFIX')
    if args.num_workers > 1:
        step_metrics_prefix = os.path.join(step_metrics_prefix,
                                           str(args.rollout_idx))
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value: rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value: rospy.get_param('METRICS_S3_OBJECT_KEY'),
        MetricsS3Keys.REGION.value: rospy.get_param('AWS_REGION')
    }
    metrics_s3_model_cfg = {
        MetricsS3Keys.METRICS_BUCKET.value: args.s3_bucket,
        MetricsS3Keys.METRICS_KEY.value:
            os.path.join(args.s3_prefix, DEEPRACER_CHKPNT_KEY_SUFFIX),
        MetricsS3Keys.REGION.value: args.aws_region
    }
    run_phase_subject = RunPhaseSubject()

    agent_list = list()
    agent_list.append(
        create_rollout_agent(
            agent_config,
            TrainingMetrics(agent_name='agent',
                            s3_dict_metrics=metrics_s3_config,
                            s3_dict_model=metrics_s3_model_cfg,
                            ckpnt_dir=args.checkpoint_dir,
                            run_phase_sink=run_phase_subject,
                            use_model_picker=(args.rollout_idx == 0)),
            run_phase_subject))
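    # Only the first rollout worker (rollout_idx == 0) runs the model picker;
    # it is also the only worker that uploads MP4 videos (see mp4_s3_bucket below).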
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())
    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)

    aws_region = rospy.get_param('AWS_REGION', args.aws_region)
    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET',
                                    None) if args.rollout_idx == 0 else None
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        if args.num_workers > 1:
            simtrace_s3_object_prefix = os.path.join(simtrace_s3_object_prefix,
                                                     str(args.rollout_idx))
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')

    s3_writer_job_info = []
    if simtrace_s3_bucket:
        s3_writer_job_info.append(
            IterationData(
                'simtrace', simtrace_s3_bucket, simtrace_s3_object_prefix,
                aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.SIM_TRACE_TRAINING_LOCAL_FILE.value)))
    if mp4_s3_bucket:
        s3_writer_job_info.extend([
            IterationData(
                'pip', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.CAMERA_PIP_MP4_VALIDATION_LOCAL_PATH.value)),
            IterationData(
                '45degree', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.CAMERA_45DEGREE_MP4_VALIDATION_LOCAL_PATH.value)),
            IterationData(
                'topview', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.CAMERA_TOPVIEW_MP4_VALIDATION_LOCAL_PATH.value))
        ])

    s3_writer = S3Writer(job_info=s3_writer_job_info)

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s", redis_ip)

    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as filepointer:
            sm_hyperparams_dict = json.load(filepointer)
    else:
        logger.info("SageMaker hyperparameters not found.")

    enable_domain_randomization = utils.str2bool(
        rospy.get_param('ENABLE_DOMAIN_RANDOMIZATION', False))
    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
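        # rl_coach's short_dynamic_import resolves "path/to/preset.py:attr" and
        # returns that attribute, here the preset module's graph_manager.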
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=run_phase_subject,
            enable_domain_randomization=enable_domain_randomization)

    # If num_episodes_between_training is smaller than num_workers then cancel worker early.
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    # Reduce number of workers if allocated more than num_episodes_between_training
    if args.num_workers > episode_steps_per_rollout:
        logger.info(
            "Excess worker allocated. Reducing from {} to {}...".format(
                args.num_workers, episode_steps_per_rollout))
        args.num_workers = episode_steps_per_rollout
    if args.rollout_idx >= episode_steps_per_rollout or args.rollout_idx >= args.num_workers:
        err_msg_format = "Exiting excess worker..."
        err_msg_format += "(rollout_idx[{}] >= num_workers[{}] or num_episodes_between_training[{}])"
        logger.info(
            err_msg_format.format(args.rollout_idx, args.num_workers,
                                  episode_steps_per_rollout))
        # Close down the job
        utils.cancel_simulation_job(
            os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
            rospy.get_param('AWS_REGION'))
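    # Example: with num_episodes_between_training=2 and num_workers=4, the pool
    # is reduced to 2 workers and any worker with rollout_idx >= 2 cancels its
    # simulation job here instead of collecting rollouts.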

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=args.redis_port,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix,
        num_workers=args.num_workers,
        rollout_idx=args.rollout_idx)
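
    # Rollout workers publish experiences to the trainer over Redis pub/sub;
    # the channel is keyed by the S3 prefix, presumably so concurrent jobs
    # sharing a Redis instance stay isolated.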

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_names={'agent': args.s3_bucket},
        base_checkpoint_dir=args.checkpoint_dir,
        s3_folders={'agent': args.s3_prefix})

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   num_workers=args.num_workers,
                   rollout_idx=args.rollout_idx,
                   task_parameters=task_parameters,
                   s3_writer=s3_writer)


def main():
    screen.set_use_colors(False)

    logger.info("src/training_worker.py - INIZIO MAIN")

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk',
                        '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument(
        '-ek',
        '--environment_s3_key',
        help="(string) Name of an environment file to download from S3",
        type=str,
        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument(
        '-c',
        '--checkpoint-dir',
        help=
        '(string) Path to a folder containing a checkpoint to write the model to.',
        type=str,
        default='./checkpoint')
    parser.add_argument(
        '--pretrained-checkpoint-dir',
        help='(string) Path to a folder for downloading a pre-trained model',
        type=str,
        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get(
                            "SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--s3_endpoint_url',
                        help='(string) S3 endpoint URL',
                        type=str,
                        default=os.environ.get("S3_ENDPOINT_URL", None))
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    args, _ = parser.parse_known_args()
    logger.info("S3 bucket: %s \n S3 prefix: %s \n S3 endpoint URL: %s",
                args.s3_bucket, args.s3_prefix, args.s3_endpoint_url)

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region,
                             s3_endpoint_url=args.s3_endpoint_url)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key,
                              model_metadata_local_path)
    s3_client.upload_file(
        os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix),
        model_metadata_local_path)
    shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)
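    # SM_MODEL_OUTPUT_DIR is SageMaker's model output directory (typically
    # /opt/ml/model), so the metadata ships alongside the trained model artifact.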

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        success_custom_preset = s3_client.download_file(
            s3_key=args.preset_s3_key, local_path=preset_local_path)
        if not success_custom_preset:
            logger.info(
                "Could not download the preset file. Using the default DeepRacer preset."
            )
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location,
                                                 ignore_module_case=True)
            success_custom_preset = s3_client.upload_file(
                s3_key=os.path.normpath("%s/presets/preset.py" %
                                        args.s3_prefix),
                local_path=preset_local_path)
            if success_custom_preset:
                logger.info("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}
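
        # SM_TRAINING_ENV is the JSON environment blob SageMaker injects into
        # training containers; its "hyperparameters" field mirrors the
        # hyperparameters passed to the training job.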

        # Agent configuration: user-set model metadata (steering angle + speed) plus the agent name

        #! TODO each agent should have own config
        agent_config = {
            'model_metadata': model_metadata_local_path,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [],
                ConfigParams.VELOCITY_LIST.value: {},
                ConfigParams.STEERING_LIST.value: {},
                ConfigParams.CHANGE_START.value: None,
                ConfigParams.ALT_DIR.value: None,
                ConfigParams.ACTION_SPACE_PATH.value:
                'custom_files/model_metadata.json',
                ConfigParams.REWARD.value: None,
                ConfigParams.AGENT_NAME.value: 'racecar'
            }
        }

        agent_list = list()
        agent_list.append(create_training_agent(agent_config))

        logger.info(
            "src/training_worker.py - now calling get_graph_manager, which retrieves the agent"
        )

        graph_manager, robomaker_hyperparams_json = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=None)

        logger.info("src/training_worker.py - ho l'agente")

        s3_client.upload_hyperparameters(robomaker_hyperparams_json)
        logger.info("Uploaded hyperparameters.json to S3")

        # Attach sample collector to graph_manager only if sample count > 0
        max_sample_count = int(sm_hyperparams_dict.get("max_sample_count", 0))
        if max_sample_count > 0:
            sample_collector = SampleCollector(
                s3_client=s3_client,
                s3_prefix=args.s3_prefix,
                max_sample_count=max_sample_count,
                sampling_frequency=int(
                    sm_hyperparams_dict.get("sampling_frequency", 1)))
            graph_manager.sample_collector = sample_collector

    host_ip_address = utils.get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    logger.info("Uploaded IP address information to S3: %s" % host_ip_address)
    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    # Handle backward compatibility
    _, network_type, version = parse_model_metadata(model_metadata_local_path)
    if use_pretrained_model:
        if float(version) < float(SIMAPP_VERSION) and \
        not utils.has_current_ckpnt_name(args.pretrained_s3_bucket, args.pretrained_s3_prefix, args.aws_region, args.s3_endpoint_url):
            utils.make_compatible(args.pretrained_s3_bucket,
                                  args.pretrained_s3_prefix, args.aws_region,
                                  SyncFiles.TRAINER_READY.value)
        # Select the optimal model for the starting weights
        utils.do_model_selection(s3_bucket=args.s3_bucket,
                                 s3_prefix=args.s3_prefix,
                                 region=args.aws_region,
                                 s3_endpoint_url=args.s3_endpoint_url)

        ds_params_instance_pretrained = S3BotoDataStoreParameters(
            aws_region=args.aws_region,
            bucket_names={'agent': args.pretrained_s3_bucket},
            base_checkpoint_dir=args.pretrained_checkpoint_dir,
            s3_folders={'agent': args.pretrained_s3_prefix},
            s3_endpoint_url=args.s3_endpoint_url)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained,
                                                graph_manager, True)
        data_store_pretrained.load_from_store()

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type=str(RunType.TRAINER),
        channel=args.s3_prefix,
        network_type=network_type)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_names={'agent': args.s3_bucket},
        base_checkpoint_dir=args.checkpoint_dir,
        s3_folders={'agent': args.s3_prefix},
        s3_endpoint_url=args.s3_endpoint_url)

    graph_manager.data_store_params = ds_params_instance

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    # training_worker (the function at line 48 of this file) takes as input:
    #   - the graph manager (created by get_graph_manager)
    #   - robomaker_hyperparams_json (returned by get_graph_manager)

    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=json.loads(robomaker_hyperparams_json)["batch_size"],
        user_episode_per_rollout=json.loads(
            robomaker_hyperparams_json)["num_episodes_between_training"])
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk', '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('-ek', '--environment_s3_key',
                        help="(string) Name of an environment file to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument('-c', '--checkpoint-dir',
                        help='(string) Path to a folder containing a checkpoint to write the model to.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--pretrained-checkpoint-dir',
                        help='(string) Path to a folder for downloading a pre-trained model',
                        type=str,
                        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    start_redis_server()

    args, _ = parser.parse_known_args()

    s3_client = SageS3Client(bucket=args.s3_bucket, s3_prefix=args.s3_prefix, aws_region=args.aws_region)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH, 'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key, model_metadata_local_path)
    s3_client.upload_file(os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix), model_metadata_local_path)
    shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        success_custom_preset = s3_client.download_file(s3_key=args.preset_s3_key, local_path=preset_local_path)
        if not success_custom_preset:
            logger.info("Could not download the preset file. Using the default DeepRacer preset.")
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location, ignore_module_case=True)
            success_custom_preset = s3_client.upload_file(
                s3_key=os.path.normpath("%s/presets/preset.py" % args.s3_prefix), local_path=preset_local_path)
            if success_custom_preset:
                logger.info("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}

        #! TODO each agent should have own config
        agent_config = {
            'model_metadata': model_metadata_local_path,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [],
                ConfigParams.VELOCITY_LIST.value: {},
                ConfigParams.STEERING_LIST.value: {},
                ConfigParams.CHANGE_START.value: None,
                ConfigParams.ALT_DIR.value: None,
                ConfigParams.ACTION_SPACE_PATH.value: 'custom_files/model_metadata.json',
                ConfigParams.REWARD.value: None,
                ConfigParams.AGENT_NAME.value: 'racecar'
            }
        }

        agent_list = list()
        agent_list.append(create_training_agent(agent_config))

        graph_manager, robomaker_hyperparams_json = get_graph_manager(sm_hyperparams_dict, agent_list)

        s3_client.upload_hyperparameters(robomaker_hyperparams_json)
        logger.info("Uploaded hyperparameters.json to S3")

    host_ip_address = utils.get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    logger.info("Uploaded IP address information to S3: %s" % host_ip_address)
    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    if use_pretrained_model:
        # Handle backward compatibility
        _, _, version = parse_model_metadata(model_metadata_local_path)
        if float(version) < float(utils.SIMAPP_VERSION) and \
        not utils.has_current_ckpnt_name(args.pretrained_s3_bucket, args.pretrained_s3_prefix, args.aws_region):
            utils.make_compatible(args.pretrained_s3_bucket, args.pretrained_s3_prefix,
                                  args.aws_region, SyncFiles.TRAINER_READY.value)

        ds_params_instance_pretrained = S3BotoDataStoreParameters(aws_region=args.aws_region,
                                                                  bucket_name=args.pretrained_s3_bucket,
                                                                  checkpoint_dir=args.pretrained_checkpoint_dir,
                                                                  s3_folder=args.pretrained_s3_prefix)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained)
        data_store_pretrained.load_from_store()

    memory_backend_params = RedisPubSubMemoryBackendParameters(redis_address="localhost",
                                                               redis_port=6379,
                                                               run_type=str(RunType.TRAINER),
                                                               channel=args.s3_prefix)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(aws_region=args.aws_region,
                                                   bucket_name=args.s3_bucket,
                                                   checkpoint_dir=args.checkpoint_dir,
                                                   s3_folder=args.s3_prefix)
    graph_manager.data_store_params = ds_params_instance

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=json.loads(robomaker_hyperparams_json)["batch_size"],
        user_episode_per_rollout=json.loads(robomaker_hyperparams_json)["num_episodes_between_training"]
    )


import rl_coach.core_types
from rl_coach import logger
from rl_coach.logger import screen
import argparse
import copy
import logging
import os
import sys
import shutil
import glob
import re

from .configuration_list import ConfigurationList
from rl_coach.coach import CoachLauncher

screen.set_use_colors(False)  # Simple text logging so it looks good in CloudWatch


class CoachConfigurationList(ConfigurationList):
    """Helper Object for converting CLI arguments (or SageMaker hyperparameters)
    into Coach configuration.
    """

    # Security-conscious: never instantiate an arbitrary type name the customer
    # passes in; only the whitelisted rl_coach step classes below are allowed.
    ALLOWED_TYPES = {
        "Frames": rl_coach.core_types.Frames,
        "EnvironmentSteps": rl_coach.core_types.EnvironmentSteps,
        "EnvironmentEpisodes": rl_coach.core_types.EnvironmentEpisodes,
        "TrainingSteps": rl_coach.core_types.TrainingSteps,
        "Time": rl_coach.core_types.Time,
    }
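
# A sketch (hypothetical) of how ALLOWED_TYPES might be applied when parsing a
# schedule hyperparameter such as "improve_steps: TrainingSteps:50000":
#   kind, _, count = "TrainingSteps:50000".partition(":")
#   steps = CoachConfigurationList.ALLOWED_TYPES[kind](int(count))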


def validate(s3_bucket, s3_prefix, custom_files_path, aws_region):
    screen.set_use_colors(False)
    logger.info("S3 bucket: %s \n S3 prefix: %s", s3_bucket, s3_prefix)

    if not os.path.exists(custom_files_path):
        os.makedirs(custom_files_path)
    else:
        GenericValidatorException(
            "Custom Files Path already exists!").log_except_and_exit()

    s3_client = SageS3Client(bucket=s3_bucket,
                             s3_prefix=s3_prefix,
                             aws_region=aws_region)
    # Load the model metadata
    model_metadata_local_path = os.path.join(custom_files_path,
                                             'model_metadata.json')
    utils.load_model_metadata(
        s3_client,
        os.path.normpath("%s/model/model_metadata.json" % s3_prefix),
        model_metadata_local_path)

    # Create model local path
    local_model_dir = os.path.join(custom_files_path, 'checkpoint')
    os.makedirs(local_model_dir)

    try:
        # Handle backward compatibility
        observation_list, _, version = parse_model_metadata(
            model_metadata_local_path)
    except Exception as ex:
        log_and_exit("Failed to parse model_metadata file: {}".format(ex),
                     SIMAPP_VALIDATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)

    transitions = get_transition_data(observation_list)

    if float(version) < float(SIMAPP_VERSION) and \
            not utils.has_current_ckpnt_name(s3_bucket, s3_prefix, aws_region):
        utils.make_compatible(s3_bucket, s3_prefix, aws_region,
                              SyncFiles.TRAINER_READY.value)
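
    # Older SimApp versions used a different checkpoint naming scheme;
    # make_compatible presumably rewrites the stored checkpoint metadata so
    # newer code can restore it.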

    agent_config = {
        'model_metadata': model_metadata_local_path,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value: [],
            ConfigParams.VELOCITY_LIST.value: {},
            ConfigParams.STEERING_LIST.value: {},
            ConfigParams.CHANGE_START.value: None,
            ConfigParams.ALT_DIR.value: None,
            ConfigParams.ACTION_SPACE_PATH.value: model_metadata_local_path,
            ConfigParams.REWARD.value: None,
            ConfigParams.AGENT_NAME.value: 'racecar'
        }
    }

    agent_list = list()
    agent_list.append(create_training_agent(agent_config))

    sm_hyperparams_dict = {}
    graph_manager, _ = get_graph_manager(hp_dict=sm_hyperparams_dict,
                                         agent_list=agent_list,
                                         run_phase_subject=None)

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=aws_region,
        bucket_names={'agent': s3_bucket},
        s3_folders={'agent': s3_prefix},
        base_checkpoint_dir=local_model_dir)

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager,
                                               ignore_lock=True)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = local_model_dir
    _validate(graph_manager=graph_manager,
              task_parameters=task_parameters,
              transitions=transitions,
              s3_bucket=s3_bucket,
              s3_prefix=s3_prefix,
              aws_region=aws_region)


def validate(s3_bucket, s3_prefix, aws_region):
    screen.set_use_colors(False)
    screen.log_title(" S3 bucket: {} \n S3 prefix: {}".format(
        s3_bucket, s3_prefix))

    s3_client = SageS3Client(bucket=s3_bucket,
                             s3_prefix=s3_prefix,
                             aws_region=aws_region)
    # Load the model metadata
    utils.load_model_metadata(
        s3_client,
        os.path.normpath("%s/model/model_metadata.json" % s3_prefix),
        MODEL_METADATA_LOCAL_PATH)

    # Create model local path
    os.makedirs(LOCAL_MODEL_DIR)

    try:
        # Handle backward compatibility
        observation_list, _, version = parse_model_metadata(
            MODEL_METADATA_LOCAL_PATH)
    except Exception as ex:
        log_and_exit("Failed to parse model_metadata file: {}".format(ex),
                     SIMAPP_VALIDATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)

    # get_transition_data below must be called before create_training_agent to
    # avoid a 500 when an unsupported sensor is received:
    # create_training_agent exits with 500 on an unsupported sensor, while
    # get_transition_data exits with 400. Model validation should surface 400
    # in that case, so call get_transition_data first.
    transitions = get_transition_data(observation_list)

    if float(version) < float(SIMAPP_VERSION) and \
            not utils.has_current_ckpnt_name(s3_bucket, s3_prefix, aws_region):
        utils.make_compatible(s3_bucket, s3_prefix, aws_region,
                              SyncFiles.TRAINER_READY.value)

    agent_config = {
        'model_metadata': MODEL_METADATA_LOCAL_PATH,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value: [],
            ConfigParams.VELOCITY_LIST.value: {},
            ConfigParams.STEERING_LIST.value: {},
            ConfigParams.CHANGE_START.value: None,
            ConfigParams.ALT_DIR.value: None,
            ConfigParams.ACTION_SPACE_PATH.value: MODEL_METADATA_LOCAL_PATH,
            ConfigParams.REWARD.value: None,
            ConfigParams.AGENT_NAME.value: 'racecar'
        }
    }

    agent_list = list()
    agent_list.append(create_training_agent(agent_config))

    sm_hyperparams_dict = {}
    graph_manager, _ = get_graph_manager(hp_dict=sm_hyperparams_dict,
                                         agent_list=agent_list,
                                         run_phase_subject=None)

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=aws_region,
        bucket_names={'agent': s3_bucket},
        s3_folders={'agent': s3_prefix},
        base_checkpoint_dir=LOCAL_MODEL_DIR)

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager,
                                               ignore_lock=True)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = LOCAL_MODEL_DIR
    _validate(graph_manager=graph_manager,
              task_parameters=task_parameters,
              transitions=transitions,
              s3_bucket=s3_bucket,
              s3_prefix=s3_prefix,
              aws_region=aws_region)


def validate(s3_bucket, s3_prefix, aws_region):
    screen.set_use_colors(False)
    screen.log_title(" S3 bucket: {} \n S3 prefix: {}".format(
        s3_bucket, s3_prefix))

    # download model metadata
    model_metadata = ModelMetadata(bucket=s3_bucket,
                                   s3_key=get_s3_key(
                                       s3_prefix, MODEL_METADATA_S3_POSTFIX),
                                   region_name=aws_region,
                                   local_path=MODEL_METADATA_LOCAL_PATH)

    # Create model local path
    os.makedirs(LOCAL_MODEL_DIR)

    try:
        # Handle backward compatibility
        model_metadata_info = model_metadata.get_model_metadata_info()
        observation_list = model_metadata_info[ModelMetadataKeys.SENSOR.value]
        version = model_metadata_info[ModelMetadataKeys.VERSION.value]
    except Exception as ex:
        log_and_exit("Failed to parse model_metadata file: {}".format(ex),
                     SIMAPP_VALIDATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)

    # get_transition_data below must be called before create_training_agent to
    # avoid a 500 when an unsupported sensor is received:
    # create_training_agent exits with 500 on an unsupported sensor, while
    # get_transition_data exits with 400. Model validation should surface 400
    # in that case, so call get_transition_data first.
    transitions = get_transition_data(observation_list)

    checkpoint = Checkpoint(bucket=s3_bucket,
                            s3_prefix=s3_prefix,
                            region_name=aws_region,
                            agent_name='agent',
                            checkpoint_dir=LOCAL_MODEL_DIR)
    # make coach checkpoint compatible
    if version < SIMAPP_VERSION_2 and \
            not checkpoint.rl_coach_checkpoint.is_compatible():
        checkpoint.rl_coach_checkpoint.make_compatible(
            checkpoint.syncfile_ready)
    # add checkpoint into checkpoint_dict
    checkpoint_dict = {'agent': checkpoint}

    agent_config = {
        'model_metadata': model_metadata,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value: [],
            ConfigParams.VELOCITY_LIST.value: {},
            ConfigParams.STEERING_LIST.value: {},
            ConfigParams.CHANGE_START.value: None,
            ConfigParams.ALT_DIR.value: None,
            ConfigParams.MODEL_METADATA.value: model_metadata,
            ConfigParams.REWARD.value: None,
            ConfigParams.AGENT_NAME.value: 'racecar'
        }
    }

    agent_list = list()
    agent_list.append(create_training_agent(agent_config))

    sm_hyperparams_dict = {}
    graph_manager, _ = get_graph_manager(hp_dict=sm_hyperparams_dict,
                                         agent_list=agent_list,
                                         run_phase_subject=None)

    ds_params_instance = S3BotoDataStoreParameters(
        checkpoint_dict=checkpoint_dict)

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager,
                                               ignore_lock=True)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = LOCAL_MODEL_DIR
    _validate(graph_manager=graph_manager,
              task_parameters=task_parameters,
              transitions=transitions,
              s3_bucket=s3_bucket,
              s3_prefix=s3_prefix,
              aws_region=aws_region)


def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET",
                                               "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_PREFIX",
                                               "sagemaker"))
    parser.add_argument(
        '--num-workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=1)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("APP_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=os.environ.get("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=os.environ.get("MODEL_METADATA_FILE_S3_KEY",
                                               None))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)
    print("S3 bucket: %s" % args.s3_bucket)
    print("S3 prefix: %s" % args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    load_model_metadata(s3_client, args.model_metadata_s3_key,
                        model_metadata_local_path)

    redis_ip = s3_client.get_ip()
    print("Received IP from SageMaker successfully: %s" % redis_ip)

    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        print("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        print("SageMaker hyperparameters not found.")

    preset_file_success = False
    environment_file_success = False
    preset_file_success, environment_file_success = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    if not environment_file_success:
        # Download reward function if environment file is not downloaded
        if not args.reward_file_s3_key:
            raise ValueError("Customer reward S3 key not supplied!")
        download_customer_reward_function(s3_client, args.reward_file_s3_key)
        import markov.environments
        print("Using default environment!")
    else:
        register_custom_environments()
        print("Using custom environment!")

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        print("Using custom preset file!")
    else:
        from markov.sagemaker_graph_manager import get_graph_manager
        graph_manager, _ = get_graph_manager(**sm_hyperparams_dict)

    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type='worker',
        channel=args.s3_prefix)

    graph_manager.agent_params.memory.register_var('memory_backend_params',
                                                   memory_backend_params)

    ds_params_instance = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        aws_region=args.aws_region,
        s3_folder=args.s3_prefix)

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    rollout_worker(
        graph_manager=graph_manager,
        checkpoint_dir=args.checkpoint_dir,
        data_store=data_store,
        num_workers=args.num_workers,
    )


def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET",
                                               "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_PREFIX",
                                               "sagemaker"))
    parser.add_argument(
        '--num-workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=1)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("APP_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=os.environ.get("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=os.environ.get("MODEL_METADATA_FILE_S3_KEY",
                                               None))
    parser.add_argument('--aws_endpoint_url',
                        help='(string) AWS endpoint URL',
                        type=str,
                        default=os.environ.get("AWS_ENDPOINT_URL", None))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region,
                             endpoint_url=args.aws_endpoint_url)
    logger.info("S3 bucket: %s" % args.s3_bucket)
    logger.info("S3 prefix: %s" % args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    load_model_metadata(s3_client, args.model_metadata_s3_key,
                        model_metadata_local_path)

    # Download reward function
    if not args.reward_file_s3_key:
        utils.json_format_logger(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket, args.s3_prefix),
            **utils.build_system_error_dict(
                utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
                utils.SIMAPP_EVENT_ERROR_CODE_500))
        traceback.print_exc()
        utils.simapp_exit_gracefully()
    download_customer_reward_function(s3_client, args.reward_file_s3_key)

    # Register the gym environment; this gives clients the ability to create the environment object
    register(id=defaults.ENV_ID,
             entry_point=defaults.ENTRY_POINT,
             max_episode_steps=defaults.MAX_STEPS,
             reward_threshold=defaults.THRESHOLD)
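
    # Once registered, the environment can be instantiated with
    # gym.make(defaults.ENV_ID); defaults is assumed to define the DeepRacer
    # environment's ID, entry point, and episode limits.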

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s" % redis_ip)

    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        logger.info("SageMaker hyperparameters not found.")

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        from markov.sagemaker_graph_manager import get_graph_manager
        graph_manager, _ = get_graph_manager(**sm_hyperparams_dict)

    logger.info("Connecting to redis at %s:%d" % (redis_ip, args.redis_port))
    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=args.redis_port,
        run_type='worker',
        channel=args.s3_prefix)

    logger.info("Connecting to s3 boto data store at %s" %
                args.aws_endpoint_url)
    ds_params_instance = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        aws_region=args.aws_region,
        s3_folder=args.s3_prefix,
        aws_endpoint_url=args.aws_endpoint_url)

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    rollout_worker(graph_manager=graph_manager,
                   checkpoint_dir=args.checkpoint_dir,
                   data_store=data_store,
                   num_workers=args.num_workers,
                   memory_backend_params=memory_backend_params)
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint-dir',
        help=
        '(string) Path to a local folder containing a checkpoint to write the model to.',
        type=str,
        default='./checkpoint')
    parser.add_argument(
        '--pretrained-checkpoint-dir',
        help=
        '(string) Path to a local folder for downloading a pre-trained model',
        type=str,
        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get(
                            "SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--RLCOACH_PRESET',
                        help='(string) Default preset to use',
                        type=str,
                        default='object_tracker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        required=True)

    args, _ = parser.parse_known_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)

    # Import to register the environment with Gym
    import robomaker.environments

    preset_location = "robomaker.presets.%s:graph_manager" % args.RLCOACH_PRESET
    graph_manager = short_dynamic_import(preset_location,
                                         ignore_module_case=True)

    host_ip_address = get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    print("Uploaded IP address information to S3: %s" % host_ip_address)

    use_pretrained_model = False
    if args.pretrained_s3_bucket and args.pretrained_s3_prefix:
        s3_client_pretrained = SageS3Client(
            bucket=args.pretrained_s3_bucket,
            s3_prefix=args.pretrained_s3_prefix,
            aws_region=args.aws_region)
        s3_client_pretrained.download_model(PRETRAINED_MODEL_DIR)
        use_pretrained_model = True

    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type='trainer',
        channel=args.s3_prefix)

    graph_manager.agent_params.memory.register_var('memory_backend_params',
                                                   memory_backend_params)

    ds_params_instance = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        s3_folder=args.s3_prefix,
        aws_region=args.aws_region)
    graph_manager.data_store_params = ds_params_instance

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    training_worker(graph_manager=graph_manager,
                    checkpoint_dir=args.checkpoint_dir,
                    use_pretrained_model=use_pretrained_model,
                    framework=args.framework)
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--checkpoint-dir',
                        help='(string) Path to a local folder containing a checkpoint to write the model to.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--pretrained-checkpoint-dir',
                        help='(string) Path to a local folder for downloading a pre-trained model',
                        type=str,
                        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--RLCOACH_PRESET',
                        help='(string) Default preset to use',
                        type=str,
                        default='deepracer')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        required=True)

    args, _ = parser.parse_known_args()

    s3_client = SageS3Client(bucket=args.s3_bucket, s3_prefix=args.s3_prefix, aws_region=args.aws_region)

    # Import to register the environment with Gym
    import robomaker.environments

    preset_location = "robomaker.presets.%s:graph_manager" % args.RLCOACH_PRESET
    graph_manager = short_dynamic_import(preset_location, ignore_module_case=True)

    host_ip_address = get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    print("Uploaded IP address information to S3: %s" % host_ip_address)

    use_pretrained_model = False
    if args.pretrained_s3_bucket and args.pretrained_s3_prefix:
        s3_client_pretrained = SageS3Client(bucket=args.pretrained_s3_bucket,
                                            s3_prefix=args.pretrained_s3_prefix,
                                            aws_region=args.aws_region)
        s3_client_pretrained.download_model(PRETRAINED_MODEL_DIR)
        use_pretrained_model = True

    memory_backend_params = RedisPubSubMemoryBackendParameters(redis_address="localhost",
                                                               redis_port=6379,
                                                               run_type='trainer',
                                                               channel=args.s3_prefix)

    graph_manager.agent_params.memory.register_var('memory_backend_params', memory_backend_params)

    ds_params_instance = S3BotoDataStoreParameters(bucket_name=args.s3_bucket,
                                                   checkpoint_dir=args.checkpoint_dir,
                                                   s3_folder=args.s3_prefix,
                                                   aws_region=args.aws_region)
    graph_manager.data_store_params = ds_params_instance

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    training_worker(
        graph_manager=graph_manager,
        checkpoint_dir=args.checkpoint_dir,
        use_pretrained_model=use_pretrained_model,
        framework=args.framework
    )
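# The trainer mains above and the rollout workers elsewhere in this file form
# two halves of one Redis pub/sub memory backend: both sides must agree on the
# channel (the S3 prefix) and port, and differ only in run_type and in which
# address they dial. A minimal sketch, assuming RedisPubSubMemoryBackendParameters
# is imported from rl_coach as in the examples above (addresses illustrative):
trainer_side = RedisPubSubMemoryBackendParameters(redis_address="localhost",
                                                  redis_port=6379,
                                                  run_type='trainer',
                                                  channel='sagemaker')
worker_side = RedisPubSubMemoryBackendParameters(redis_address="10.0.0.5",
                                                 redis_port=6379,
                                                 run_type='worker',
                                                 channel='sagemaker')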
def main():
    screen.set_use_colors(False)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX",
                                                "sagemaker"))
    parser.add_argument(
        '--num-workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=1)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY",
                                                None))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)
    logger.info("S3 bucket: %s" % args.s3_bucket)
    logger.info("S3 prefix: %s" % args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key,
                              model_metadata_local_path)

    # Download and import reward function
    if not args.reward_file_s3_key:
        utils.log_and_exit(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket, args.s3_prefix),
            utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
            utils.SIMAPP_EVENT_ERROR_CODE_500)
    download_customer_reward_function(s3_client, args.reward_file_s3_key)

    try:
        from custom_files.customer_reward_function import reward_function
    except Exception as e:
        utils.log_and_exit(
            "Failed to import user's reward_function: {}".format(e),
            utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
            utils.SIMAPP_EVENT_ERROR_CODE_400)

    # Instantiate Cameras
    configure_camera()

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s" % redis_ip)

    # Download hyperparameters from SageMaker
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        logger.info("SageMaker hyperparameters not found.")

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    # TODO: each agent should have its own config
    _, _, version = utils_parse_model_metadata.parse_model_metadata(
        model_metadata_local_path)
    agent_config = {
        'model_metadata': model_metadata_local_path,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value:
            LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value:
            VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value:
            STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
            utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
            utils.str2bool(
                rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.ACTION_SPACE_PATH.value:
            'custom_files/model_metadata.json',
            ConfigParams.REWARD.value:
            reward_function,
            ConfigParams.AGENT_NAME.value:
            'racecar',
            ConfigParams.VERSION.value:
            version
        }
    }

    # TODO: each agent should have its own S3 bucket
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value:
        rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value:
        rospy.get_param('METRICS_S3_OBJECT_KEY'),
        MetricsS3Keys.REGION.value:
        rospy.get_param('AWS_REGION'),
        MetricsS3Keys.STEP_BUCKET.value:
        rospy.get_param('SAGEMAKER_SHARED_S3_BUCKET'),
        MetricsS3Keys.STEP_KEY.value:
        os.path.join(rospy.get_param('SAGEMAKER_SHARED_S3_PREFIX'),
                     TRAINING_SIMTRACE_DATA_S3_OBJECT_KEY)
    }

    agent_list = list()
    agent_list.append(
        create_rollout_agent(agent_config, TrainingMetrics(metrics_s3_config)))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(sm_hyperparams_dict, agent_list)

    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        s3_folder=args.s3_prefix)

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   data_store=data_store,
                   num_workers=args.num_workers,
                   task_parameters=task_parameters)
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk',
                        '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument(
        '-ek',
        '--environment_s3_key',
        help="(string) Name of an environment file to download from S3",
        type=str,
        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to write the model to.',
        type=str,
        default='./checkpoint')
    parser.add_argument(
        '--pretrained_checkpoint_dir',
        help='(string) Path to a folder for downloading a pre-trained model',
        type=str,
        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get(
                            "SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    args, _ = parser.parse_known_args()

    s3_client = S3Client(region_name=args.aws_region, max_retry_attempts=0)

    # download model metadata
    # TODO: replace 'agent' with name of each agent
    model_metadata_download = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=args.model_metadata_s3_key,
        region_name=args.aws_region,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_info = model_metadata_download.get_model_metadata_info()
    network_type = model_metadata_info[ModelMetadataKeys.NEURAL_NETWORK.value]
    version = model_metadata_info[ModelMetadataKeys.VERSION.value]

    # upload model metadata
    model_metadata_upload = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=get_s3_key(args.s3_prefix, MODEL_METADATA_S3_POSTFIX),
        region_name=args.aws_region,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_upload.persist(
        s3_kms_extra_args=utils.get_s3_kms_extra_args())

    shutil.copy2(model_metadata_download.local_path, SM_MODEL_OUTPUT_DIR)

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        try:
            s3_client.download_file(bucket=args.s3_bucket,
                                    s3_key=args.preset_s3_key,
                                    local_path=preset_local_path)
            success_custom_preset = True
        except botocore.exceptions.ClientError:
            pass
        if not success_custom_preset:
            logger.info(
                "Could not download the preset file. Using the default DeepRacer preset."
            )
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location,
                                                 ignore_module_case=True)
            s3_client.upload_file(
                bucket=args.s3_bucket,
                s3_key=os.path.normpath("%s/presets/preset.py" %
                                        args.s3_prefix),
                local_path=preset_local_path,
                s3_kms_extra_args=utils.get_s3_kms_extra_args())
            logger.info("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}

        # TODO: each agent should have its own config
        agent_config = {
            'model_metadata': model_metadata_download,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [],
                ConfigParams.VELOCITY_LIST.value: {},
                ConfigParams.STEERING_LIST.value: {},
                ConfigParams.CHANGE_START.value: None,
                ConfigParams.ALT_DIR.value: None,
                ConfigParams.MODEL_METADATA.value: model_metadata_download,
                ConfigParams.REWARD.value: None,
                ConfigParams.AGENT_NAME.value: 'racecar'
            }
        }

        agent_list = list()
        agent_list.append(create_training_agent(agent_config))

        graph_manager, robomaker_hyperparams_json = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=None,
            run_type=str(RunType.TRAINER))

        # Upload hyperparameters to SageMaker shared s3 bucket
        hyperparameters = Hyperparameters(bucket=args.s3_bucket,
                                          s3_key=get_s3_key(
                                              args.s3_prefix,
                                              HYPERPARAMETER_S3_POSTFIX),
                                          region_name=args.aws_region)
        hyperparameters.persist(
            hyperparams_json=robomaker_hyperparams_json,
            s3_kms_extra_args=utils.get_s3_kms_extra_args())

        # Attach sample collector to graph_manager only if sample count > 0
        max_sample_count = int(sm_hyperparams_dict.get("max_sample_count", 0))
        if max_sample_count > 0:
            sample_collector = SampleCollector(
                bucket=args.s3_bucket,
                s3_prefix=args.s3_prefix,
                region_name=args.aws_region,
                max_sample_count=max_sample_count,
                sampling_frequency=int(
                    sm_hyperparams_dict.get("sampling_frequency", 1)))
            graph_manager.sample_collector = sample_collector

    # persist IP config from sagemaker to s3
    ip_config = IpConfig(bucket=args.s3_bucket,
                         s3_prefix=args.s3_prefix,
                         region_name=args.aws_region)
    ip_config.persist(s3_kms_extra_args=utils.get_s3_kms_extra_args())

    training_algorithm = model_metadata_download.training_algorithm
    output_head_format = FROZEN_HEAD_OUTPUT_GRAPH_FORMAT_MAPPING[
        training_algorithm]

    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    # Handle backward compatibility
    if use_pretrained_model:
        # checkpoint s3 instance for pretrained model
        # TODO: replace 'agent' for multiagent training
        checkpoint = Checkpoint(bucket=args.pretrained_s3_bucket,
                                s3_prefix=args.pretrained_s3_prefix,
                                region_name=args.aws_region,
                                agent_name='agent',
                                checkpoint_dir=args.pretrained_checkpoint_dir,
                                output_head_format=output_head_format)
        # make coach checkpoint compatible
        if (version < SIMAPP_VERSION_2
                and not checkpoint.rl_coach_checkpoint.is_compatible()):
            checkpoint.rl_coach_checkpoint.make_compatible(
                checkpoint.syncfile_ready)
        # get best model checkpoint string
        model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()
        # Select the best checkpoint model by uploading rl coach .coach_checkpoint file
        checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args())
        # add checkpoint into checkpoint_dict
        checkpoint_dict = {'agent': checkpoint}
        # load pretrained model
        ds_params_instance_pretrained = S3BotoDataStoreParameters(
            checkpoint_dict=checkpoint_dict)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained,
                                                graph_manager, True)
        data_store_pretrained.load_from_store()

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type=str(RunType.TRAINER),
        channel=args.s3_prefix,
        network_type=network_type)

    graph_manager.memory_backend_params = memory_backend_params

    # checkpoint s3 instance for training model
    checkpoint = Checkpoint(bucket=args.s3_bucket,
                            s3_prefix=args.s3_prefix,
                            region_name=args.aws_region,
                            agent_name='agent',
                            checkpoint_dir=args.checkpoint_dir,
                            output_head_format=output_head_format)
    checkpoint_dict = {'agent': checkpoint}
    ds_params_instance = S3BotoDataStoreParameters(
        checkpoint_dict=checkpoint_dict)

    graph_manager.data_store_params = ds_params_instance

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    hyperparams = json.loads(robomaker_hyperparams_json)
    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=hyperparams["batch_size"],
        user_episode_per_rollout=hyperparams["num_episodes_between_training"],
        training_algorithm=training_algorithm)
def main():
    screen.set_use_colors(False)

    try:
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '-pk',
            '--preset_s3_key',
            help="(string) Name of a preset to download from S3",
            type=str,
            required=False)
        parser.add_argument(
            '-ek',
            '--environment_s3_key',
            help="(string) Name of an environment file to download from S3",
            type=str,
            required=False)
        parser.add_argument('--model_metadata_s3_key',
                            help="(string) Model Metadata File S3 Key",
                            type=str,
                            required=False)
        parser.add_argument(
            '-c',
            '--checkpoint-dir',
            help=
            '(string) Path to a folder containing a checkpoint to write the model to.',
            type=str,
            default='./checkpoint')
        parser.add_argument(
            '--pretrained-checkpoint-dir',
            help=
            '(string) Path to a folder for downloading a pre-trained model',
            type=str,
            default=PRETRAINED_MODEL_DIR)
        parser.add_argument('--s3_bucket',
                            help='(string) S3 bucket',
                            type=str,
                            default=os.environ.get(
                                "SAGEMAKER_SHARED_S3_BUCKET_PATH",
                                "gsaur-test"))
        parser.add_argument('--s3_prefix',
                            help='(string) S3 prefix',
                            type=str,
                            default='sagemaker')
        parser.add_argument('--framework',
                            help='(string) tensorflow or mxnet',
                            type=str,
                            default='tensorflow')
        parser.add_argument('--pretrained_s3_bucket',
                            help='(string) S3 bucket for pre-trained model',
                            type=str)
        parser.add_argument('--pretrained_s3_prefix',
                            help='(string) S3 prefix for pre-trained model',
                            type=str,
                            default='sagemaker')
        parser.add_argument('--aws_region',
                            help='(string) AWS region',
                            type=str,
                            default=os.environ.get("AWS_REGION", "us-east-1"))

        args, _ = parser.parse_known_args()
        start_redis_server()

        s3_client = SageS3Client(bucket=args.s3_bucket,
                                 s3_prefix=args.s3_prefix,
                                 aws_region=args.aws_region)

        # Load the model metadata
        model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                                 'model_metadata.json')
        load_model_metadata(s3_client, args.model_metadata_s3_key,
                            model_metadata_local_path)
        s3_client.upload_file(
            os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix),
            model_metadata_local_path)
        shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

        # Register the gym environment; this gives clients the ability to create the environment object
        register(id=defaults.ENV_ID,
                 entry_point=defaults.ENTRY_POINT,
                 max_episode_steps=defaults.MAX_STEPS,
                 reward_threshold=defaults.THRESHOLD)

        user_batch_size, user_episode_per_rollout = None, None
        success_custom_preset = False
        if args.preset_s3_key:
            preset_local_path = "./markov/presets/preset.py"
            success_custom_preset = s3_client.download_file(
                s3_key=args.preset_s3_key, local_path=preset_local_path)
            if not success_custom_preset:
                logger.info(
                    "Could not download the preset file. Using the default DeepRacer preset."
                )
            else:
                preset_location = "markov.presets.preset:graph_manager"
                graph_manager = short_dynamic_import(preset_location,
                                                     ignore_module_case=True)
                success_custom_preset = s3_client.upload_file(
                    s3_key=os.path.normpath("%s/presets/preset.py" %
                                            args.s3_prefix),
                    local_path=preset_local_path)
                if success_custom_preset:
                    agent_param_loc = "markov.presets.preset:agent_params"
                    agent_params = short_dynamic_import(
                        agent_param_loc, ignore_module_case=True)
                    user_batch_size = agent_params.network_wrappers[
                        'main'].batch_size
                    user_episode_per_rollout = agent_params.algorithm.num_consecutive_playing_steps.num_steps
                    logger.info("Using preset: %s" % args.preset_s3_key)

        if not success_custom_preset:
            from markov.sagemaker_graph_manager import get_graph_manager
            params_blob = os.environ.get('SM_TRAINING_ENV', '')
            if params_blob:
                params = json.loads(params_blob)
                sm_hyperparams_dict = params["hyperparameters"]
            else:
                sm_hyperparams_dict = {}
            graph_manager, robomaker_hyperparams_json = get_graph_manager(
                **sm_hyperparams_dict)
            robomaker_hyperparams = json.loads(robomaker_hyperparams_json)
            user_batch_size = robomaker_hyperparams["batch_size"]
            user_episode_per_rollout = robomaker_hyperparams[
                "num_episodes_between_training"]
            s3_client.upload_hyperparameters(robomaker_hyperparams_json)
            logger.info("Uploaded hyperparameters.json to S3")

        host_ip_address = get_ip_from_host()
        s3_client.write_ip_config(host_ip_address)
        logger.info("Uploaded IP address information to S3: %s" %
                    host_ip_address)
        use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
        if use_pretrained_model:
            s3_client_pretrained = SageS3Client(
                bucket=args.pretrained_s3_bucket,
                s3_prefix=args.pretrained_s3_prefix,
                aws_region=args.aws_region)
            s3_client_pretrained.download_model(args.pretrained_checkpoint_dir)

        memory_backend_params = RedisPubSubMemoryBackendParameters(
            redis_address="localhost",
            redis_port=6379,
            run_type='trainer',
            channel=args.s3_prefix)

        ds_params_instance = S3BotoDataStoreParameters(
            bucket_name=args.s3_bucket,
            checkpoint_dir=args.checkpoint_dir,
            aws_region=args.aws_region,
            s3_folder=args.s3_prefix)
        graph_manager.data_store_params = ds_params_instance

        data_store = S3BotoDataStore(ds_params_instance)
        data_store.graph_manager = graph_manager
        graph_manager.data_store = data_store

        training_worker(graph_manager=graph_manager,
                        checkpoint_dir=args.checkpoint_dir,
                        use_pretrained_model=use_pretrained_model,
                        framework=args.framework,
                        memory_backend_params=memory_backend_params,
                        user_batch_size=user_batch_size,
                        user_episode_per_rollout=user_episode_per_rollout)
    except Exception as ex:
        utils.json_format_logger(
            "Training worker exited with exception: {}".format(ex),
            **utils.build_system_error_dict(
                utils.SIMAPP_TRAINING_WORKER_EXCEPTION,
                utils.SIMAPP_EVENT_ERROR_CODE_500))
        utils.simapp_exit_gracefully()
    def get_config_args(self,
                        parser: argparse.ArgumentParser) -> argparse.Namespace:
        """
        Returns a Namespace object with all the user-specified configuration options needed to launch.
        This implementation uses argparse to take arguments from the CLI, but this can be overridden by
        another method that gets its configuration from elsewhere. An equivalent method, however, must
        return an identically structured Namespace object, which conforms to the structure defined by
        get_argument_parser.

        This method parses the arguments that the user entered, performs basic validation, and
        expands short-form user-specified values into their more explicit forms.

        :param parser: a parser object which implicitly defines the format of the Namespace that
                       is expected to be returned.
        :return: the parsed arguments as a Namespace
        """
        args = parser.parse_args()

        if args.nocolor:
            screen.set_use_colors(False)

        # if no arg is given
        if len(sys.argv) == 1:
            parser.print_help()
            sys.exit(1)

        # list available presets
        if args.list:
            self.display_all_presets_and_exit()

        # Read args from config file for distributed Coach.
        if args.distributed_coach and args.distributed_coach_run_type == RunType.ORCHESTRATOR:
            coach_config = ConfigParser({
                'image': '',
                'memory_backend': 'redispubsub',
                'data_store': 's3',
                's3_end_point': 's3.amazonaws.com',
                's3_bucket_name': '',
                's3_creds_file': ''
            })
            try:
                coach_config.read(args.distributed_coach_config_path)
                args.image = coach_config.get('coach', 'image')
                args.memory_backend = coach_config.get('coach',
                                                       'memory_backend')
                args.data_store = coach_config.get('coach', 'data_store')
                if args.data_store == 's3':
                    args.s3_end_point = coach_config.get(
                        'coach', 's3_end_point')
                    args.s3_bucket_name = coach_config.get(
                        'coach', 's3_bucket_name')
                    args.s3_creds_file = coach_config.get(
                        'coach', 's3_creds_file')
            except Error as e:
                screen.error(
                    "Error when reading distributed Coach config file: {}".
                    format(e))

            if args.image == '':
                screen.error("Image cannot be empty.")

            data_store_choices = ['s3', 'nfs']
            if args.data_store not in data_store_choices:
                screen.warning("{} data store is unsupported.".format(
                    args.data_store))
                screen.error(
                    "Supported data stores are {}.".format(data_store_choices))

            memory_backend_choices = ['redispubsub']
            if args.memory_backend not in memory_backend_choices:
                screen.warning("{} memory backend is not supported.".format(
                    args.memory_backend))
                screen.error("Supported memory backends are {}.".format(
                    memory_backend_choices))

            if args.data_store == 's3':
                if args.s3_bucket_name == '':
                    screen.error("S3 bucket name cannot be empty.")
                if args.s3_creds_file == '':
                    args.s3_creds_file = None

        if args.play and args.distributed_coach:
            screen.error("Playing is not supported in distributed Coach.")

        # replace a short preset name with the full path
        if args.preset is not None:
            args.preset = self.expand_preset(args.preset)

        # validate the checkpoints args
        if args.checkpoint_restore_dir is not None and not os.path.exists(
                args.checkpoint_restore_dir):
            screen.error(
                "The requested checkpoint folder to load from does not exist.")

        # validate the checkpoints args
        if args.checkpoint_restore_file is not None and not glob(
                args.checkpoint_restore_file + '*'):
            screen.error(
                "The requested checkpoint file to load from does not exist.")

        # no preset was given. check if the user requested to play some environment on its own
        if args.preset is None and args.play and not args.environment_type:
            screen.error(
                'When no preset is given for Coach to run, and the user requests human control over '
                'the environment, the user is expected to input the desired environment_type and level.'
                '\nAt least one of these parameters was not given.')
        elif args.preset and args.play:
            screen.error(
                "Both the --preset and the --play flags were set. These flags can not be used together. "
                "For human control, please use the --play flag together with the environment type flag (-et)"
            )
        elif args.preset is None and not args.play:
            screen.error(
                "Please choose a preset using the -p flag or use the --play flag together with choosing an "
                "environment type (-et) in order to play the game.")

        # get experiment name and path
        args.experiment_name = logger.get_experiment_name(args.experiment_name)
        args.experiment_path = logger.get_experiment_path(args.experiment_name)

        if args.play and args.num_workers > 1:
            screen.warning(
                "Playing the game as a human is only available with a single worker. "
                "The number of workers will be reduced to 1")
            args.num_workers = 1

        args.framework = Frameworks[args.framework.lower()]

        # checkpoints
        args.checkpoint_save_dir = os.path.join(
            args.experiment_path,
            'checkpoint') if args.checkpoint_save_secs is not None else None

        if args.export_onnx_graph and not args.checkpoint_save_secs:
            screen.warning(
                "Exporting ONNX graphs requires setting the --checkpoint_save_secs flag. "
                "The --export_onnx_graph will have no effect.")

        return args
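# A hedged usage sketch for get_config_args; the launcher class name and the
# start_graph call are assumptions inferred from the docstring (only
# get_argument_parser is named there), not names confirmed by the source:
#
#   launcher = CoachLauncher()
#   parser = launcher.get_argument_parser()  # defines the Namespace structure
#   args = launcher.get_config_args(parser)  # parse, validate, expand shorthands
#   start_graph(args)                        # hypothetical downstream call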
def main():
    screen.set_use_colors(False)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX",
                                                "sagemaker"))
    parser.add_argument(
        '--num_workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=int(rospy.get_param("NUM_WORKERS", 1)))
    parser.add_argument('--rollout_idx',
                        help="(int) The index of current rollout worker",
                        type=int,
                        default=0)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY",
                                                None))
    # For a training job, reset is not allowed; penalty_seconds, off_track_penalty, and
    # collision_penalty will all be 0 by default
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 0.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "TRAINING"))
    parser.add_argument('--is_continuous',
                        help='(boolean) is continuous after lap completion',
                        type=utils.str2bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY",
                                                      0.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY",
                                                      0.0)))

    args = parser.parse_args()

    logger.info("S3 bucket: %s", args.s3_bucket)
    logger.info("S3 prefix: %s", args.s3_prefix)

    # Download and import reward function
    # TODO: replace 'agent' with name of each agent for multi-agent training
    reward_function_file = RewardFunction(
        bucket=args.s3_bucket,
        s3_key=args.reward_file_s3_key,
        region_name=args.aws_region,
        local_path=REWARD_FUCTION_LOCAL_PATH_FORMAT.format('agent'))
    reward_function = reward_function_file.get_reward_function()

    # Instantiate Cameras
    configure_camera(namespaces=['racecar'])

    preset_file_success, _ = download_custom_files_if_present(
        s3_bucket=args.s3_bucket,
        s3_prefix=args.s3_prefix,
        aws_region=args.aws_region)

    # download model metadata
    # TODO: replace 'agent' with name of each agent
    model_metadata = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=args.model_metadata_s3_key,
        region_name=args.aws_region,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_info = model_metadata.get_model_metadata_info()
    version = model_metadata_info[ModelMetadataKeys.VERSION.value]

    agent_config = {
        'model_metadata': model_metadata,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value:
            LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value:
            VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value:
            STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
            utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
            utils.str2bool(
                rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.MODEL_METADATA.value:
            model_metadata,
            ConfigParams.REWARD.value:
            reward_function,
            ConfigParams.AGENT_NAME.value:
            'racecar',
            ConfigParams.VERSION.value:
            version,
            ConfigParams.NUMBER_OF_RESETS.value:
            args.number_of_resets,
            ConfigParams.PENALTY_SECONDS.value:
            args.penalty_seconds,
            ConfigParams.NUMBER_OF_TRIALS.value:
            None,
            ConfigParams.IS_CONTINUOUS.value:
            args.is_continuous,
            ConfigParams.RACE_TYPE.value:
            args.race_type,
            ConfigParams.COLLISION_PENALTY.value:
            args.collision_penalty,
            ConfigParams.OFF_TRACK_PENALTY.value:
            args.off_track_penalty
        }
    }

    # TODO: each agent should have its own S3 bucket
    metrics_key = rospy.get_param('METRICS_S3_OBJECT_KEY')
    if args.num_workers > 1 and args.rollout_idx > 0:
        key_tuple = os.path.splitext(metrics_key)
        metrics_key = "{}_{}{}".format(key_tuple[0], str(args.rollout_idx),
                                       key_tuple[1])
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value:
        rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value: metrics_key,
        MetricsS3Keys.REGION.value: rospy.get_param('AWS_REGION')
    }

    run_phase_subject = RunPhaseSubject()

    agent_list = list()

    # checkpoint s3 instance
    # TODO: replace 'agent' with agent_0, agent_1, ... for multi-agent training
    checkpoint = Checkpoint(bucket=args.s3_bucket,
                            s3_prefix=args.s3_prefix,
                            region_name=args.aws_region,
                            agent_name='agent',
                            checkpoint_dir=args.checkpoint_dir)

    agent_list.append(
        create_rollout_agent(
            agent_config,
            TrainingMetrics(
                agent_name='agent',
                s3_dict_metrics=metrics_s3_config,
                deepracer_checkpoint_json=checkpoint.deepracer_checkpoint_json,
                ckpnt_dir=os.path.join(args.checkpoint_dir, 'agent'),
                run_phase_sink=run_phase_subject,
                use_model_picker=(args.rollout_idx == 0)), run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())
    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)

    aws_region = rospy.get_param('AWS_REGION', args.aws_region)
    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET',
                                    None) if args.rollout_idx == 0 else None
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        if args.num_workers > 1:
            simtrace_s3_object_prefix = os.path.join(simtrace_s3_object_prefix,
                                                     str(args.rollout_idx))
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')

    simtrace_video_s3_writers = []
    #TODO: replace 'agent' with 'agent_0' for multi agent training and
    # mp4_s3_object_prefix, mp4_s3_bucket will be a list, so need to access with index
    if simtrace_s3_bucket:
        simtrace_video_s3_writers.append(
            SimtraceVideo(
                upload_type=SimtraceVideoNames.SIMTRACE_TRAINING.value,
                bucket=simtrace_s3_bucket,
                s3_prefix=simtrace_s3_object_prefix,
                region_name=aws_region,
                local_path=SIMTRACE_TRAINING_LOCAL_PATH_FORMAT.format(
                    'agent')))
    if mp4_s3_bucket:
        simtrace_video_s3_writers.extend([
            SimtraceVideo(
                upload_type=SimtraceVideoNames.PIP.value,
                bucket=mp4_s3_bucket,
                s3_prefix=mp4_s3_object_prefix,
                region_name=aws_region,
                local_path=CAMERA_PIP_MP4_LOCAL_PATH_FORMAT.format('agent')),
            SimtraceVideo(
                upload_type=SimtraceVideoNames.DEGREE45.value,
                bucket=mp4_s3_bucket,
                s3_prefix=mp4_s3_object_prefix,
                region_name=aws_region,
                local_path=CAMERA_45DEGREE_LOCAL_PATH_FORMAT.format('agent')),
            SimtraceVideo(
                upload_type=SimtraceVideoNames.TOPVIEW.value,
                bucket=mp4_s3_bucket,
                s3_prefix=mp4_s3_object_prefix,
                region_name=aws_region,
                local_path=CAMERA_TOPVIEW_LOCAL_PATH_FORMAT.format('agent'))
        ])

    # TODO: replace 'agent' with specific agent name for multi agent training
    ip_config = IpConfig(bucket=args.s3_bucket,
                         s3_prefix=args.s3_prefix,
                         region_name=args.aws_region,
                         local_path=IP_ADDRESS_LOCAL_PATH.format('agent'))
    redis_ip = ip_config.get_ip_config()

    # Download hyperparameters from SageMaker shared s3 bucket
    # TODO: replace 'agent' with name of each agent
    hyperparameters = Hyperparameters(
        bucket=args.s3_bucket,
        s3_key=get_s3_key(args.s3_prefix, HYPERPARAMETER_S3_POSTFIX),
        region_name=args.aws_region,
        local_path=HYPERPARAMETER_LOCAL_PATH_FORMAT.format('agent'))
    sm_hyperparams_dict = hyperparameters.get_hyperparameters_dict()

    enable_domain_randomization = utils.str2bool(
        rospy.get_param('ENABLE_DOMAIN_RANDOMIZATION', False))
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=run_phase_subject,
            enable_domain_randomization=enable_domain_randomization,
            pause_physics=pause_physics,
            unpause_physics=unpause_physics)

    # If num_episodes_between_training is smaller than num_workers then cancel worker early.
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    # Reduce number of workers if allocated more than num_episodes_between_training
    if args.num_workers > episode_steps_per_rollout:
        logger.info(
            "Excess worker allocated. Reducing from {} to {}...".format(
                args.num_workers, episode_steps_per_rollout))
        args.num_workers = episode_steps_per_rollout
    if args.rollout_idx >= episode_steps_per_rollout or args.rollout_idx >= args.num_workers:
        err_msg_format = "Exiting excess worker..."
        err_msg_format += "(rollout_idx[{}] >= num_workers[{}] or num_episodes_between_training[{}])"
        logger.info(
            err_msg_format.format(args.rollout_idx, args.num_workers,
                                  episode_steps_per_rollout))
        # Close down the job
        utils.cancel_simulation_job()

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix,
        num_workers=args.num_workers,
        rollout_idx=args.rollout_idx)

    graph_manager.memory_backend_params = memory_backend_params

    checkpoint_dict = {'agent': checkpoint}
    ds_params_instance = S3BotoDataStoreParameters(
        checkpoint_dict=checkpoint_dict)

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   num_workers=args.num_workers,
                   rollout_idx=args.rollout_idx,
                   task_parameters=task_parameters,
                   simtrace_video_s3_writers=simtrace_video_s3_writers,
                   pause_physics=pause_physics,
                   unpause_physics=unpause_physics)
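# Worked example of the excess-worker logic above (numbers illustrative): with
# num_workers=8 and num_episodes_between_training=5, the pool is trimmed to 5
# workers, and any worker launched with rollout_idx >= 5 logs the exit message
# and calls utils.cancel_simulation_job() to shut itself down.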
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk',
                        '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument(
        '-ek',
        '--environment_s3_key',
        help="(string) Name of an environment file to download from S3",
        type=str,
        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to write the model to.',
        type=str,
        default='./checkpoint')
    parser.add_argument(
        '--pretrained_checkpoint_dir',
        help='(string) Path to a folder for downloading a pre-trained model',
        type=str,
        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get(
                            "SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    args, _ = parser.parse_known_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    load_model_metadata(s3_client, args.model_metadata_s3_key,
                        model_metadata_local_path)
    s3_client.upload_file(
        os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix),
        model_metadata_local_path)
    shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

    success_custom_environment = False
    if args.environment_s3_key:
        environment_local_path = "./markov/environments/deepracer_racetrack_env.py"
        success_custom_environment = s3_client.download_file(
            s3_key=args.environment_s3_key, local_path=environment_local_path)
        if not success_custom_environment:
            print(
                "Could not download the environment file. Using the default DeepRacer environment."
            )
        else:
            success_custom_environment = s3_client.upload_file(
                s3_key=os.path.normpath(
                    "%s/environments/deepracer_racetrack_env.py" %
                    args.s3_prefix),
                local_path=environment_local_path)
            if success_custom_environment:
                print("Using environment: %s" % args.environment_s3_key)

    # Import to register the environment with Gym
    import markov.environments

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        success_custom_preset = s3_client.download_file(
            s3_key=args.preset_s3_key, local_path=preset_local_path)
        if not success_custom_preset:
            print(
                "Could not download the preset file. Using the default DeepRacer preset."
            )
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location,
                                                 ignore_module_case=True)
            success_custom_preset = s3_client.upload_file(
                s3_key=os.path.normpath("%s/presets/preset.py" %
                                        args.s3_prefix),
                local_path=preset_local_path)
            if success_custom_preset:
                print("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        from markov.sagemaker_graph_manager import get_graph_manager
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}
        graph_manager, robomaker_hyperparams_json = get_graph_manager(
            **sm_hyperparams_dict)
        s3_client.upload_hyperparameters(robomaker_hyperparams_json)
        print("Uploaded hyperparameters.json to S3")

    host_ip_address = get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    print("Uploaded IP address information to S3: %s" % host_ip_address)

    use_pretrained_model = False
    if args.pretrained_s3_bucket and args.pretrained_s3_prefix:
        s3_client_pretrained = SageS3Client(
            bucket=args.pretrained_s3_bucket,
            s3_prefix=args.pretrained_s3_prefix,
            aws_region=args.aws_region)
        s3_client_pretrained.download_model(args.pretrained_checkpoint_dir)
        use_pretrained_model = True

    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type='trainer',
        channel=args.s3_prefix)
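    # Counterpart of the rollout worker's backend above: same channel (the S3
    # prefix), but the trainer connects to its local Redis instance instead of
    # a remote redis_ip.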

    graph_manager.agent_params.memory.register_var('memory_backend_params',
                                                   memory_backend_params)

    ds_params_instance = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        aws_region=args.aws_region,
        s3_folder=args.s3_prefix)
    graph_manager.data_store_params = ds_params_instance

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    training_worker(graph_manager=graph_manager,
                    checkpoint_dir=args.checkpoint_dir,
                    use_pretrained_model=use_pretrained_model,
                    framework=args.framework)
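
For context, here is a minimal sketch of the SM_TRAINING_ENV payload that the fallback branch above parses. SageMaker injects this JSON into the training container; only the "hyperparameters" key is consumed here, and the hyperparameter names below are assumptions for illustration:

import json
import os

# Hypothetical payload; a real job carries the estimator's hyperparameters.
os.environ['SM_TRAINING_ENV'] = json.dumps(
    {"hyperparameters": {"batch_size": "64", "num_epochs": "10"}})
params = json.loads(os.environ['SM_TRAINING_ENV'])
sm_hyperparams_dict = params["hyperparameters"]  # forwarded to get_graph_manager(**...)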
import argparse
import copy
import glob
import logging
import os
import re
import shutil
import sys

import rl_coach.core_types
from rl_coach import logger
from rl_coach.logger import screen
from rl_coach.coach import CoachLauncher

from .configuration_list import ConfigurationList

screen.set_use_colors(False)  # Simple text logging so it looks good in CloudWatch

class CoachConfigurationList(ConfigurationList):
    """Helper Object for converting CLI arguments (or SageMaker hyperparameters) 
    into Coach configuration.
    """

    # Security: only instantiate types from this allowlist, never an arbitrary
    # string the customer passes in.
    ALLOWED_TYPES = {
        'Frames': rl_coach.core_types.Frames,
        'EnvironmentSteps': rl_coach.core_types.EnvironmentSteps,
        'EnvironmentEpisodes': rl_coach.core_types.EnvironmentEpisodes,
        'TrainingSteps': rl_coach.core_types.TrainingSteps,
        'Time': rl_coach.core_types.Time,
    }
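
A minimal sketch, assuming the class above, of how ALLOWED_TYPES can safely turn a customer-supplied string into a Coach schedule value; the parse_schedule helper and the 'TypeName:count' format are hypothetical, not part of this module:

def parse_schedule(spec):
    """Hypothetical helper: turn e.g. 'EnvironmentEpisodes:40' into
    rl_coach.core_types.EnvironmentEpisodes(40) without using eval()."""
    type_name, _, count = spec.partition(':')
    cls = CoachConfigurationList.ALLOWED_TYPES.get(type_name)
    if cls is None:
        raise ValueError("Schedule type %r is not in ALLOWED_TYPES" % type_name)
    return cls(int(count))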