Example #1
0
    def _setup_graph_manager(self, checkpoint, agent_list):
        """Sets up graph manager based on the checkpoint file and agents list.

        Args:
            checkpoint (Checkpoint): The model checkpoints we just downloaded.
            agent_list (list[Agent]): List of agents we want to setup graph manager for.
        """
        sm_hyperparams_dict = {}
        self._current_graph_manager, _ = get_graph_manager(hp_dict=sm_hyperparams_dict, agent_list=agent_list,
                                                           run_phase_subject=self._run_phase_subject,
                                                           enable_domain_randomization=self._enable_domain_randomization,
                                                           done_condition=self._done_condition,
                                                           pause_physics=self._model_updater.pause_physics_service,
                                                           unpause_physics=self._model_updater.unpause_physics_service)
        checkpoint_dict = dict()
        checkpoint_dict[self._agent_name] = checkpoint
        ds_params_instance = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict)

        self._current_graph_manager.data_store = S3BotoDataStore(params=ds_params_instance,
                                                                 graph_manager=self._current_graph_manager,
                                                                 ignore_lock=True,
                                                                 log_and_cont=True)
        self._current_graph_manager.env_params.seed = 0

        self._current_graph_manager.data_store.wait_for_checkpoints()
        self._current_graph_manager.data_store.modify_checkpoint_variables()

        task_parameters = TaskParameters()
        task_parameters.checkpoint_restore_path = self._local_model_directory

        self._current_graph_manager.create_graph(task_parameters=task_parameters,
                                                 stop_physics=self._model_updater.pause_physics_service,
                                                 start_physics=self._model_updater.unpause_physics_service,
                                                 empty_service_call=EmptyRequest)
def main():
    screen.set_use_colors(False)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX",
                                                "sagemaker"))
    parser.add_argument(
        '--num_workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=int(rospy.get_param("NUM_WORKERS", 1)))
    parser.add_argument('--rollout_idx',
                        help="(int) The index of current rollout worker",
                        type=int,
                        default=0)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY",
                                                None))
    # For training job, reset is not allowed. penalty_seconds, off_track_penalty, and
    # collision_penalty will all be 0 be default
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 0.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "TRAINING"))
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY",
                                                      0.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY",
                                                      0.0)))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)
    logger.info("S3 bucket: %s", args.s3_bucket)
    logger.info("S3 prefix: %s", args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key,
                              model_metadata_local_path)

    # Download and import reward function
    if not args.reward_file_s3_key:
        log_and_exit(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket, args.s3_prefix),
            SIMAPP_SIMULATION_WORKER_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500)
    download_customer_reward_function(s3_client, args.reward_file_s3_key)

    try:
        from custom_files.customer_reward_function import reward_function
    except Exception as e:
        log_and_exit("Failed to import user's reward_function: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)

    # Instantiate Cameras
    configure_camera(namespaces=['racecar'])

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    #! TODO each agent should have own config
    _, _, version = utils_parse_model_metadata.parse_model_metadata(
        model_metadata_local_path)
    agent_config = {
        'model_metadata': model_metadata_local_path,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value:
            LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value:
            VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value:
            STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
            utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
            utils.str2bool(
                rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.ACTION_SPACE_PATH.value:
            'custom_files/model_metadata.json',
            ConfigParams.REWARD.value:
            reward_function,
            ConfigParams.AGENT_NAME.value:
            'racecar',
            ConfigParams.VERSION.value:
            version,
            ConfigParams.NUMBER_OF_RESETS.value:
            args.number_of_resets,
            ConfigParams.PENALTY_SECONDS.value:
            args.penalty_seconds,
            ConfigParams.NUMBER_OF_TRIALS.value:
            None,
            ConfigParams.IS_CONTINUOUS.value:
            args.is_continuous,
            ConfigParams.RACE_TYPE.value:
            args.race_type,
            ConfigParams.COLLISION_PENALTY.value:
            args.collision_penalty,
            ConfigParams.OFF_TRACK_PENALTY.value:
            args.off_track_penalty
        }
    }

    #! TODO each agent should have own s3 bucket
    step_metrics_prefix = rospy.get_param('SAGEMAKER_SHARED_S3_PREFIX')
    if args.num_workers > 1:
        step_metrics_prefix = os.path.join(step_metrics_prefix,
                                           str(args.rollout_idx))
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value:
        rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value:
        rospy.get_param('METRICS_S3_OBJECT_KEY'),
        MetricsS3Keys.REGION.value: rospy.get_param('AWS_REGION')
    }
    metrics_s3_model_cfg = {
        MetricsS3Keys.METRICS_BUCKET.value:
        args.s3_bucket,
        MetricsS3Keys.METRICS_KEY.value:
        os.path.join(args.s3_prefix, DEEPRACER_CHKPNT_KEY_SUFFIX),
        MetricsS3Keys.REGION.value:
        args.aws_region
    }
    run_phase_subject = RunPhaseSubject()

    agent_list = list()
    agent_list.append(
        create_rollout_agent(
            agent_config,
            TrainingMetrics(agent_name='agent',
                            s3_dict_metrics=metrics_s3_config,
                            s3_dict_model=metrics_s3_model_cfg,
                            ckpnt_dir=args.checkpoint_dir,
                            run_phase_sink=run_phase_subject,
                            use_model_picker=(args.rollout_idx == 0)),
            run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())
    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)

    aws_region = rospy.get_param('AWS_REGION', args.aws_region)
    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET',
                                    None) if args.rollout_idx == 0 else None
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        if args.num_workers > 1:
            simtrace_s3_object_prefix = os.path.join(simtrace_s3_object_prefix,
                                                     str(args.rollout_idx))
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')

    s3_writer_job_info = []
    if simtrace_s3_bucket:
        s3_writer_job_info.append(
            IterationData(
                'simtrace', simtrace_s3_bucket, simtrace_s3_object_prefix,
                aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.SIM_TRACE_TRAINING_LOCAL_FILE.
                    value)))
    if mp4_s3_bucket:
        s3_writer_job_info.extend([
            IterationData(
                'pip', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.
                    CAMERA_PIP_MP4_VALIDATION_LOCAL_PATH.value)),
            IterationData(
                '45degree', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.
                    CAMERA_45DEGREE_MP4_VALIDATION_LOCAL_PATH.value)),
            IterationData(
                'topview', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.
                    CAMERA_TOPVIEW_MP4_VALIDATION_LOCAL_PATH.value))
        ])

    s3_writer = S3Writer(job_info=s3_writer_job_info)

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s", redis_ip)

    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as filepointer:
            sm_hyperparams_dict = json.load(filepointer)
    else:
        logger.info("SageMaker hyperparameters not found.")

    enable_domain_randomization = utils.str2bool(
        rospy.get_param('ENABLE_DOMAIN_RANDOMIZATION', False))
    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=run_phase_subject,
            enable_domain_randomization=enable_domain_randomization)

    # If num_episodes_between_training is smaller than num_workers then cancel worker early.
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    # Reduce number of workers if allocated more than num_episodes_between_training
    if args.num_workers > episode_steps_per_rollout:
        logger.info(
            "Excess worker allocated. Reducing from {} to {}...".format(
                args.num_workers, episode_steps_per_rollout))
        args.num_workers = episode_steps_per_rollout
    if args.rollout_idx >= episode_steps_per_rollout or args.rollout_idx >= args.num_workers:
        err_msg_format = "Exiting excess worker..."
        err_msg_format += "(rollout_idx[{}] >= num_workers[{}] or num_episodes_between_training[{}])"
        logger.info(
            err_msg_format.format(args.rollout_idx, args.num_workers,
                                  episode_steps_per_rollout))
        # Close the down the job
        utils.cancel_simulation_job(
            os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
            rospy.get_param('AWS_REGION'))

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix,
        num_workers=args.num_workers,
        rollout_idx=args.rollout_idx)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_names={'agent': args.s3_bucket},
        base_checkpoint_dir=args.checkpoint_dir,
        s3_folders={'agent': args.s3_prefix})

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   num_workers=args.num_workers,
                   rollout_idx=args.rollout_idx,
                   task_parameters=task_parameters,
                   s3_writer=s3_writer)
Example #3
0
def main():
    screen.set_use_colors(False)

    logger.info("src/training_worker.py - INIZIO MAIN")

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk',
                        '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument(
        '-ek',
        '--environment_s3_key',
        help="(string) Name of an environment file to download from S3",
        type=str,
        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument(
        '-c',
        '--checkpoint-dir',
        help=
        '(string) Path to a folder containing a checkpoint to write the model to.',
        type=str,
        default='./checkpoint')
    parser.add_argument(
        '--pretrained-checkpoint-dir',
        help='(string) Path to a folder for downloading a pre-trained model',
        type=str,
        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get(
                            "SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--s3_endpoint_url',
                        help='(string) S3 endpoint URL',
                        type=str,
                        default=os.environ.get("S3_ENDPOINT_URL", None))
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    args, _ = parser.parse_known_args()
    logger.info("S3 bucket: %s \n S3 prefix: %s \n S3 endpoint URL: %s",
                args.s3_bucket, args.s3_prefix, args.s3_endpoint_url)

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region,
                             s3_endpoint_url=args.s3_endpoint_url)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key,
                              model_metadata_local_path)
    s3_client.upload_file(
        os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix),
        model_metadata_local_path)
    shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        success_custom_preset = s3_client.download_file(
            s3_key=args.preset_s3_key, local_path=preset_local_path)
        if not success_custom_preset:
            logger.info(
                "Could not download the preset file. Using the default DeepRacer preset."
            )
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location,
                                                 ignore_module_case=True)
            success_custom_preset = s3_client.upload_file(
                s3_key=os.path.normpath("%s/presets/preset.py" %
                                        args.s3_prefix),
                local_path=preset_local_path)
            if success_custom_preset:
                logger.info("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}

        #configurazione agente: metadati del modello impostati dall'utente (angolo di sterzo + velocità) + nome

        #! TODO each agent should have own config
        agent_config = {
            'model_metadata': model_metadata_local_path,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [],
                ConfigParams.VELOCITY_LIST.value: {},
                ConfigParams.STEERING_LIST.value: {},
                ConfigParams.CHANGE_START.value: None,
                ConfigParams.ALT_DIR.value: None,
                ConfigParams.ACTION_SPACE_PATH.value:
                'custom_files/model_metadata.json',
                ConfigParams.REWARD.value: None,
                ConfigParams.AGENT_NAME.value: 'racecar'
            }
        }

        agent_list = list()
        agent_list.append(create_training_agent(agent_config))

        logger.info(
            "src/training_worker.py - ora chiamo la get_graph_manager, che recupera l'agente"
        )

        graph_manager, robomaker_hyperparams_json = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=None)

        logger.info("src/training_worker.py - ho l'agente")

        s3_client.upload_hyperparameters(robomaker_hyperparams_json)
        logger.info("Uploaded hyperparameters.json to S3")

        # Attach sample collector to graph_manager only if sample count > 0
        max_sample_count = int(sm_hyperparams_dict.get("max_sample_count", 0))
        if max_sample_count > 0:
            sample_collector = SampleCollector(
                s3_client=s3_client,
                s3_prefix=args.s3_prefix,
                max_sample_count=max_sample_count,
                sampling_frequency=int(
                    sm_hyperparams_dict.get("sampling_frequency", 1)))
            graph_manager.sample_collector = sample_collector

    host_ip_address = utils.get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    logger.info("Uploaded IP address information to S3: %s" % host_ip_address)
    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    # Handle backward compatibility
    _, network_type, version = parse_model_metadata(model_metadata_local_path)
    if use_pretrained_model:
        if float(version) < float(SIMAPP_VERSION) and \
        not utils.has_current_ckpnt_name(args.pretrained_s3_bucket, args.pretrained_s3_prefix, args.aws_region, args.s3_endpoint_url):
            utils.make_compatible(args.pretrained_s3_bucket,
                                  args.pretrained_s3_prefix, args.aws_region,
                                  SyncFiles.TRAINER_READY.value)
        #Select the optimal model for the starting weights
        utils.do_model_selection(s3_bucket=args.s3_bucket,
                                 s3_prefix=args.s3_prefix,
                                 region=args.aws_region,
                                 s3_endpoint_url=args.s3_endpoint_url)

        ds_params_instance_pretrained = S3BotoDataStoreParameters(
            aws_region=args.aws_region,
            bucket_names={'agent': args.pretrained_s3_bucket},
            base_checkpoint_dir=args.pretrained_checkpoint_dir,
            s3_folders={'agent': args.pretrained_s3_prefix},
            s3_endpoint_url=args.s3_endpoint_url)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained,
                                                graph_manager, True)
        data_store_pretrained.load_from_store()

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type=str(RunType.TRAINER),
        channel=args.s3_prefix,
        network_type=network_type)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_names={'agent': args.s3_bucket},
        base_checkpoint_dir=args.checkpoint_dir,
        s3_folders={'agent': args.s3_prefix},
        s3_endpoint_url=args.s3_endpoint_url)

    graph_manager.data_store_params = ds_params_instance

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    #funzione riga 48
    #prende in input:
    #       - il grafo (creato con la get_graph_manager)
    #       - robomaker_hyperparams_json (ritornato dalla get_graph_manager)

    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=json.loads(robomaker_hyperparams_json)["batch_size"],
        user_episode_per_rollout=json.loads(
            robomaker_hyperparams_json)["num_episodes_between_training"])
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk', '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('-ek', '--environment_s3_key',
                        help="(string) Name of an environment file to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument('-c', '--checkpoint-dir',
                        help='(string) Path to a folder containing a checkpoint to write the model to.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--pretrained-checkpoint-dir',
                        help='(string) Path to a folder for downloading a pre-trained model',
                        type=str,
                        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    start_redis_server()

    args, _ = parser.parse_known_args()

    s3_client = SageS3Client(bucket=args.s3_bucket, s3_prefix=args.s3_prefix, aws_region=args.aws_region)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH, 'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key, model_metadata_local_path)
    s3_client.upload_file(os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix), model_metadata_local_path)
    shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        success_custom_preset = s3_client.download_file(s3_key=args.preset_s3_key, local_path=preset_local_path)
        if not success_custom_preset:
            logger.info("Could not download the preset file. Using the default DeepRacer preset.")
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location, ignore_module_case=True)
            success_custom_preset = s3_client.upload_file(
                s3_key=os.path.normpath("%s/presets/preset.py" % args.s3_prefix), local_path=preset_local_path)
            if success_custom_preset:
                logger.info("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}

        #! TODO each agent should have own config
        agent_config = {'model_metadata': model_metadata_local_path,
                        'car_ctrl_cnfig': {ConfigParams.LINK_NAME_LIST.value: [],
                                           ConfigParams.VELOCITY_LIST.value : {},
                                           ConfigParams.STEERING_LIST.value : {},
                                           ConfigParams.CHANGE_START.value : None,
                                           ConfigParams.ALT_DIR.value : None,
                                           ConfigParams.ACTION_SPACE_PATH.value : 'custom_files/model_metadata.json',
                                           ConfigParams.REWARD.value : None,
                                           ConfigParams.AGENT_NAME.value : 'racecar'}}

        agent_list = list()
        agent_list.append(create_training_agent(agent_config))
        #agent_list.append(create_training_agent(agent_config))

        graph_manager, robomaker_hyperparams_json = get_graph_manager(sm_hyperparams_dict, agent_list)

        s3_client.upload_hyperparameters(robomaker_hyperparams_json)
        logger.info("Uploaded hyperparameters.json to S3")

    host_ip_address = utils.get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    logger.info("Uploaded IP address information to S3: %s" % host_ip_address)
    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    if use_pretrained_model:
        # Handle backward compatibility
        _, _, version = parse_model_metadata(model_metadata_local_path)
        if float(version) < float(utils.SIMAPP_VERSION) and \
        not utils.has_current_ckpnt_name(args.pretrained_s3_bucket, args.pretrained_s3_prefix, args.aws_region):
            utils.make_compatible(args.pretrained_s3_bucket, args.pretrained_s3_prefix,
                                  args.aws_region, SyncFiles.TRAINER_READY.value)

        ds_params_instance_pretrained = S3BotoDataStoreParameters(aws_region=args.aws_region,
                                                                  bucket_name=args.pretrained_s3_bucket,
                                                                  checkpoint_dir=args.pretrained_checkpoint_dir,
                                                                  s3_folder=args.pretrained_s3_prefix)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained)
        data_store_pretrained.load_from_store()

    memory_backend_params = RedisPubSubMemoryBackendParameters(redis_address="localhost",
                                                               redis_port=6379,
                                                               run_type=str(RunType.TRAINER),
                                                               channel=args.s3_prefix)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(aws_region=args.aws_region,
                                                   bucket_name=args.s3_bucket,
                                                   checkpoint_dir=args.checkpoint_dir,
                                                   s3_folder=args.s3_prefix)
    graph_manager.data_store_params = ds_params_instance

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=json.loads(robomaker_hyperparams_json)["batch_size"],
        user_episode_per_rollout=json.loads(robomaker_hyperparams_json)["num_episodes_between_training"]
    )
Example #5
0
def validate(s3_bucket, s3_prefix, custom_files_path, aws_region):
    screen.set_use_colors(False)
    logger.info("S3 bucket: %s \n S3 prefix: %s", s3_bucket, s3_prefix)

    if not os.path.exists(custom_files_path):
        os.makedirs(custom_files_path)
    else:
        GenericValidatorException(
            "Custom Files Path already exists!").log_except_and_exit()

    s3_client = SageS3Client(bucket=s3_bucket,
                             s3_prefix=s3_prefix,
                             aws_region=aws_region)
    # Load the model metadata
    model_metadata_local_path = os.path.join(custom_files_path,
                                             'model_metadata.json')
    utils.load_model_metadata(
        s3_client,
        os.path.normpath("%s/model/model_metadata.json" % s3_prefix),
        model_metadata_local_path)

    # Create model local path
    local_model_dir = os.path.join(custom_files_path, 'checkpoint')
    os.makedirs(local_model_dir)

    try:
        # Handle backward compatibility
        observation_list, _, version = parse_model_metadata(
            model_metadata_local_path)
    except Exception as ex:
        log_and_exit("Failed to parse model_metadata file: {}".format(ex),
                     SIMAPP_VALIDATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)

    transitions = get_transition_data(observation_list)

    if float(version) < float(SIMAPP_VERSION) and \
            not utils.has_current_ckpnt_name(s3_bucket, s3_prefix, aws_region):
        utils.make_compatible(s3_bucket, s3_prefix, aws_region,
                              SyncFiles.TRAINER_READY.value)

    agent_config = {
        'model_metadata': model_metadata_local_path,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value: [],
            ConfigParams.VELOCITY_LIST.value: {},
            ConfigParams.STEERING_LIST.value: {},
            ConfigParams.CHANGE_START.value: None,
            ConfigParams.ALT_DIR.value: None,
            ConfigParams.ACTION_SPACE_PATH.value: model_metadata_local_path,
            ConfigParams.REWARD.value: None,
            ConfigParams.AGENT_NAME.value: 'racecar'
        }
    }

    agent_list = list()
    agent_list.append(create_training_agent(agent_config))

    sm_hyperparams_dict = {}
    graph_manager, _ = get_graph_manager(hp_dict=sm_hyperparams_dict,
                                         agent_list=agent_list,
                                         run_phase_subject=None)

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=aws_region,
        bucket_names={'agent': s3_bucket},
        s3_folders={'agent': s3_prefix},
        base_checkpoint_dir=local_model_dir)

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager,
                                               ignore_lock=True)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = local_model_dir
    _validate(graph_manager=graph_manager,
              task_parameters=task_parameters,
              transitions=transitions,
              s3_bucket=s3_bucket,
              s3_prefix=s3_prefix,
              aws_region=aws_region)
Example #6
0
def main():
    screen.set_use_colors(False)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX",
                                                "sagemaker"))
    parser.add_argument(
        '--num_workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=int(rospy.get_param("NUM_WORKERS", 1)))
    parser.add_argument('--rollout_idx',
                        help="(int) The index of current rollout worker",
                        type=int,
                        default=0)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY",
                                                None))
    # For training job, reset is not allowed. penalty_seconds, off_track_penalty, and
    # collision_penalty will all be 0 be default
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 0.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "TRAINING"))
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY",
                                                      0.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY",
                                                      0.0)))

    args = parser.parse_args()

    logger.info("S3 bucket: %s", args.s3_bucket)
    logger.info("S3 prefix: %s", args.s3_prefix)

    # Download and import reward function
    # TODO: replace 'agent' with name of each agent for multi-agent training
    reward_function_file = RewardFunction(
        bucket=args.s3_bucket,
        s3_key=args.reward_file_s3_key,
        region_name=args.aws_region,
        local_path=REWARD_FUCTION_LOCAL_PATH_FORMAT.format('agent'))
    reward_function = reward_function_file.get_reward_function()

    # Instantiate Cameras
    configure_camera(namespaces=['racecar'])

    preset_file_success, _ = download_custom_files_if_present(
        s3_bucket=args.s3_bucket,
        s3_prefix=args.s3_prefix,
        aws_region=args.aws_region)

    # download model metadata
    # TODO: replace 'agent' with name of each agent
    model_metadata = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=args.model_metadata_s3_key,
        region_name=args.aws_region,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_info = model_metadata.get_model_metadata_info()
    version = model_metadata_info[ModelMetadataKeys.VERSION.value]

    agent_config = {
        'model_metadata': model_metadata,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value:
            LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value:
            VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value:
            STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
            utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
            utils.str2bool(
                rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.MODEL_METADATA.value:
            model_metadata,
            ConfigParams.REWARD.value:
            reward_function,
            ConfigParams.AGENT_NAME.value:
            'racecar',
            ConfigParams.VERSION.value:
            version,
            ConfigParams.NUMBER_OF_RESETS.value:
            args.number_of_resets,
            ConfigParams.PENALTY_SECONDS.value:
            args.penalty_seconds,
            ConfigParams.NUMBER_OF_TRIALS.value:
            None,
            ConfigParams.IS_CONTINUOUS.value:
            args.is_continuous,
            ConfigParams.RACE_TYPE.value:
            args.race_type,
            ConfigParams.COLLISION_PENALTY.value:
            args.collision_penalty,
            ConfigParams.OFF_TRACK_PENALTY.value:
            args.off_track_penalty
        }
    }

    #! TODO each agent should have own s3 bucket
    metrics_key = rospy.get_param('METRICS_S3_OBJECT_KEY')
    if args.num_workers > 1 and args.rollout_idx > 0:
        key_tuple = os.path.splitext(metrics_key)
        metrics_key = "{}_{}{}".format(key_tuple[0], str(args.rollout_idx),
                                       key_tuple[1])
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value:
        rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value: metrics_key,
        MetricsS3Keys.REGION.value: rospy.get_param('AWS_REGION')
    }

    run_phase_subject = RunPhaseSubject()

    agent_list = list()

    #TODO: replace agent for multi agent training
    # checkpoint s3 instance
    # TODO replace agent with agent_0 and so on for multiagent case
    checkpoint = Checkpoint(bucket=args.s3_bucket,
                            s3_prefix=args.s3_prefix,
                            region_name=args.aws_region,
                            agent_name='agent',
                            checkpoint_dir=args.checkpoint_dir)

    agent_list.append(
        create_rollout_agent(
            agent_config,
            TrainingMetrics(
                agent_name='agent',
                s3_dict_metrics=metrics_s3_config,
                deepracer_checkpoint_json=checkpoint.deepracer_checkpoint_json,
                ckpnt_dir=os.path.join(args.checkpoint_dir, 'agent'),
                run_phase_sink=run_phase_subject,
                use_model_picker=(args.rollout_idx == 0)), run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())
    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)

    aws_region = rospy.get_param('AWS_REGION', args.aws_region)
    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET',
                                    None) if args.rollout_idx == 0 else None
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        if args.num_workers > 1:
            simtrace_s3_object_prefix = os.path.join(simtrace_s3_object_prefix,
                                                     str(args.rollout_idx))
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')

    simtrace_video_s3_writers = []
    #TODO: replace 'agent' with 'agent_0' for multi agent training and
    # mp4_s3_object_prefix, mp4_s3_bucket will be a list, so need to access with index
    if simtrace_s3_bucket:
        simtrace_video_s3_writers.append(
            SimtraceVideo(
                upload_type=SimtraceVideoNames.SIMTRACE_TRAINING.value,
                bucket=simtrace_s3_bucket,
                s3_prefix=simtrace_s3_object_prefix,
                region_name=aws_region,
                local_path=SIMTRACE_TRAINING_LOCAL_PATH_FORMAT.format(
                    'agent')))
    if mp4_s3_bucket:
        simtrace_video_s3_writers.extend([
            SimtraceVideo(
                upload_type=SimtraceVideoNames.PIP.value,
                bucket=mp4_s3_bucket,
                s3_prefix=mp4_s3_object_prefix,
                region_name=aws_region,
                local_path=CAMERA_PIP_MP4_LOCAL_PATH_FORMAT.format('agent')),
            SimtraceVideo(
                upload_type=SimtraceVideoNames.DEGREE45.value,
                bucket=mp4_s3_bucket,
                s3_prefix=mp4_s3_object_prefix,
                region_name=aws_region,
                local_path=CAMERA_45DEGREE_LOCAL_PATH_FORMAT.format('agent')),
            SimtraceVideo(
                upload_type=SimtraceVideoNames.TOPVIEW.value,
                bucket=mp4_s3_bucket,
                s3_prefix=mp4_s3_object_prefix,
                region_name=aws_region,
                local_path=CAMERA_TOPVIEW_LOCAL_PATH_FORMAT.format('agent'))
        ])

    # TODO: replace 'agent' with specific agent name for multi agent training
    ip_config = IpConfig(bucket=args.s3_bucket,
                         s3_prefix=args.s3_prefix,
                         region_name=args.aws_region,
                         local_path=IP_ADDRESS_LOCAL_PATH.format('agent'))
    redis_ip = ip_config.get_ip_config()

    # Download hyperparameters from SageMaker shared s3 bucket
    # TODO: replace 'agent' with name of each agent
    hyperparameters = Hyperparameters(
        bucket=args.s3_bucket,
        s3_key=get_s3_key(args.s3_prefix, HYPERPARAMETER_S3_POSTFIX),
        region_name=args.aws_region,
        local_path=HYPERPARAMETER_LOCAL_PATH_FORMAT.format('agent'))
    sm_hyperparams_dict = hyperparameters.get_hyperparameters_dict()

    enable_domain_randomization = utils.str2bool(
        rospy.get_param('ENABLE_DOMAIN_RANDOMIZATION', False))
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=run_phase_subject,
            enable_domain_randomization=enable_domain_randomization,
            pause_physics=pause_physics,
            unpause_physics=unpause_physics)

    # If num_episodes_between_training is smaller than num_workers then cancel worker early.
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    # Reduce number of workers if allocated more than num_episodes_between_training
    if args.num_workers > episode_steps_per_rollout:
        logger.info(
            "Excess worker allocated. Reducing from {} to {}...".format(
                args.num_workers, episode_steps_per_rollout))
        args.num_workers = episode_steps_per_rollout
    if args.rollout_idx >= episode_steps_per_rollout or args.rollout_idx >= args.num_workers:
        err_msg_format = "Exiting excess worker..."
        err_msg_format += "(rollout_idx[{}] >= num_workers[{}] or num_episodes_between_training[{}])"
        logger.info(
            err_msg_format.format(args.rollout_idx, args.num_workers,
                                  episode_steps_per_rollout))
        # Close the down the job
        utils.cancel_simulation_job()

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix,
        num_workers=args.num_workers,
        rollout_idx=args.rollout_idx)

    graph_manager.memory_backend_params = memory_backend_params

    checkpoint_dict = {'agent': checkpoint}
    ds_params_instance = S3BotoDataStoreParameters(
        checkpoint_dict=checkpoint_dict)

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   num_workers=args.num_workers,
                   rollout_idx=args.rollout_idx,
                   task_parameters=task_parameters,
                   simtrace_video_s3_writers=simtrace_video_s3_writers,
                   pause_physics=pause_physics,
                   unpause_physics=unpause_physics)
def main():
    screen.set_use_colors(False)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX",
                                                "sagemaker"))
    parser.add_argument(
        '--num-workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=1)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY",
                                                None))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)
    logger.info("S3 bucket: %s" % args.s3_bucket)
    logger.info("S3 prefix: %s" % args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key,
                              model_metadata_local_path)

    # Download and import reward function
    if not args.reward_file_s3_key:
        utils.log_and_exit(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket,
                    args.s3_prefix), utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
            utils.SIMAPP_EVENT_ERROR_CODE_500)
    download_customer_reward_function(s3_client, args.reward_file_s3_key)

    try:
        from custom_files.customer_reward_function import reward_function
    except Exception as e:
        utils.log_and_exit(
            "Failed to import user's reward_function: {}".format(e),
            utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
            utils.SIMAPP_EVENT_ERROR_CODE_400)

    # Instantiate Cameras
    configure_camera()

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s" % redis_ip)

    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        logger.info("SageMaker hyperparameters not found.")

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    #! TODO each agent should have own config
    _, _, version = utils_parse_model_metadata.parse_model_metadata(
        model_metadata_local_path)
    agent_config = {
        'model_metadata': model_metadata_local_path,
        'car_ctrl_cnfig': {
            ConfigParams.LINK_NAME_LIST.value:
            LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value:
            VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value:
            STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
            utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
            utils.str2bool(
                rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.ACTION_SPACE_PATH.value:
            'custom_files/model_metadata.json',
            ConfigParams.REWARD.value:
            reward_function,
            ConfigParams.AGENT_NAME.value:
            'racecar',
            ConfigParams.VERSION.value:
            version
        }
    }

    #! TODO each agent should have own s3 bucket
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value:
        rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value:
        rospy.get_param('METRICS_S3_OBJECT_KEY'),
        MetricsS3Keys.REGION.value:
        rospy.get_param('AWS_REGION'),
        MetricsS3Keys.STEP_BUCKET.value:
        rospy.get_param('SAGEMAKER_SHARED_S3_BUCKET'),
        MetricsS3Keys.STEP_KEY.value:
        os.path.join(rospy.get_param('SAGEMAKER_SHARED_S3_PREFIX'),
                     TRAINING_SIMTRACE_DATA_S3_OBJECT_KEY)
    }

    agent_list = list()
    agent_list.append(
        create_rollout_agent(agent_config, TrainingMetrics(metrics_s3_config)))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(sm_hyperparams_dict, agent_list)

    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        s3_folder=args.s3_prefix)

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   data_store=data_store,
                   num_workers=args.num_workers,
                   task_parameters=task_parameters)
Example #8
0
def main():
    """ Main function for tournament worker """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--preset',
                        help="(string) Name of a preset to run \
                             (class name from the 'presets' directory.)",
                        type=str,
                        required=False)
    parser.add_argument('--s3_bucket',
                        help='list(string) S3 bucket',
                        type=str,
                        nargs='+',
                        default=rospy.get_param("MODEL_S3_BUCKET",
                                                ["gsaur-test"]))
    parser.add_argument('--s3_prefix',
                        help='list(string) S3 prefix',
                        type=str,
                        nargs='+',
                        default=rospy.get_param("MODEL_S3_PREFIX",
                                                ["sagemaker"]))
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--number_of_trials',
                        help='(integer) Number of trials',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_TRIALS", 10)))
    parser.add_argument(
        '-c',
        '--local_model_directory',
        help='(string) Path to a folder containing a checkpoint \
                             to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 2.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "EVALUATION"))
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY",
                                                      2.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY",
                                                      5.0)))

    args = parser.parse_args()
    arg_s3_bucket = args.s3_bucket
    arg_s3_prefix = args.s3_prefix
    logger.info("S3 bucket: %s \n S3 prefix: %s", arg_s3_bucket, arg_s3_prefix)

    # tournament_worker: names to be displayed in MP4.
    # This is racer alias in tournament worker case.
    display_names = rospy.get_param('DISPLAY_NAME', "")

    metrics_s3_buckets = rospy.get_param('METRICS_S3_BUCKET')
    metrics_s3_object_keys = rospy.get_param('METRICS_S3_OBJECT_KEY')

    arg_s3_bucket, arg_s3_prefix = utils.force_list(
        arg_s3_bucket), utils.force_list(arg_s3_prefix)
    metrics_s3_buckets = utils.force_list(metrics_s3_buckets)
    metrics_s3_object_keys = utils.force_list(metrics_s3_object_keys)

    validate_list = [
        arg_s3_bucket, arg_s3_prefix, metrics_s3_buckets,
        metrics_s3_object_keys
    ]

    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET', None)
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        simtrace_s3_bucket = utils.force_list(simtrace_s3_bucket)
        simtrace_s3_object_prefix = utils.force_list(simtrace_s3_object_prefix)
        validate_list.extend([simtrace_s3_bucket, simtrace_s3_object_prefix])
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')
        mp4_s3_bucket = utils.force_list(mp4_s3_bucket)
        mp4_s3_object_prefix = utils.force_list(mp4_s3_object_prefix)
        validate_list.extend([mp4_s3_bucket, mp4_s3_object_prefix])

    if not all([lambda x: len(x) == len(validate_list[0]), validate_list]):
        utils.log_and_exit(
            "Eval worker error: Incorrect arguments passed: {}".format(
                validate_list), utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
            utils.SIMAPP_EVENT_ERROR_CODE_500)
    if args.number_of_resets != 0 and args.number_of_resets < MIN_RESET_COUNT:
        raise GenericRolloutException(
            "number of resets is less than {}".format(MIN_RESET_COUNT))

    # Instantiate Cameras
    if len(arg_s3_bucket) == 1:
        configure_camera(namespaces=['racecar'])
    else:
        configure_camera(namespaces=[
            'racecar_{}'.format(str(agent_index))
            for agent_index in range(len(arg_s3_bucket))
        ])

    agent_list = list()
    s3_bucket_dict = dict()
    s3_prefix_dict = dict()
    s3_writers = list()

    # tournament_worker: list of required S3 locations
    simtrace_s3_bucket_dict = dict()
    simtrace_s3_prefix_dict = dict()
    metrics_s3_bucket_dict = dict()
    metrics_s3_obect_key_dict = dict()
    mp4_s3_bucket_dict = dict()
    mp4_s3_object_prefix_dict = dict()

    for agent_index, s3_bucket_val in enumerate(arg_s3_bucket):
        agent_name = 'agent' if len(arg_s3_bucket) == 1 else 'agent_{}'.format(
            str(agent_index))
        racecar_name = 'racecar' if len(
            arg_s3_bucket) == 1 else 'racecar_{}'.format(str(agent_index))
        s3_bucket_dict[agent_name] = arg_s3_bucket[agent_index]
        s3_prefix_dict[agent_name] = arg_s3_prefix[agent_index]

        # tournament_worker: remap key with agent_name instead of agent_index for list of S3 locations.
        simtrace_s3_bucket_dict[agent_name] = simtrace_s3_bucket[agent_index]
        simtrace_s3_prefix_dict[agent_name] = simtrace_s3_object_prefix[
            agent_index]
        metrics_s3_bucket_dict[agent_name] = metrics_s3_buckets[agent_index]
        metrics_s3_obect_key_dict[agent_name] = metrics_s3_object_keys[
            agent_index]
        mp4_s3_bucket_dict[agent_name] = mp4_s3_bucket[agent_index]
        mp4_s3_object_prefix_dict[agent_name] = mp4_s3_object_prefix[
            agent_index]

        s3_client = SageS3Client(bucket=arg_s3_bucket[agent_index],
                                 s3_prefix=arg_s3_prefix[agent_index],
                                 aws_region=args.aws_region)

        # Load the model metadata
        if not os.path.exists(os.path.join(CUSTOM_FILES_PATH, agent_name)):
            os.makedirs(os.path.join(CUSTOM_FILES_PATH, agent_name))
        model_metadata_local_path = os.path.join(
            os.path.join(CUSTOM_FILES_PATH, agent_name), 'model_metadata.json')
        utils.load_model_metadata(
            s3_client,
            os.path.normpath("%s/model/model_metadata.json" %
                             arg_s3_prefix[agent_index]),
            model_metadata_local_path)
        # Handle backward compatibility
        _, _, version = parse_model_metadata(model_metadata_local_path)
        if float(version) < float(utils.SIMAPP_VERSION) and \
        not utils.has_current_ckpnt_name(arg_s3_bucket[agent_index], arg_s3_prefix[agent_index], args.aws_region):
            utils.make_compatible(arg_s3_bucket[agent_index],
                                  arg_s3_prefix[agent_index], args.aws_region,
                                  SyncFiles.TRAINER_READY.value)

        # Select the optimal model
        utils.do_model_selection(s3_bucket=arg_s3_bucket[agent_index],
                                 s3_prefix=arg_s3_prefix[agent_index],
                                 region=args.aws_region)

        # Download hyperparameters from SageMaker
        if not os.path.exists(agent_name):
            os.makedirs(agent_name)
        hyperparameters_file_success = False
        hyperparams_s3_key = os.path.normpath(arg_s3_prefix[agent_index] +
                                              "/ip/hyperparameters.json")
        hyperparameters_file_success = s3_client.download_file(
            s3_key=hyperparams_s3_key,
            local_path=os.path.join(agent_name, "hyperparameters.json"))
        sm_hyperparams_dict = {}
        if hyperparameters_file_success:
            logger.info("Received Sagemaker hyperparameters successfully!")
            with open(os.path.join(agent_name,
                                   "hyperparameters.json")) as file:
                sm_hyperparams_dict = json.load(file)
        else:
            logger.info("SageMaker hyperparameters not found.")

        agent_config = {
            'model_metadata': model_metadata_local_path,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [
                    link_name.replace('racecar', racecar_name)
                    for link_name in LINK_NAMES
                ],
                ConfigParams.VELOCITY_LIST.value: [
                    velocity_topic.replace('racecar', racecar_name)
                    for velocity_topic in VELOCITY_TOPICS
                ],
                ConfigParams.STEERING_LIST.value: [
                    steering_topic.replace('racecar', racecar_name)
                    for steering_topic in STEERING_TOPICS
                ],
                ConfigParams.CHANGE_START.value:
                utils.str2bool(rospy.get_param('CHANGE_START_POSITION',
                                               False)),
                ConfigParams.ALT_DIR.value:
                utils.str2bool(
                    rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
                ConfigParams.ACTION_SPACE_PATH.value:
                'custom_files/' + agent_name + '/model_metadata.json',
                ConfigParams.REWARD.value:
                reward_function,
                ConfigParams.AGENT_NAME.value:
                racecar_name,
                ConfigParams.VERSION.value:
                version,
                ConfigParams.NUMBER_OF_RESETS.value:
                args.number_of_resets,
                ConfigParams.PENALTY_SECONDS.value:
                args.penalty_seconds,
                ConfigParams.NUMBER_OF_TRIALS.value:
                args.number_of_trials,
                ConfigParams.IS_CONTINUOUS.value:
                args.is_continuous,
                ConfigParams.RACE_TYPE.value:
                args.race_type,
                ConfigParams.COLLISION_PENALTY.value:
                args.collision_penalty,
                ConfigParams.OFF_TRACK_PENALTY.value:
                args.off_track_penalty
            }
        }

        metrics_s3_config = {
            MetricsS3Keys.METRICS_BUCKET.value:
            metrics_s3_buckets[agent_index],
            MetricsS3Keys.METRICS_KEY.value:
            metrics_s3_object_keys[agent_index],
            # Replaced rospy.get_param('AWS_REGION') to be equal to the argument being passed
            # or default argument set
            MetricsS3Keys.REGION.value:
            args.aws_region,
            # Replaced rospy.get_param('MODEL_S3_BUCKET') to be equal to the argument being passed
            # or default argument set
            MetricsS3Keys.STEP_BUCKET.value:
            arg_s3_bucket[agent_index],
            # Replaced rospy.get_param('MODEL_S3_PREFIX') to be equal to the argument being passed
            # or default argument set
            MetricsS3Keys.STEP_KEY.value:
            os.path.join(arg_s3_prefix[agent_index],
                         EVALUATION_SIMTRACE_DATA_S3_OBJECT_KEY)
        }
        aws_region = rospy.get_param('AWS_REGION', args.aws_region)
        s3_writer_job_info = []
        if simtrace_s3_bucket:
            s3_writer_job_info.append(
                IterationData(
                    'simtrace', simtrace_s3_bucket[agent_index],
                    simtrace_s3_object_prefix[agent_index], aws_region,
                    os.path.join(
                        ITERATION_DATA_LOCAL_FILE_PATH, agent_name,
                        IterationDataLocalFileNames.
                        SIM_TRACE_EVALUATION_LOCAL_FILE.value)))
        if mp4_s3_bucket:
            s3_writer_job_info.extend([
                IterationData(
                    'pip', mp4_s3_bucket[agent_index],
                    mp4_s3_object_prefix[agent_index], aws_region,
                    os.path.join(
                        ITERATION_DATA_LOCAL_FILE_PATH, agent_name,
                        IterationDataLocalFileNames.
                        CAMERA_PIP_MP4_VALIDATION_LOCAL_PATH.value)),
                IterationData(
                    '45degree', mp4_s3_bucket[agent_index],
                    mp4_s3_object_prefix[agent_index], aws_region,
                    os.path.join(
                        ITERATION_DATA_LOCAL_FILE_PATH, agent_name,
                        IterationDataLocalFileNames.
                        CAMERA_45DEGREE_MP4_VALIDATION_LOCAL_PATH.value)),
                IterationData(
                    'topview', mp4_s3_bucket[agent_index],
                    mp4_s3_object_prefix[agent_index], aws_region,
                    os.path.join(
                        ITERATION_DATA_LOCAL_FILE_PATH, agent_name,
                        IterationDataLocalFileNames.
                        CAMERA_TOPVIEW_MP4_VALIDATION_LOCAL_PATH.value))
            ])

        s3_writers.append(S3Writer(job_info=s3_writer_job_info))
        run_phase_subject = RunPhaseSubject()
        agent_list.append(
            create_rollout_agent(agent_config,
                                 EvalMetrics(agent_name, metrics_s3_config),
                                 run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())
    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)

    graph_manager, _ = get_graph_manager(hp_dict=sm_hyperparams_dict,
                                         agent_list=agent_list,
                                         run_phase_subject=run_phase_subject)

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_names=s3_bucket_dict,
        base_checkpoint_dir=args.local_model_directory,
        s3_folders=s3_prefix_dict)

    graph_manager.data_store = S3BotoDataStore(params=ds_params_instance,
                                               graph_manager=graph_manager,
                                               ignore_lock=True)
    graph_manager.env_params.seed = 0

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.local_model_directory

    tournament_worker(graph_manager=graph_manager,
                      number_of_trials=args.number_of_trials,
                      task_parameters=task_parameters,
                      s3_writers=s3_writers,
                      is_continuous=args.is_continuous)

    # tournament_worker: write race report to local file.
    write_race_report(graph_manager,
                      model_s3_bucket_map=s3_bucket_dict,
                      model_s3_prefix_map=s3_prefix_dict,
                      metrics_s3_bucket_map=metrics_s3_bucket_dict,
                      metrics_s3_key_map=metrics_s3_obect_key_dict,
                      simtrace_s3_bucket_map=simtrace_s3_bucket_dict,
                      simtrace_s3_prefix_map=simtrace_s3_prefix_dict,
                      mp4_s3_bucket_map=mp4_s3_bucket_dict,
                      mp4_s3_prefix_map=mp4_s3_object_prefix_dict,
                      display_names=display_names)

    # tournament_worker: terminate tournament_race_node.
    terminate_tournament_race()
def main():
    screen.set_use_colors(False)

    try:
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '-pk',
            '--preset_s3_key',
            help="(string) Name of a preset to download from S3",
            type=str,
            required=False)
        parser.add_argument(
            '-ek',
            '--environment_s3_key',
            help="(string) Name of an environment file to download from S3",
            type=str,
            required=False)
        parser.add_argument('--model_metadata_s3_key',
                            help="(string) Model Metadata File S3 Key",
                            type=str,
                            required=False)
        parser.add_argument(
            '-c',
            '--checkpoint-dir',
            help=
            '(string) Path to a folder containing a checkpoint to write the model to.',
            type=str,
            default='./checkpoint')
        parser.add_argument(
            '--pretrained-checkpoint-dir',
            help=
            '(string) Path to a folder for downloading a pre-trained model',
            type=str,
            default=PRETRAINED_MODEL_DIR)
        parser.add_argument('--s3_bucket',
                            help='(string) S3 bucket',
                            type=str,
                            default=os.environ.get(
                                "SAGEMAKER_SHARED_S3_BUCKET_PATH",
                                "gsaur-test"))
        parser.add_argument('--s3_prefix',
                            help='(string) S3 prefix',
                            type=str,
                            default='sagemaker')
        parser.add_argument('--framework',
                            help='(string) tensorflow or mxnet',
                            type=str,
                            default='tensorflow')
        parser.add_argument('--pretrained_s3_bucket',
                            help='(string) S3 bucket for pre-trained model',
                            type=str)
        parser.add_argument('--pretrained_s3_prefix',
                            help='(string) S3 prefix for pre-trained model',
                            type=str,
                            default='sagemaker')
        parser.add_argument('--aws_region',
                            help='(string) AWS region',
                            type=str,
                            default=os.environ.get("AWS_REGION", "us-east-1"))

        args, unknown = parser.parse_known_args()
        start_redis_server()

        s3_client = SageS3Client(bucket=args.s3_bucket,
                                 s3_prefix=args.s3_prefix,
                                 aws_region=args.aws_region)

        # Load the model metadata
        model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                                 'model_metadata.json')
        load_model_metadata(s3_client, args.model_metadata_s3_key,
                            model_metadata_local_path)
        s3_client.upload_file(
            os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix),
            model_metadata_local_path)
        shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

        # Register the gym enviroment, this will give clients the ability to creat the enviroment object
        register(id=defaults.ENV_ID,
                 entry_point=defaults.ENTRY_POINT,
                 max_episode_steps=defaults.MAX_STEPS,
                 reward_threshold=defaults.THRESHOLD)

        user_batch_size, user_episode_per_rollout = None, None
        success_custom_preset = False
        if args.preset_s3_key:
            preset_local_path = "./markov/presets/preset.py"
            success_custom_preset = s3_client.download_file(
                s3_key=args.preset_s3_key, local_path=preset_local_path)
            if not success_custom_preset:
                logger.info(
                    "Could not download the preset file. Using the default DeepRacer preset."
                )
            else:
                preset_location = "markov.presets.preset:graph_manager"
                graph_manager = short_dynamic_import(preset_location,
                                                     ignore_module_case=True)
                success_custom_preset = s3_client.upload_file(
                    s3_key=os.path.normpath("%s/presets/preset.py" %
                                            args.s3_prefix),
                    local_path=preset_local_path)
                if success_custom_preset:
                    agent_param_loc = "markov.presets.preset:agent_params"
                    agent_params = short_dynamic_import(
                        agent_param_loc, ignore_module_case=True)
                    user_batch_size = agent_params.network_wrappers[
                        'main'].batch_size
                    user_episode_per_rollout = agent_params.algorithm.num_consecutive_playing_steps.num_steps
                    logger.info("Using preset: %s" % args.preset_s3_key)

        if not success_custom_preset:
            from markov.sagemaker_graph_manager import get_graph_manager
            user_batch_size = json.loads(
                robomaker_hyperparams_json)["batch_size"],
            user_episode_per_rollout = json.loads(
                robomaker_hyperparams_json)["num_episodes_between_training"]
            params_blob = os.environ.get('SM_TRAINING_ENV', '')
            if params_blob:
                params = json.loads(params_blob)
                sm_hyperparams_dict = params["hyperparameters"]
            else:
                sm_hyperparams_dict = {}
            graph_manager, robomaker_hyperparams_json = get_graph_manager(
                **sm_hyperparams_dict)
            s3_client.upload_hyperparameters(robomaker_hyperparams_json)
            logger.info("Uploaded hyperparameters.json to S3")

        host_ip_address = get_ip_from_host()
        s3_client.write_ip_config(host_ip_address)
        logger.info("Uploaded IP address information to S3: %s" %
                    host_ip_address)
        use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
        if use_pretrained_model:
            s3_client_pretrained = SageS3Client(
                bucket=args.pretrained_s3_bucket,
                s3_prefix=args.pretrained_s3_prefix,
                aws_region=args.aws_region)
            s3_client_pretrained.download_model(args.pretrained_checkpoint_dir)

        memory_backend_params = RedisPubSubMemoryBackendParameters(
            redis_address="localhost",
            redis_port=6379,
            run_type='trainer',
            channel=args.s3_prefix)

        ds_params_instance = S3BotoDataStoreParameters(
            bucket_name=args.s3_bucket,
            checkpoint_dir=args.checkpoint_dir,
            aws_region=args.aws_region,
            s3_folder=args.s3_prefix)
        graph_manager.data_store_params = ds_params_instance

        data_store = S3BotoDataStore(ds_params_instance)
        data_store.graph_manager = graph_manager
        graph_manager.data_store = data_store

        training_worker(graph_manager=graph_manager,
                        checkpoint_dir=args.checkpoint_dir,
                        use_pretrained_model=use_pretrained_model,
                        framework=args.framework,
                        memory_backend_params=memory_backend_params,
                        user_batch_size=user_batch_size,
                        user_episode_per_rollout=user_episode_per_rollout)
    except Exception as ex:
        utils.json_format_logger(
            "Training worker exited with exception: {}".format(ex),
            **utils.build_system_error_dict(
                utils.SIMAPP_TRAINING_WORKER_EXCEPTION,
                utils.SIMAPP_EVENT_ERROR_CODE_500))
        utils.simapp_exit_gracefully()
Example #10
0
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET",
                                               "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_PREFIX",
                                               "sagemaker"))
    parser.add_argument(
        '--num-workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=1)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("APP_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=os.environ.get("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=os.environ.get("MODEL_METADATA_FILE_S3_KEY",
                                               None))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)
    print("S3 bucket: %s" % args.s3_bucket)
    print("S3 prefix: %s" % args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    load_model_metadata(s3_client, args.model_metadata_s3_key,
                        model_metadata_local_path)

    redis_ip = s3_client.get_ip()
    print("Received IP from SageMaker successfully: %s" % redis_ip)

    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        print("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        print("SageMaker hyperparameters not found.")

    preset_file_success = False
    environment_file_success = False
    preset_file_success, environment_file_success = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    if not environment_file_success:
        # Download reward function if environment file is not downloaded
        if not args.reward_file_s3_key:
            raise ValueError("Customer reward S3 key not supplied!")
        download_customer_reward_function(s3_client, args.reward_file_s3_key)
        import markov.environments
        print("Using default environment!")
    else:
        register_custom_environments()
        print("Using custom environment!")

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        print("Using custom preset file!")
    else:
        from markov.sagemaker_graph_manager import get_graph_manager
        graph_manager, _ = get_graph_manager(**sm_hyperparams_dict)

    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type='worker',
        channel=args.s3_prefix)

    graph_manager.agent_params.memory.register_var('memory_backend_params',
                                                   memory_backend_params)

    ds_params_instance = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        aws_region=args.aws_region,
        s3_folder=args.s3_prefix)

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    rollout_worker(
        graph_manager=graph_manager,
        checkpoint_dir=args.checkpoint_dir,
        data_store=data_store,
        num_workers=args.num_workers,
    )
Example #11
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--preset',
                        help="(string) Name of a preset to run (class name from the 'presets' directory.)",
                        type=str,
                        required=False)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("MODEL_S3_BUCKET", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=os.environ.get("MODEL_S3_PREFIX", "sagemaker"))
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("APP_REGION", "us-east-1"))
    parser.add_argument('--number_of_trials',
                        help='(integer) Number of trials',
                        type=int,
                        default=os.environ.get("NUMBER_OF_TRIALS", 10))
    parser.add_argument('-c', '--local_model_directory',
                        help='(string) Path to a folder containing a checkpoint to restore the model from.',
                        type=str,
                        default='./checkpoint')

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket, s3_prefix=args.s3_prefix, aws_region=args.aws_region)
    print("S3 bucket: %s" % args.s3_bucket)
    print("S3 prefix: %s" % args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH, 'model_metadata.json')
    load_model_metadata(s3_client, os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix), model_metadata_local_path)

    # Download the model
    s3_client.download_model(args.local_model_directory)

    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix + "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(s3_key=hyperparams_s3_key,
                                                           local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        print("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        print("SageMaker hyperparameters not found.")

    from markov.sagemaker_graph_manager import get_graph_manager
    graph_manager, _ = get_graph_manager(**sm_hyperparams_dict)

    ds_params_instance = S3BotoDataStoreParameters(bucket_name=args.s3_bucket,
                                                   checkpoint_dir=args.local_model_directory,
                                                   aws_region=args.aws_region,
                                                   s3_folder=args.s3_prefix)

    data_store = S3BotoDataStore(ds_params_instance)
    graph_manager.data_store = data_store

    evaluation_worker(
        graph_manager=graph_manager,
        number_of_trials=args.number_of_trials,
        local_model_directory=args.local_model_directory
    )
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET",
                                               "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_PREFIX",
                                               "sagemaker"))
    parser.add_argument(
        '--num-workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=1)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("APP_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=os.environ.get("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=os.environ.get("MODEL_METADATA_FILE_S3_KEY",
                                               None))
    parser.add_argument('--aws_endpoint_url',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_ENDPOINT_URL", None))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region,
                             endpoint_url=args.aws_endpoint_url)
    logger.info("S3 bucket: %s" % args.s3_bucket)
    logger.info("S3 prefix: %s" % args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    load_model_metadata(s3_client, args.model_metadata_s3_key,
                        model_metadata_local_path)

    # Download reward function
    if not args.reward_file_s3_key:
        utils.json_format_logger(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket, args.s3_prefix),
            **utils.build_system_error_dict(
                utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
                utils.SIMAPP_EVENT_ERROR_CODE_500))
        traceback.print_exc()
        utils.simapp_exit_gracefully()
    download_customer_reward_function(s3_client, args.reward_file_s3_key)

    # Register the gym enviroment, this will give clients the ability to creat the enviroment object
    register(id=defaults.ENV_ID,
             entry_point=defaults.ENTRY_POINT,
             max_episode_steps=defaults.MAX_STEPS,
             reward_threshold=defaults.THRESHOLD)

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s" % redis_ip)

    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        logger.info("SageMaker hyperparameters not found.")

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        from markov.sagemaker_graph_manager import get_graph_manager
        graph_manager, _ = get_graph_manager(**sm_hyperparams_dict)

    logger.info("Connecting to redis at %s:%d" % (redis_ip, args.redis_port))
    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type='worker',
        channel=args.s3_prefix)

    logger.info("Connecting to s3 boto data store at %s" %
                args.aws_endpoint_url)
    ds_params_instance = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        aws_region=args.aws_region,
        s3_folder=args.s3_prefix,
        aws_endpoint_url=args.aws_endpoint_url)

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    rollout_worker(graph_manager=graph_manager,
                   checkpoint_dir=args.checkpoint_dir,
                   data_store=data_store,
                   num_workers=args.num_workers,
                   memory_backend_params=memory_backend_params)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--preset',
                        help="(string) Name of a preset to run \
                             (class name from the 'presets' directory.)",
                        type=str,
                        required=False)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("MODEL_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("MODEL_S3_PREFIX",
                                                "sagemaker"))
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--number_of_trials',
                        help='(integer) Number of trials',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_TRIALS", 10)))
    parser.add_argument(
        '-c',
        '--local_model_directory',
        help='(string) Path to a folder containing a checkpoint \
                             to restore the model from.',
        type=str,
        default='./checkpoint')

    args = parser.parse_args()
    logger.info("S3 bucket: %s \n S3 prefix: %s", args.s3_bucket,
                args.s3_prefix)

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    utils.load_model_metadata(
        s3_client,
        os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix),
        model_metadata_local_path)
    # Handle backward compatibility
    _, _, version = parse_model_metadata(model_metadata_local_path)
    if float(version) < float(utils.SIMAPP_VERSION) and \
    not utils.has_current_ckpnt_name(args.s3_bucket, args.s3_prefix, args.aws_region):
        utils.make_compatible(args.s3_bucket, args.s3_prefix, args.aws_region,
                              SyncFiles.TRAINER_READY.value)
    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as file:
            sm_hyperparams_dict = json.load(file)
    else:
        logger.info("SageMaker hyperparameters not found.")

    #! TODO each agent should have own config
    _, _, version = utils_parse_model_metadata.parse_model_metadata(
        model_metadata_local_path)
    agent_config = {
        'model_metadata': model_metadata_local_path,
        'car_ctrl_cnfig': {
            ConfigParams.LINK_NAME_LIST.value:
            LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value:
            VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value:
            STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
            utils.str2bool(rospy.get_param('CHANGE_START_POSITION', False)),
            ConfigParams.ALT_DIR.value:
            utils.str2bool(
                rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.ACTION_SPACE_PATH.value:
            'custom_files/model_metadata.json',
            ConfigParams.REWARD.value:
            reward_function,
            ConfigParams.AGENT_NAME.value:
            'racecar',
            ConfigParams.VERSION.value:
            version
        }
    }

    #! TODO each agent should have own s3 bucket
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value:
        rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value:
        rospy.get_param('METRICS_S3_OBJECT_KEY'),
        MetricsS3Keys.REGION.value:
        rospy.get_param('AWS_REGION'),
        MetricsS3Keys.STEP_BUCKET.value:
        rospy.get_param('MODEL_S3_BUCKET'),
        MetricsS3Keys.STEP_KEY.value:
        os.path.join(rospy.get_param('MODEL_S3_PREFIX'),
                     EVALUATION_SIMTRACE_DATA_S3_OBJECT_KEY)
    }

    agent_list = list()
    agent_list.append(
        create_rollout_agent(agent_config, EvalMetrics(metrics_s3_config)))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())

    graph_manager, _ = get_graph_manager(sm_hyperparams_dict, agent_list)

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.local_model_directory,
        s3_folder=args.s3_prefix)

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store
    graph_manager.env_params.seed = 0

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.local_model_directory

    evaluation_worker(
        graph_manager=graph_manager,
        data_store=data_store,
        number_of_trials=args.number_of_trials,
        task_parameters=task_parameters,
    )
Example #14
0
def main():
    """ Main function for evaluation worker """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--preset',
                        help="(string) Name of a preset to run \
                             (class name from the 'presets' directory.)",
                        type=str,
                        required=False)
    parser.add_argument('--s3_bucket',
                        help='list(string) S3 bucket',
                        type=str,
                        nargs='+',
                        default=rospy.get_param("MODEL_S3_BUCKET",
                                                ["gsaur-test"]))
    parser.add_argument('--s3_prefix',
                        help='list(string) S3 prefix',
                        type=str,
                        nargs='+',
                        default=rospy.get_param("MODEL_S3_PREFIX",
                                                ["sagemaker"]))
    parser.add_argument('--s3_endpoint_url',
                        help='(string) S3 endpoint URL',
                        type=str,
                        default=rospy.get_param("S3_ENDPOINT_URL", None))
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--number_of_trials',
                        help='(integer) Number of trials',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_TRIALS", 10)))
    parser.add_argument(
        '-c',
        '--local_model_directory',
        help='(string) Path to a folder containing a checkpoint \
                             to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 2.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "EVALUATION"))
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY",
                                                      2.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY",
                                                      5.0)))
    parser.add_argument('--round_robin_advance_dist',
                        help='(float) round robin distance 0-1',
                        type=float,
                        default=float(
                            rospy.get_param("ROUND_ROBIN_ADVANCE_DIST", 0.05)))
    parser.add_argument('--start_position_offset',
                        help='(float) offset start 0-1',
                        type=float,
                        default=float(
                            rospy.get_param("START_POSITION_OFFSET", 0.0)))

    args = parser.parse_args()
    arg_s3_bucket = args.s3_bucket
    arg_s3_prefix = args.s3_prefix
    logger.info("S3 bucket: %s \n S3 prefix: %s \n S3 endpoint URL: %s",
                args.s3_bucket, args.s3_prefix, args.s3_endpoint_url)

    metrics_s3_buckets = rospy.get_param('METRICS_S3_BUCKET')
    metrics_s3_object_keys = rospy.get_param('METRICS_S3_OBJECT_KEY')

    arg_s3_bucket, arg_s3_prefix = utils.force_list(
        arg_s3_bucket), utils.force_list(arg_s3_prefix)
    metrics_s3_buckets = utils.force_list(metrics_s3_buckets)
    metrics_s3_object_keys = utils.force_list(metrics_s3_object_keys)

    validate_list = [
        arg_s3_bucket, arg_s3_prefix, metrics_s3_buckets,
        metrics_s3_object_keys
    ]

    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET', None)
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        simtrace_s3_bucket = utils.force_list(simtrace_s3_bucket)
        simtrace_s3_object_prefix = utils.force_list(simtrace_s3_object_prefix)
        validate_list.extend([simtrace_s3_bucket, simtrace_s3_object_prefix])
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')
        mp4_s3_bucket = utils.force_list(mp4_s3_bucket)
        mp4_s3_object_prefix = utils.force_list(mp4_s3_object_prefix)
        validate_list.extend([mp4_s3_bucket, mp4_s3_object_prefix])

    if not all([lambda x: len(x) == len(validate_list[0]), validate_list]):
        log_and_exit(
            "Eval worker error: Incorrect arguments passed: {}".format(
                validate_list), SIMAPP_SIMULATION_WORKER_EXCEPTION,
            SIMAPP_EVENT_ERROR_CODE_500)
    if args.number_of_resets != 0 and args.number_of_resets < MIN_RESET_COUNT:
        raise GenericRolloutException(
            "number of resets is less than {}".format(MIN_RESET_COUNT))

    # Instantiate Cameras
    if len(arg_s3_bucket) == 1:
        configure_camera(namespaces=['racecar'])
    else:
        configure_camera(namespaces=[
            'racecar_{}'.format(str(agent_index))
            for agent_index in range(len(arg_s3_bucket))
        ])

    agent_list = list()
    s3_bucket_dict = dict()
    s3_prefix_dict = dict()
    s3_writers = list()
    start_positions = get_start_positions(len(arg_s3_bucket))
    done_condition = utils.str_to_done_condition(
        rospy.get_param("DONE_CONDITION", any))
    park_positions = utils.pos_2d_str_to_list(
        rospy.get_param("PARK_POSITIONS", []))
    # if not pass in park positions for all done condition case, use default
    if not park_positions:
        park_positions = [DEFAULT_PARK_POSITION for _ in arg_s3_bucket]
    for agent_index, _ in enumerate(arg_s3_bucket):
        agent_name = 'agent' if len(arg_s3_bucket) == 1 else 'agent_{}'.format(
            str(agent_index))
        racecar_name = 'racecar' if len(
            arg_s3_bucket) == 1 else 'racecar_{}'.format(str(agent_index))
        s3_bucket_dict[agent_name] = arg_s3_bucket[agent_index]
        s3_prefix_dict[agent_name] = arg_s3_prefix[agent_index]

        # download model metadata
        model_metadata = ModelMetadata(
            bucket=arg_s3_bucket[agent_index],
            s3_key=get_s3_key(arg_s3_prefix[agent_index],
                              MODEL_METADATA_S3_POSTFIX),
            region_name=args.aws_region,
            s3_endpoint_url=args.s3_endpoint_url,
            local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format(agent_name))
        _, _, version = model_metadata.get_model_metadata_info()

        # Select the optimal model
        utils.do_model_selection(s3_bucket=arg_s3_bucket[agent_index],
                                 s3_prefix=arg_s3_prefix[agent_index],
                                 region=args.aws_region,
                                 s3_endpoint_url=args.s3_endpoint_url)

        agent_config = {
            'model_metadata': model_metadata,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [
                    link_name.replace('racecar', racecar_name)
                    for link_name in LINK_NAMES
                ],
                ConfigParams.VELOCITY_LIST.value: [
                    velocity_topic.replace('racecar', racecar_name)
                    for velocity_topic in VELOCITY_TOPICS
                ],
                ConfigParams.STEERING_LIST.value: [
                    steering_topic.replace('racecar', racecar_name)
                    for steering_topic in STEERING_TOPICS
                ],
                ConfigParams.CHANGE_START.value:
                utils.str2bool(rospy.get_param('CHANGE_START_POSITION',
                                               False)),
                ConfigParams.ALT_DIR.value:
                utils.str2bool(
                    rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
                ConfigParams.ACTION_SPACE_PATH.value:
                model_metadata.local_path,
                ConfigParams.REWARD.value:
                reward_function,
                ConfigParams.AGENT_NAME.value:
                racecar_name,
                ConfigParams.VERSION.value:
                version,
                ConfigParams.NUMBER_OF_RESETS.value:
                args.number_of_resets,
                ConfigParams.PENALTY_SECONDS.value:
                args.penalty_seconds,
                ConfigParams.NUMBER_OF_TRIALS.value:
                args.number_of_trials,
                ConfigParams.IS_CONTINUOUS.value:
                args.is_continuous,
                ConfigParams.RACE_TYPE.value:
                args.race_type,
                ConfigParams.COLLISION_PENALTY.value:
                args.collision_penalty,
                ConfigParams.OFF_TRACK_PENALTY.value:
                args.off_track_penalty,
                ConfigParams.START_POSITION.value:
                start_positions[agent_index],
                ConfigParams.DONE_CONDITION.value:
                done_condition,
                ConfigParams.ROUND_ROBIN_ADVANCE_DIST.value:
                args.round_robin_advance_dist,
                ConfigParams.START_POSITION_OFFSET.value:
                args.start_position_offset
            }
        }

        metrics_s3_config = {
            MetricsS3Keys.METRICS_BUCKET.value:
            metrics_s3_buckets[agent_index],
            MetricsS3Keys.METRICS_KEY.value:
            metrics_s3_object_keys[agent_index],
            MetricsS3Keys.ENDPOINT_URL.value:
            rospy.get_param('S3_ENDPOINT_URL', None),
            # Replaced rospy.get_param('AWS_REGION') to be equal to the argument being passed
            # or default argument set
            MetricsS3Keys.REGION.value:
            args.aws_region
        }
        aws_region = rospy.get_param('AWS_REGION', args.aws_region)
        s3_writer_job_info = []
        if simtrace_s3_bucket:
            s3_writer_job_info.append(
                IterationData(
                    'simtrace', simtrace_s3_bucket[agent_index],
                    simtrace_s3_object_prefix[agent_index], aws_region,
                    os.path.join(
                        ITERATION_DATA_LOCAL_FILE_PATH, agent_name,
                        IterationDataLocalFileNames.
                        SIM_TRACE_EVALUATION_LOCAL_FILE.value)))
        if mp4_s3_bucket:
            s3_writer_job_info.extend([
                IterationData(
                    'pip', mp4_s3_bucket[agent_index],
                    mp4_s3_object_prefix[agent_index], aws_region,
                    os.path.join(
                        ITERATION_DATA_LOCAL_FILE_PATH, agent_name,
                        IterationDataLocalFileNames.
                        CAMERA_PIP_MP4_VALIDATION_LOCAL_PATH.value)),
                IterationData(
                    '45degree', mp4_s3_bucket[agent_index],
                    mp4_s3_object_prefix[agent_index], aws_region,
                    os.path.join(
                        ITERATION_DATA_LOCAL_FILE_PATH, agent_name,
                        IterationDataLocalFileNames.
                        CAMERA_45DEGREE_MP4_VALIDATION_LOCAL_PATH.value)),
                IterationData(
                    'topview', mp4_s3_bucket[agent_index],
                    mp4_s3_object_prefix[agent_index], aws_region,
                    os.path.join(
                        ITERATION_DATA_LOCAL_FILE_PATH, agent_name,
                        IterationDataLocalFileNames.
                        CAMERA_TOPVIEW_MP4_VALIDATION_LOCAL_PATH.value))
            ])

        s3_writers.append(
            S3Writer(job_info=s3_writer_job_info,
                     s3_endpoint_url=args.s3_endpoint_url))
        run_phase_subject = RunPhaseSubject()
        agent_list.append(
            create_rollout_agent(
                agent_config,
                EvalMetrics(agent_name, metrics_s3_config, args.is_continuous),
                run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())

    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)
    enable_domain_randomization = utils.str2bool(
        rospy.get_param('ENABLE_DOMAIN_RANDOMIZATION', False))

    sm_hyperparams_dict = {}
    graph_manager, _ = get_graph_manager(
        hp_dict=sm_hyperparams_dict,
        agent_list=agent_list,
        run_phase_subject=run_phase_subject,
        enable_domain_randomization=enable_domain_randomization,
        done_condition=done_condition)

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_names=s3_bucket_dict,
        base_checkpoint_dir=args.local_model_directory,
        s3_folders=s3_prefix_dict,
        s3_endpoint_url=args.s3_endpoint_url)

    graph_manager.data_store = S3BotoDataStore(params=ds_params_instance,
                                               graph_manager=graph_manager,
                                               ignore_lock=True)
    graph_manager.env_params.seed = 0

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.local_model_directory

    evaluation_worker(graph_manager=graph_manager,
                      number_of_trials=args.number_of_trials,
                      task_parameters=task_parameters,
                      s3_writers=s3_writers,
                      is_continuous=args.is_continuous,
                      park_positions=park_positions)
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk',
                        '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument(
        '-ek',
        '--environment_s3_key',
        help="(string) Name of an environment file to download from S3",
        type=str,
        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to write the model to.',
        type=str,
        default='./checkpoint')
    parser.add_argument(
        '--pretrained_checkpoint_dir',
        help='(string) Path to a folder for downloading a pre-trained model',
        type=str,
        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get(
                            "SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    args, _ = parser.parse_known_args()

    s3_client = S3Client(region_name=args.aws_region, max_retry_attempts=0)

    # download model metadata
    # TODO: replace 'agent' with name of each agent
    model_metadata_download = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=args.model_metadata_s3_key,
        region_name=args.aws_region,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_info = model_metadata_download.get_model_metadata_info()
    network_type = model_metadata_info[ModelMetadataKeys.NEURAL_NETWORK.value]
    version = model_metadata_info[ModelMetadataKeys.VERSION.value]

    # upload model metadata
    model_metadata_upload = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=get_s3_key(args.s3_prefix, MODEL_METADATA_S3_POSTFIX),
        region_name=args.aws_region,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_upload.persist(
        s3_kms_extra_args=utils.get_s3_kms_extra_args())

    shutil.copy2(model_metadata_download.local_path, SM_MODEL_OUTPUT_DIR)

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        try:
            s3_client.download_file(bucket=args.s3_bucket,
                                    s3_key=args.preset_s3_key,
                                    local_path=preset_local_path)
            success_custom_preset = True
        except botocore.exceptions.ClientError:
            pass
        if not success_custom_preset:
            logger.info(
                "Could not download the preset file. Using the default DeepRacer preset."
            )
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location,
                                                 ignore_module_case=True)
            s3_client.upload_file(
                bucket=args.s3_bucket,
                s3_key=os.path.normpath("%s/presets/preset.py" %
                                        args.s3_prefix),
                local_path=preset_local_path,
                s3_kms_extra_args=utils.get_s3_kms_extra_args())
            if success_custom_preset:
                logger.info("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}

        #! TODO each agent should have own config
        agent_config = {
            'model_metadata': model_metadata_download,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [],
                ConfigParams.VELOCITY_LIST.value: {},
                ConfigParams.STEERING_LIST.value: {},
                ConfigParams.CHANGE_START.value: None,
                ConfigParams.ALT_DIR.value: None,
                ConfigParams.MODEL_METADATA.value: model_metadata_download,
                ConfigParams.REWARD.value: None,
                ConfigParams.AGENT_NAME.value: 'racecar'
            }
        }

        agent_list = list()
        agent_list.append(create_training_agent(agent_config))

        graph_manager, robomaker_hyperparams_json = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=None,
            run_type=str(RunType.TRAINER))

        # Upload hyperparameters to SageMaker shared s3 bucket
        hyperparameters = Hyperparameters(bucket=args.s3_bucket,
                                          s3_key=get_s3_key(
                                              args.s3_prefix,
                                              HYPERPARAMETER_S3_POSTFIX),
                                          region_name=args.aws_region)
        hyperparameters.persist(
            hyperparams_json=robomaker_hyperparams_json,
            s3_kms_extra_args=utils.get_s3_kms_extra_args())

        # Attach sample collector to graph_manager only if sample count > 0
        max_sample_count = int(sm_hyperparams_dict.get("max_sample_count", 0))
        if max_sample_count > 0:
            sample_collector = SampleCollector(
                bucket=args.s3_bucket,
                s3_prefix=args.s3_prefix,
                region_name=args.aws_region,
                max_sample_count=max_sample_count,
                sampling_frequency=int(
                    sm_hyperparams_dict.get("sampling_frequency", 1)))
            graph_manager.sample_collector = sample_collector

    # persist IP config from sagemaker to s3
    ip_config = IpConfig(bucket=args.s3_bucket,
                         s3_prefix=args.s3_prefix,
                         region_name=args.aws_region)
    ip_config.persist(s3_kms_extra_args=utils.get_s3_kms_extra_args())

    training_algorithm = model_metadata_download.training_algorithm
    output_head_format = FROZEN_HEAD_OUTPUT_GRAPH_FORMAT_MAPPING[
        training_algorithm]

    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    # Handle backward compatibility
    if use_pretrained_model:
        # checkpoint s3 instance for pretrained model
        # TODO: replace 'agent' for multiagent training
        checkpoint = Checkpoint(bucket=args.pretrained_s3_bucket,
                                s3_prefix=args.pretrained_s3_prefix,
                                region_name=args.aws_region,
                                agent_name='agent',
                                checkpoint_dir=args.pretrained_checkpoint_dir,
                                output_head_format=output_head_format)
        # make coach checkpoint compatible
        if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible(
        ):
            checkpoint.rl_coach_checkpoint.make_compatible(
                checkpoint.syncfile_ready)
        # get best model checkpoint string
        model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint(
        )
        # Select the best checkpoint model by uploading rl coach .coach_checkpoint file
        checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args())
        # add checkpoint into checkpoint_dict
        checkpoint_dict = {'agent': checkpoint}
        # load pretrained model
        ds_params_instance_pretrained = S3BotoDataStoreParameters(
            checkpoint_dict=checkpoint_dict)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained,
                                                graph_manager, True)
        data_store_pretrained.load_from_store()

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type=str(RunType.TRAINER),
        channel=args.s3_prefix,
        network_type=network_type)

    graph_manager.memory_backend_params = memory_backend_params

    # checkpoint s3 instance for training model
    checkpoint = Checkpoint(bucket=args.s3_bucket,
                            s3_prefix=args.s3_prefix,
                            region_name=args.aws_region,
                            agent_name='agent',
                            checkpoint_dir=args.checkpoint_dir,
                            output_head_format=output_head_format)
    checkpoint_dict = {'agent': checkpoint}
    ds_params_instance = S3BotoDataStoreParameters(
        checkpoint_dict=checkpoint_dict)

    graph_manager.data_store_params = ds_params_instance

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=json.loads(robomaker_hyperparams_json)["batch_size"],
        user_episode_per_rollout=json.loads(
            robomaker_hyperparams_json)["num_episodes_between_training"],
        training_algorithm=training_algorithm)
Example #16
0
def validate(s3_bucket, s3_prefix, aws_region):
    screen.set_use_colors(False)
    screen.log_title(" S3 bucket: {} \n S3 prefix: {}".format(
        s3_bucket, s3_prefix))

    # download model metadata
    model_metadata = ModelMetadata(bucket=s3_bucket,
                                   s3_key=get_s3_key(
                                       s3_prefix, MODEL_METADATA_S3_POSTFIX),
                                   region_name=aws_region,
                                   local_path=MODEL_METADATA_LOCAL_PATH)

    # Create model local path
    os.makedirs(LOCAL_MODEL_DIR)

    try:
        # Handle backward compatibility
        model_metadata_info = model_metadata.get_model_metadata_info()
        observation_list = model_metadata_info[ModelMetadataKeys.SENSOR.value]
        version = model_metadata_info[ModelMetadataKeys.VERSION.value]
    except Exception as ex:
        log_and_exit("Failed to parse model_metadata file: {}".format(ex),
                     SIMAPP_VALIDATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)

    # Below get_transition_data function must called before create_training_agent function
    # to avoid 500 in case unsupported Sensor is received.
    # create_training_agent will exit with 500 if unsupported sensor is received,
    # and get_transition_data function below will exit with 400 if unsupported sensor is received.
    # We want to return 400 in model validation case if unsupported sensor is received.
    # Thus, call this get_transition_data function before create_traning_agent function!
    transitions = get_transition_data(observation_list)

    checkpoint = Checkpoint(bucket=s3_bucket,
                            s3_prefix=s3_prefix,
                            region_name=args.aws_region,
                            agent_name='agent',
                            checkpoint_dir=LOCAL_MODEL_DIR)
    # make coach checkpoint compatible
    if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible(
    ):
        checkpoint.rl_coach_checkpoint.make_compatible(
            checkpoint.syncfile_ready)
    # add checkpoint into checkpoint_dict
    checkpoint_dict = {'agent': checkpoint}

    agent_config = {
        'model_metadata': model_metadata,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value: [],
            ConfigParams.VELOCITY_LIST.value: {},
            ConfigParams.STEERING_LIST.value: {},
            ConfigParams.CHANGE_START.value: None,
            ConfigParams.ALT_DIR.value: None,
            ConfigParams.MODEL_METADATA.value: model_metadata,
            ConfigParams.REWARD.value: None,
            ConfigParams.AGENT_NAME.value: 'racecar'
        }
    }

    agent_list = list()
    agent_list.append(create_training_agent(agent_config))

    sm_hyperparams_dict = {}
    graph_manager, _ = get_graph_manager(hp_dict=sm_hyperparams_dict,
                                         agent_list=agent_list,
                                         run_phase_subject=None)

    ds_params_instance = S3BotoDataStoreParameters(
        checkpoint_dict=checkpoint_dict)

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager,
                                               ignore_lock=True)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = LOCAL_MODEL_DIR
    _validate(graph_manager=graph_manager,
              task_parameters=task_parameters,
              transitions=transitions,
              s3_bucket=s3_bucket,
              s3_prefix=s3_prefix,
              aws_region=aws_region)
Example #17
0
def main():
    """ Main function for evaluation worker """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--preset',
                        help="(string) Name of a preset to run \
                             (class name from the 'presets' directory.)",
                        type=str,
                        required=False)
    parser.add_argument('--s3_bucket',
                        help='list(string) S3 bucket',
                        type=str,
                        nargs='+',
                        default=rospy.get_param("MODEL_S3_BUCKET",
                                                ["gsaur-test"]))
    parser.add_argument('--s3_prefix',
                        help='list(string) S3 prefix',
                        type=str,
                        nargs='+',
                        default=rospy.get_param("MODEL_S3_PREFIX",
                                                ["sagemaker"]))
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--number_of_trials',
                        help='(integer) Number of trials',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_TRIALS", 10)))
    parser.add_argument(
        '-c',
        '--local_model_directory',
        help='(string) Path to a folder containing a checkpoint \
                             to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 2.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "EVALUATION"))
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY",
                                                      2.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY",
                                                      5.0)))

    args = parser.parse_args()
    arg_s3_bucket = args.s3_bucket
    arg_s3_prefix = args.s3_prefix
    logger.info("S3 bucket: %s \n S3 prefix: %s", arg_s3_bucket, arg_s3_prefix)

    metrics_s3_buckets = rospy.get_param('METRICS_S3_BUCKET')
    metrics_s3_object_keys = rospy.get_param('METRICS_S3_OBJECT_KEY')

    arg_s3_bucket, arg_s3_prefix = utils.force_list(
        arg_s3_bucket), utils.force_list(arg_s3_prefix)
    metrics_s3_buckets = utils.force_list(metrics_s3_buckets)
    metrics_s3_object_keys = utils.force_list(metrics_s3_object_keys)

    validate_list = [
        arg_s3_bucket, arg_s3_prefix, metrics_s3_buckets,
        metrics_s3_object_keys
    ]

    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET', None)
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        simtrace_s3_bucket = utils.force_list(simtrace_s3_bucket)
        simtrace_s3_object_prefix = utils.force_list(simtrace_s3_object_prefix)
        validate_list.extend([simtrace_s3_bucket, simtrace_s3_object_prefix])
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')
        mp4_s3_bucket = utils.force_list(mp4_s3_bucket)
        mp4_s3_object_prefix = utils.force_list(mp4_s3_object_prefix)
        validate_list.extend([mp4_s3_bucket, mp4_s3_object_prefix])

    if not all([lambda x: len(x) == len(validate_list[0]), validate_list]):
        log_and_exit(
            "Eval worker error: Incorrect arguments passed: {}".format(
                validate_list), SIMAPP_SIMULATION_WORKER_EXCEPTION,
            SIMAPP_EVENT_ERROR_CODE_500)
    if args.number_of_resets != 0 and args.number_of_resets < MIN_RESET_COUNT:
        raise GenericRolloutException(
            "number of resets is less than {}".format(MIN_RESET_COUNT))

    # Instantiate Cameras
    if len(arg_s3_bucket) == 1:
        configure_camera(namespaces=['racecar'])
    else:
        configure_camera(namespaces=[
            'racecar_{}'.format(str(agent_index))
            for agent_index in range(len(arg_s3_bucket))
        ])

    agent_list = list()
    s3_bucket_dict = dict()
    s3_prefix_dict = dict()
    checkpoint_dict = dict()
    simtrace_video_s3_writers = []
    start_positions = get_start_positions(len(arg_s3_bucket))
    done_condition = utils.str_to_done_condition(
        rospy.get_param("DONE_CONDITION", any))
    park_positions = utils.pos_2d_str_to_list(
        rospy.get_param("PARK_POSITIONS", []))
    # if not pass in park positions for all done condition case, use default
    if not park_positions:
        park_positions = [DEFAULT_PARK_POSITION for _ in arg_s3_bucket]
    for agent_index, _ in enumerate(arg_s3_bucket):
        agent_name = 'agent' if len(arg_s3_bucket) == 1 else 'agent_{}'.format(
            str(agent_index))
        racecar_name = 'racecar' if len(
            arg_s3_bucket) == 1 else 'racecar_{}'.format(str(agent_index))
        s3_bucket_dict[agent_name] = arg_s3_bucket[agent_index]
        s3_prefix_dict[agent_name] = arg_s3_prefix[agent_index]

        # download model metadata
        model_metadata = ModelMetadata(
            bucket=arg_s3_bucket[agent_index],
            s3_key=get_s3_key(arg_s3_prefix[agent_index],
                              MODEL_METADATA_S3_POSTFIX),
            region_name=args.aws_region,
            local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format(agent_name))
        model_metadata_info = model_metadata.get_model_metadata_info()
        version = model_metadata_info[ModelMetadataKeys.VERSION.value]

        # checkpoint s3 instance
        checkpoint = Checkpoint(bucket=arg_s3_bucket[agent_index],
                                s3_prefix=arg_s3_prefix[agent_index],
                                region_name=args.aws_region,
                                agent_name=agent_name,
                                checkpoint_dir=args.local_model_directory)
        # make coach checkpoint compatible
        if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible(
        ):
            checkpoint.rl_coach_checkpoint.make_compatible(
                checkpoint.syncfile_ready)
        # get best model checkpoint string
        model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint(
        )
        # Select the best checkpoint model by uploading rl coach .coach_checkpoint file
        checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args())

        checkpoint_dict[agent_name] = checkpoint

        agent_config = {
            'model_metadata': model_metadata,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [
                    link_name.replace('racecar', racecar_name)
                    for link_name in LINK_NAMES
                ],
                ConfigParams.VELOCITY_LIST.value: [
                    velocity_topic.replace('racecar', racecar_name)
                    for velocity_topic in VELOCITY_TOPICS
                ],
                ConfigParams.STEERING_LIST.value: [
                    steering_topic.replace('racecar', racecar_name)
                    for steering_topic in STEERING_TOPICS
                ],
                ConfigParams.CHANGE_START.value:
                utils.str2bool(rospy.get_param('CHANGE_START_POSITION',
                                               False)),
                ConfigParams.ALT_DIR.value:
                utils.str2bool(
                    rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
                ConfigParams.MODEL_METADATA.value:
                model_metadata,
                ConfigParams.REWARD.value:
                reward_function,
                ConfigParams.AGENT_NAME.value:
                racecar_name,
                ConfigParams.VERSION.value:
                version,
                ConfigParams.NUMBER_OF_RESETS.value:
                args.number_of_resets,
                ConfigParams.PENALTY_SECONDS.value:
                args.penalty_seconds,
                ConfigParams.NUMBER_OF_TRIALS.value:
                args.number_of_trials,
                ConfigParams.IS_CONTINUOUS.value:
                args.is_continuous,
                ConfigParams.RACE_TYPE.value:
                args.race_type,
                ConfigParams.COLLISION_PENALTY.value:
                args.collision_penalty,
                ConfigParams.OFF_TRACK_PENALTY.value:
                args.off_track_penalty,
                ConfigParams.START_POSITION.value:
                start_positions[agent_index],
                ConfigParams.DONE_CONDITION.value:
                done_condition
            }
        }

        metrics_s3_config = {
            MetricsS3Keys.METRICS_BUCKET.value:
            metrics_s3_buckets[agent_index],
            MetricsS3Keys.METRICS_KEY.value:
            metrics_s3_object_keys[agent_index],
            # Replaced rospy.get_param('AWS_REGION') to be equal to the argument being passed
            # or default argument set
            MetricsS3Keys.REGION.value:
            args.aws_region
        }
        aws_region = rospy.get_param('AWS_REGION', args.aws_region)

        if simtrace_s3_bucket:
            simtrace_video_s3_writers.append(
                SimtraceVideo(
                    upload_type=SimtraceVideoNames.SIMTRACE_EVAL.value,
                    bucket=simtrace_s3_bucket[agent_index],
                    s3_prefix=simtrace_s3_object_prefix[agent_index],
                    region_name=aws_region,
                    local_path=SIMTRACE_EVAL_LOCAL_PATH_FORMAT.format(
                        agent_name)))
        if mp4_s3_bucket:
            simtrace_video_s3_writers.extend([
                SimtraceVideo(
                    upload_type=SimtraceVideoNames.PIP.value,
                    bucket=mp4_s3_bucket[agent_index],
                    s3_prefix=mp4_s3_object_prefix[agent_index],
                    region_name=aws_region,
                    local_path=CAMERA_PIP_MP4_LOCAL_PATH_FORMAT.format(
                        agent_name)),
                SimtraceVideo(
                    upload_type=SimtraceVideoNames.DEGREE45.value,
                    bucket=mp4_s3_bucket[agent_index],
                    s3_prefix=mp4_s3_object_prefix[agent_index],
                    region_name=aws_region,
                    local_path=CAMERA_45DEGREE_LOCAL_PATH_FORMAT.format(
                        agent_name)),
                SimtraceVideo(
                    upload_type=SimtraceVideoNames.TOPVIEW.value,
                    bucket=mp4_s3_bucket[agent_index],
                    s3_prefix=mp4_s3_object_prefix[agent_index],
                    region_name=aws_region,
                    local_path=CAMERA_TOPVIEW_LOCAL_PATH_FORMAT.format(
                        agent_name))
            ])

        run_phase_subject = RunPhaseSubject()
        agent_list.append(
            create_rollout_agent(
                agent_config,
                EvalMetrics(agent_name, metrics_s3_config, args.is_continuous),
                run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())

    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)
    enable_domain_randomization = utils.str2bool(
        rospy.get_param('ENABLE_DOMAIN_RANDOMIZATION', False))

    sm_hyperparams_dict = {}

    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)

    graph_manager, _ = get_graph_manager(
        hp_dict=sm_hyperparams_dict,
        agent_list=agent_list,
        run_phase_subject=run_phase_subject,
        enable_domain_randomization=enable_domain_randomization,
        done_condition=done_condition,
        pause_physics=pause_physics,
        unpause_physics=unpause_physics)

    ds_params_instance = S3BotoDataStoreParameters(
        checkpoint_dict=checkpoint_dict)

    graph_manager.data_store = S3BotoDataStore(params=ds_params_instance,
                                               graph_manager=graph_manager,
                                               ignore_lock=True)
    graph_manager.env_params.seed = 0

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.local_model_directory

    evaluation_worker(graph_manager=graph_manager,
                      number_of_trials=args.number_of_trials,
                      task_parameters=task_parameters,
                      simtrace_video_s3_writers=simtrace_video_s3_writers,
                      is_continuous=args.is_continuous,
                      park_positions=park_positions,
                      race_type=args.race_type,
                      pause_physics=pause_physics,
                      unpause_physics=unpause_physics)
Example #18
0
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk',
                        '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument(
        '-ek',
        '--environment_s3_key',
        help="(string) Name of an environment file to download from S3",
        type=str,
        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument(
        '-c',
        '--checkpoint-dir',
        help=
        '(string) Path to a folder containing a checkpoint to write the model to.',
        type=str,
        default='./checkpoint')
    parser.add_argument(
        '--pretrained-checkpoint-dir',
        help='(string) Path to a folder for downloading a pre-trained model',
        type=str,
        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get(
                            "SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    args, unknown = parser.parse_known_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    load_model_metadata(s3_client, args.model_metadata_s3_key,
                        model_metadata_local_path)
    s3_client.upload_file(
        os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix),
        model_metadata_local_path)
    shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

    success_custom_environment = False
    if args.environment_s3_key:
        environment_local_path = "./markov/environments/deepracer_racetrack_env.py"
        success_custom_environment = s3_client.download_file(
            s3_key=args.environment_s3_key, local_path=environment_local_path)
        if not success_custom_environment:
            print(
                "Could not download the environment file. Using the default DeepRacer environment."
            )
        else:
            success_custom_environment = s3_client.upload_file(
                s3_key=os.path.normpath(
                    "%s/environments/deepracer_racetrack_env.py" %
                    args.s3_prefix),
                local_path=environment_local_path)
            if success_custom_environment:
                print("Using environment: %s" % args.environment_s3_key)

    # Import to register the environment with Gym
    import markov.environments

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        success_custom_preset = s3_client.download_file(
            s3_key=args.preset_s3_key, local_path=preset_local_path)
        if not success_custom_preset:
            print(
                "Could not download the preset file. Using the default DeepRacer preset."
            )
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location,
                                                 ignore_module_case=True)
            success_custom_preset = s3_client.upload_file(
                s3_key=os.path.normpath("%s/presets/preset.py" %
                                        args.s3_prefix),
                local_path=preset_local_path)
            if success_custom_preset:
                print("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        from markov.sagemaker_graph_manager import get_graph_manager
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}
        graph_manager, robomaker_hyperparams_json = get_graph_manager(
            **sm_hyperparams_dict)
        s3_client.upload_hyperparameters(robomaker_hyperparams_json)
        print("Uploaded hyperparameters.json to S3")

    host_ip_address = get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    print("Uploaded IP address information to S3: %s" % host_ip_address)

    use_pretrained_model = False
    if args.pretrained_s3_bucket and args.pretrained_s3_prefix:
        s3_client_pretrained = SageS3Client(
            bucket=args.pretrained_s3_bucket,
            s3_prefix=args.pretrained_s3_prefix,
            aws_region=args.aws_region)
        s3_client_pretrained.download_model(args.pretrained_checkpoint_dir)
        use_pretrained_model = True

    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type='trainer',
        channel=args.s3_prefix)

    graph_manager.agent_params.memory.register_var('memory_backend_params',
                                                   memory_backend_params)

    ds_params_instance = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        aws_region=args.aws_region,
        s3_folder=args.s3_prefix)
    graph_manager.data_store_params = ds_params_instance

    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    training_worker(graph_manager=graph_manager,
                    checkpoint_dir=args.checkpoint_dir,
                    use_pretrained_model=use_pretrained_model,
                    framework=args.framework)
Example #19
0
def validate(s3_bucket, s3_prefix, aws_region):
    screen.set_use_colors(False)
    screen.log_title(" S3 bucket: {} \n S3 prefix: {}".format(
        s3_bucket, s3_prefix))

    s3_client = SageS3Client(bucket=s3_bucket,
                             s3_prefix=s3_prefix,
                             aws_region=aws_region)
    # Load the model metadata
    utils.load_model_metadata(
        s3_client,
        os.path.normpath("%s/model/model_metadata.json" % s3_prefix),
        MODEL_METADATA_LOCAL_PATH)

    # Create model local path
    os.makedirs(LOCAL_MODEL_DIR)

    try:
        # Handle backward compatibility
        observation_list, _, version = parse_model_metadata(
            MODEL_METADATA_LOCAL_PATH)
    except Exception as ex:
        log_and_exit("Failed to parse model_metadata file: {}".format(ex),
                     SIMAPP_VALIDATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)

    # Below get_transition_data function must called before create_training_agent function
    # to avoid 500 in case unsupported Sensor is received.
    # create_training_agent will exit with 500 if unsupported sensor is received,
    # and get_transition_data function below will exit with 400 if unsupported sensor is received.
    # We want to return 400 in model validation case if unsupported sensor is received.
    # Thus, call this get_transition_data function before create_traning_agent function!
    transitions = get_transition_data(observation_list)

    if float(version) < float(SIMAPP_VERSION) and \
            not utils.has_current_ckpnt_name(s3_bucket, s3_prefix, aws_region):
        utils.make_compatible(s3_bucket, s3_prefix, aws_region,
                              SyncFiles.TRAINER_READY.value)

    agent_config = {
        'model_metadata': MODEL_METADATA_LOCAL_PATH,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value: [],
            ConfigParams.VELOCITY_LIST.value: {},
            ConfigParams.STEERING_LIST.value: {},
            ConfigParams.CHANGE_START.value: None,
            ConfigParams.ALT_DIR.value: None,
            ConfigParams.ACTION_SPACE_PATH.value: MODEL_METADATA_LOCAL_PATH,
            ConfigParams.REWARD.value: None,
            ConfigParams.AGENT_NAME.value: 'racecar'
        }
    }

    agent_list = list()
    agent_list.append(create_training_agent(agent_config))

    sm_hyperparams_dict = {}
    graph_manager, _ = get_graph_manager(hp_dict=sm_hyperparams_dict,
                                         agent_list=agent_list,
                                         run_phase_subject=None)

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=aws_region,
        bucket_names={'agent': s3_bucket},
        s3_folders={'agent': s3_prefix},
        base_checkpoint_dir=LOCAL_MODEL_DIR)

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager,
                                               ignore_lock=True)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = LOCAL_MODEL_DIR
    _validate(graph_manager=graph_manager,
              task_parameters=task_parameters,
              transitions=transitions,
              s3_bucket=s3_bucket,
              s3_prefix=s3_prefix,
              aws_region=aws_region)