def evaluation_worker(graph_manager, data_store, number_of_trials,
                      task_parameters):
    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, data_store)
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)
    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    # Instantiate Cameras
    configure_camera()

    unpause_physics(EmptyRequest())
    graph_manager.reset_internal_state(True)
    for _ in range(number_of_trials):
        graph_manager.evaluate(EnvironmentSteps(1))

    # Close the down the job
    utils.cancel_simulation_job(
        os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
        rospy.get_param('AWS_REGION'))
Example #2
0
def main():
    """ Main function for tournament"""
    try:
        # parse argument
        s3_region = sys.argv[1]
        s3_bucket = sys.argv[2]
        s3_prefix = sys.argv[3]
        s3_yaml_name = sys.argv[4]

        # create boto3 session/client and download yaml/json file
        session = boto3.session.Session()
        s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", None)
        s3_client = S3Client(region_name=s3_region,
                             s3_endpoint_url=s3_endpoint_url)

        # Intermediate tournament files
        queue_pickle_name = 'tournament_candidate_queue.pkl'
        queue_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, queue_pickle_name))
        local_queue_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), queue_pickle_name))

        report_pickle_name = 'tournament_report.pkl'
        report_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, report_pickle_name))
        local_report_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), report_pickle_name))

        final_report_name = 'tournament_report.json'
        final_report_s3_key = os.path.normpath(
            os.path.join(s3_prefix, final_report_name))

        try:
            s3_client.download_file(bucket=s3_bucket,
                                    s3_key=queue_pickle_s3_key,
                                    local_path=local_queue_pickle_path)

            s3_client.download_file(bucket=s3_bucket,
                                    s3_key=report_pickle_s3_key,
                                    local_path=local_report_pickle_path)
        except:
            pass

        # download yaml file
        yaml_file = YamlFile(
            agent_type=AgentType.TOURNAMENT.value,
            bucket=s3_bucket,
            s3_key=get_s3_key(s3_prefix, s3_yaml_name),
            region_name=s3_region,
            s3_endpoint_url=s3_endpoint_url,
            local_path=YAML_LOCAL_PATH_FORMAT.format(s3_yaml_name))

        yaml_dict = yaml_file.get_yaml_values()

        if os.path.exists(local_queue_pickle_path):
            with open(local_queue_pickle_path, 'rb') as f:
                tournament_candidate_queue = pickle.load(f)
            with open(local_report_pickle_path, 'rb') as f:
                tournament_report = pickle.load(f)
            logger.info('tournament_candidate_queue loaded from existing file')
        else:
            logger.info('tournament_candidate_queue initialized')
            tournament_candidate_queue = deque()
            for agent_idx, _ in enumerate(
                    yaml_dict[YamlKey.MODEL_S3_BUCKET_YAML_KEY.value]):
                tournament_candidate_queue.append((
                    yaml_dict[YamlKey.MODEL_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MODEL_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MODEL_METADATA_FILE_S3_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.METRICS_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.METRICS_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.SIMTRACE_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.SIMTRACE_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MP4_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MP4_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.DISPLAY_NAME_YAML_KEY.value][agent_idx],
                    # TODO: Deprecate the DISPLAY_NAME and use only the RACER_NAME without if else check
                    "" if None in yaml_dict.get(YamlKey.RACER_NAME_YAML_KEY.value, [None]) \
                        else yaml_dict[YamlKey.RACER_NAME_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.BODY_SHELL_TYPE_YAML_KEY.value][agent_idx]
                ))
            tournament_report = {"race_results": []}

        race_idx = len(tournament_report["race_results"])
        while len(tournament_candidate_queue) > 1:
            car1 = tournament_candidate_queue.popleft()
            car2 = tournament_candidate_queue.popleft()
            (car1_model_s3_bucket, car1_s3_prefix, car1_model_metadata,
             car1_metrics_bucket, car1_metrics_s3_key, car1_simtrace_bucket,
             car1_simtrace_prefix, car1_mp4_bucket, car1_mp4_prefix,
             car1_display_name, car1_racer_name, car1_body_shell_type) = car1
            (car2_model_s3_bucket, car2_s3_prefix, car2_model_metadata,
             car2_metrics_bucket, car2_metrics_s3_key, car2_simtrace_bucket,
             car2_simtrace_prefix, car2_mp4_bucket, car2_mp4_prefix,
             car2_display_name, car2_racer_name, car2_body_shell_type) = car2

            race_yaml_dict = generate_race_yaml(yaml_dict=yaml_dict,
                                                car1=car1,
                                                car2=car2,
                                                race_idx=race_idx)

            if s3_endpoint_url is not None:
                race_yaml_dict["S3_ENDPOINT_URL"] = s3_endpoint_url

            race_model_s3_buckets = [
                car1_model_s3_bucket, car2_model_s3_bucket
            ]
            race_model_metadatas = [car1_model_metadata, car2_model_metadata]
            body_shell_types = [car1_body_shell_type, car2_body_shell_type]

            # List of directories created
            dirs_to_delete = list()
            yaml_dir = os.path.abspath(os.path.join(os.getcwd(),
                                                    str(race_idx)))
            os.makedirs(yaml_dir)

            dirs_to_delete.append(yaml_dir)
            race_yaml_path = os.path.abspath(
                os.path.join(yaml_dir, 'evaluation_params.yaml'))
            with open(race_yaml_path, 'w') as race_yaml_file:
                yaml.dump(race_yaml_dict, race_yaml_file)

            # List of racecar names that should include second camera while launching
            racecars_with_stereo_cameras = list()
            # List of racecar names that should include lidar while launching
            racecars_with_lidars = list()
            # List of SimApp versions
            simapp_versions = list()
            for agent_index, model_s3_bucket in enumerate(
                    race_model_s3_buckets):
                racecar_name = 'racecar_' + str(agent_index)
                json_key = race_model_metadatas[agent_index]
                # download model metadata
                try:
                    model_metadata = ModelMetadata(
                        bucket=model_s3_bucket,
                        s3_key=json_key,
                        region_name=s3_region,
                        s3_endpoint_url=s3_endpoint_url,
                        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format(
                            racecar_name))
                    dirs_to_delete.append(model_metadata.local_dir)
                except Exception as e:
                    log_and_exit(
                        "Failed to download model_metadata file: s3_bucket: {}, s3_key: {}, {}"
                        .format(model_s3_bucket, json_key,
                                e), SIMAPP_SIMULATION_WORKER_EXCEPTION,
                        SIMAPP_EVENT_ERROR_CODE_500)
                sensors, _, simapp_version = model_metadata.get_model_metadata_info(
                )
                simapp_versions.append(str(simapp_version))
                if Input.STEREO.value in sensors:
                    racecars_with_stereo_cameras.append(racecar_name)
                if Input.LIDAR.value in sensors or Input.SECTOR_LIDAR.value in sensors:
                    racecars_with_lidars.append(racecar_name)

            cmd = [
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "tournament_race_node.py"),
                str(race_idx), race_yaml_path,
                ','.join(racecars_with_stereo_cameras),
                ','.join(racecars_with_lidars), ','.join(simapp_versions),
                ','.join(body_shell_types)
            ]
            try:
                return_code, _, stderr = run_cmd(cmd_args=cmd,
                                                 shell=False,
                                                 stdout=None,
                                                 stderr=None)
            except KeyboardInterrupt:
                logger.info(
                    "KeyboardInterrupt raised, SimApp must be faulted! exiting..."
                )
                return

            # Retrieve winner and append tournament report
            with open('race_report.pkl', 'rb') as f:
                race_report = pickle.load(f)
            race_report['race_idx'] = race_idx
            winner = car1 if race_report[
                'winner'] == car1_display_name else car2
            logger.info("race {}'s winner: {}".format(race_idx,
                                                      race_report['winner']))

            tournament_candidate_queue.append(winner)
            tournament_report["race_results"].append(race_report)

            # Clean up directories created
            for dir_to_delete in dirs_to_delete:
                shutil.rmtree(dir_to_delete, ignore_errors=True)
            race_idx += 1

            s3_extra_args = get_s3_kms_extra_args()
            # Persist latest queue and report to use after job restarts.
            with open(local_queue_pickle_path, 'wb') as f:
                pickle.dump(tournament_candidate_queue, f, protocol=2)
            s3_client.upload_file(bucket=s3_bucket,
                                  s3_key=queue_pickle_s3_key,
                                  local_path=local_queue_pickle_path,
                                  s3_kms_extra_args=s3_extra_args)

            with open(local_report_pickle_path, 'wb') as f:
                pickle.dump(tournament_report, f, protocol=2)

            s3_client.upload_file(bucket=s3_bucket,
                                  s3_key=report_pickle_s3_key,
                                  local_path=local_report_pickle_path,
                                  s3_kms_extra_args=s3_extra_args)

            # If there is more than 1 candidates then restart the simulation job otherwise
            # tournament is finished, persists final report and ends the job.
            if len(tournament_candidate_queue) > 1:
                restart_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
                break
            else:
                # Persist final tournament report in json format
                # and terminate the job by canceling it
                s3_client.put_object(bucket=s3_bucket,
                                     s3_key=final_report_s3_key,
                                     body=json.dumps(tournament_report),
                                     s3_kms_extra_args=s3_extra_args)

                cancel_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
    except ValueError as ex:
        log_and_exit("User modified model_metadata.json: {}".format(ex),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)
    except Exception as e:
        log_and_exit("Tournament node failed: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
def main():
    screen.set_use_colors(False)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX",
                                                "sagemaker"))
    parser.add_argument(
        '--num_workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=int(rospy.get_param("NUM_WORKERS", 1)))
    parser.add_argument('--rollout_idx',
                        help="(int) The index of current rollout worker",
                        type=int,
                        default=0)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY",
                                                None))
    # For training job, reset is not allowed. penalty_seconds, off_track_penalty, and
    # collision_penalty will all be 0 be default
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 0.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "TRAINING"))
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY",
                                                      0.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY",
                                                      0.0)))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)
    logger.info("S3 bucket: %s", args.s3_bucket)
    logger.info("S3 prefix: %s", args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key,
                              model_metadata_local_path)

    # Download and import reward function
    if not args.reward_file_s3_key:
        log_and_exit(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket, args.s3_prefix),
            SIMAPP_SIMULATION_WORKER_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500)
    download_customer_reward_function(s3_client, args.reward_file_s3_key)

    try:
        from custom_files.customer_reward_function import reward_function
    except Exception as e:
        log_and_exit("Failed to import user's reward_function: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)

    # Instantiate Cameras
    configure_camera(namespaces=['racecar'])

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    #! TODO each agent should have own config
    _, _, version = utils_parse_model_metadata.parse_model_metadata(
        model_metadata_local_path)
    agent_config = {
        'model_metadata': model_metadata_local_path,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value:
            LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value:
            VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value:
            STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
            utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
            utils.str2bool(
                rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.ACTION_SPACE_PATH.value:
            'custom_files/model_metadata.json',
            ConfigParams.REWARD.value:
            reward_function,
            ConfigParams.AGENT_NAME.value:
            'racecar',
            ConfigParams.VERSION.value:
            version,
            ConfigParams.NUMBER_OF_RESETS.value:
            args.number_of_resets,
            ConfigParams.PENALTY_SECONDS.value:
            args.penalty_seconds,
            ConfigParams.NUMBER_OF_TRIALS.value:
            None,
            ConfigParams.IS_CONTINUOUS.value:
            args.is_continuous,
            ConfigParams.RACE_TYPE.value:
            args.race_type,
            ConfigParams.COLLISION_PENALTY.value:
            args.collision_penalty,
            ConfigParams.OFF_TRACK_PENALTY.value:
            args.off_track_penalty
        }
    }

    #! TODO each agent should have own s3 bucket
    step_metrics_prefix = rospy.get_param('SAGEMAKER_SHARED_S3_PREFIX')
    if args.num_workers > 1:
        step_metrics_prefix = os.path.join(step_metrics_prefix,
                                           str(args.rollout_idx))
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value:
        rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value:
        rospy.get_param('METRICS_S3_OBJECT_KEY'),
        MetricsS3Keys.REGION.value: rospy.get_param('AWS_REGION')
    }
    metrics_s3_model_cfg = {
        MetricsS3Keys.METRICS_BUCKET.value:
        args.s3_bucket,
        MetricsS3Keys.METRICS_KEY.value:
        os.path.join(args.s3_prefix, DEEPRACER_CHKPNT_KEY_SUFFIX),
        MetricsS3Keys.REGION.value:
        args.aws_region
    }
    run_phase_subject = RunPhaseSubject()

    agent_list = list()
    agent_list.append(
        create_rollout_agent(
            agent_config,
            TrainingMetrics(agent_name='agent',
                            s3_dict_metrics=metrics_s3_config,
                            s3_dict_model=metrics_s3_model_cfg,
                            ckpnt_dir=args.checkpoint_dir,
                            run_phase_sink=run_phase_subject,
                            use_model_picker=(args.rollout_idx == 0)),
            run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())
    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)

    aws_region = rospy.get_param('AWS_REGION', args.aws_region)
    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET',
                                    None) if args.rollout_idx == 0 else None
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        if args.num_workers > 1:
            simtrace_s3_object_prefix = os.path.join(simtrace_s3_object_prefix,
                                                     str(args.rollout_idx))
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')

    s3_writer_job_info = []
    if simtrace_s3_bucket:
        s3_writer_job_info.append(
            IterationData(
                'simtrace', simtrace_s3_bucket, simtrace_s3_object_prefix,
                aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.SIM_TRACE_TRAINING_LOCAL_FILE.
                    value)))
    if mp4_s3_bucket:
        s3_writer_job_info.extend([
            IterationData(
                'pip', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.
                    CAMERA_PIP_MP4_VALIDATION_LOCAL_PATH.value)),
            IterationData(
                '45degree', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.
                    CAMERA_45DEGREE_MP4_VALIDATION_LOCAL_PATH.value)),
            IterationData(
                'topview', mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.
                    CAMERA_TOPVIEW_MP4_VALIDATION_LOCAL_PATH.value))
        ])

    s3_writer = S3Writer(job_info=s3_writer_job_info)

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s", redis_ip)

    # Download hyperparameters from SageMaker
    hyperparameters_file_success = False
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as filepointer:
            sm_hyperparams_dict = json.load(filepointer)
    else:
        logger.info("SageMaker hyperparameters not found.")

    enable_domain_randomization = utils.str2bool(
        rospy.get_param('ENABLE_DOMAIN_RANDOMIZATION', False))
    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=run_phase_subject,
            enable_domain_randomization=enable_domain_randomization)

    # If num_episodes_between_training is smaller than num_workers then cancel worker early.
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    # Reduce number of workers if allocated more than num_episodes_between_training
    if args.num_workers > episode_steps_per_rollout:
        logger.info(
            "Excess worker allocated. Reducing from {} to {}...".format(
                args.num_workers, episode_steps_per_rollout))
        args.num_workers = episode_steps_per_rollout
    if args.rollout_idx >= episode_steps_per_rollout or args.rollout_idx >= args.num_workers:
        err_msg_format = "Exiting excess worker..."
        err_msg_format += "(rollout_idx[{}] >= num_workers[{}] or num_episodes_between_training[{}])"
        logger.info(
            err_msg_format.format(args.rollout_idx, args.num_workers,
                                  episode_steps_per_rollout))
        # Close the down the job
        utils.cancel_simulation_job(
            os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
            rospy.get_param('AWS_REGION'))

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix,
        num_workers=args.num_workers,
        rollout_idx=args.rollout_idx)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_names={'agent': args.s3_bucket},
        base_checkpoint_dir=args.checkpoint_dir,
        s3_folders={'agent': args.s3_prefix})

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   num_workers=args.num_workers,
                   rollout_idx=args.rollout_idx,
                   task_parameters=task_parameters,
                   s3_writer=s3_writer)
def evaluation_worker(graph_manager, number_of_trials, task_parameters,
                      s3_writers, is_continuous):
    """ Evaluation worker function

    Arguments:
        graph_manager {[MultiAgentGraphManager]} -- [Graph manager of multiagent graph manager]
        number_of_trials {[int]} -- [Number of trails you want to run the evaluation]
        task_parameters {[TaskParameters]} -- [Information of the checkpoint, gpu/cpu, framework etc of rlcoach]
        s3_writers {[S3Writer]} -- [Information to upload to the S3 bucket all the simtrace and mp4]
        is_continuous {bool} -- [The termination condition for the car]
    """
    checkpoint_dirs = list()
    agent_names = list()
    subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(
    ), list()
    subscribe_to_save_mp4, unsubscribe_from_save_mp4 = list(), list()
    for agent_param in graph_manager.agents_params:
        _checkpoint_dir = task_parameters.checkpoint_restore_path if len(graph_manager.agents_params) == 1\
            else os.path.join(task_parameters.checkpoint_restore_path, agent_param.name)
        agent_names.append(agent_param.name)
        checkpoint_dirs.append(_checkpoint_dir)
        racecar_name = 'racecar' if len(agent_param.name.split("_")) == 1\
                                 else "racecar_{}".format(agent_param.name.split("_")[1])
        subscribe_to_save_mp4_topic.append(
            "/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name))
        unsubscribe_from_save_mp4_topic.append(
            "/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name))
    wait_for_checkpoints(checkpoint_dirs, graph_manager.data_store)
    modify_checkpoint_variables(checkpoint_dirs, agent_names)

    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)

    for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                  unsubscribe_from_save_mp4_topic):
        rospy.wait_for_service(mp4_sub)
        rospy.wait_for_service(mp4_unsub)
    for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                  unsubscribe_from_save_mp4_topic):
        subscribe_to_save_mp4.append(ServiceProxyWrapper(mp4_sub, Empty))
        unsubscribe_from_save_mp4.append(ServiceProxyWrapper(mp4_unsub, Empty))

    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)
    logger.info(
        "Graph manager successfully created the graph: Unpausing physics")
    unpause_physics(EmptyRequest())
    graph_manager.reset_internal_state(True)

    is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None)
    if is_save_mp4_enabled:
        for subscribe_mp4 in subscribe_to_save_mp4:
            subscribe_mp4(EmptyRequest())
    if is_continuous:
        graph_manager.evaluate(EnvironmentSteps(1))
    else:
        for _ in range(number_of_trials):
            graph_manager.evaluate(EnvironmentSteps(1))
    if is_save_mp4_enabled:
        for unsubscribe_mp4 in unsubscribe_from_save_mp4:
            unsubscribe_mp4(EmptyRequest())
    for s3_writer in s3_writers:
        s3_writer.upload_to_s3()
    time.sleep(1)
    pause_physics(EmptyRequest())
    # Close the down the job
    utils.cancel_simulation_job(
        os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
        rospy.get_param('AWS_REGION'))
def main():
    """ Main function for tournament"""
    try:
        # parse argument
        s3_region = sys.argv[1]
        s3_bucket = sys.argv[2]
        s3_prefix = sys.argv[3]
        s3_yaml_name = sys.argv[4]

        # create boto3 session/client and download yaml/json file
        session = boto3.session.Session()
        s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", None)
        s3_client = session.client('s3',
                                   region_name=s3_region,
                                   endpoint_url=s3_endpoint_url,
                                   config=get_boto_config())

        yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name))
        local_yaml_path = os.path.abspath(
            os.path.join(os.getcwd(), s3_yaml_name))
        s3_client.download_file(Bucket=s3_bucket,
                                Key=yaml_key,
                                Filename=local_yaml_path)

        # Intermediate tournament files
        queue_pickle_name = 'tournament_candidate_queue.pkl'
        queue_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, queue_pickle_name))
        local_queue_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), queue_pickle_name))

        report_pickle_name = 'tournament_report.pkl'
        report_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, report_pickle_name))
        local_report_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), report_pickle_name))

        final_report_name = 'tournament_report.json'
        final_report_s3_key = os.path.normpath(
            os.path.join(s3_prefix, final_report_name))

        try:
            s3_client.download_file(Bucket=s3_bucket,
                                    Key=queue_pickle_s3_key,
                                    Filename=local_queue_pickle_path)
            s3_client.download_file(Bucket=s3_bucket,
                                    Key=report_pickle_s3_key,
                                    Filename=local_report_pickle_path)
        except:
            pass

        # Get values passed in yaml files. Default values are for backward compatibility and for single racecar racing
        yaml_dict = get_yaml_dict(local_yaml_path)

        # Forcing the yaml parameter to list
        force_list_params = [
            MODEL_S3_BUCKET_YAML_KEY, MODEL_S3_PREFIX_YAML_KEY,
            MODEL_METADATA_FILE_S3_YAML_KEY, METRICS_S3_BUCKET_YAML_KEY,
            METRICS_S3_PREFIX_YAML_KEY, SIMTRACE_S3_BUCKET_YAML_KEY,
            SIMTRACE_S3_PREFIX_YAML_KEY, MP4_S3_BUCKET_YAML_KEY,
            MP4_S3_PREFIX_YAML_KEY, DISPLAY_NAME_YAML_KEY
        ]
        for params in force_list_params:
            yaml_dict[params] = force_list(yaml_dict.get(params, None))

        # Populate the model_metadata_s3_key values to handle both training and evaluation for all race_formats
        if None in yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY]:
            # MODEL_METADATA_FILE_S3_KEY not passed as part of yaml file ==> This happens during evaluation
            # Assume model_metadata.json is present in the s3_prefix/model/ folder
            yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY] = list()
            for s3_prefix in yaml_dict[MODEL_S3_PREFIX_YAML_KEY]:
                yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY].append(
                    os.path.join(s3_prefix, 'model/model_metadata.json'))

        # Validate the yaml values
        validate_yaml_values(yaml_dict)
        if os.path.exists(local_queue_pickle_path):
            with open(local_queue_pickle_path, 'rb') as f:
                tournament_candidate_queue = pickle.load(f)
            with open(local_report_pickle_path, 'rb') as f:
                tournament_report = pickle.load(f)
            logger.info('tournament_candidate_queue loaded from existing file')
        else:
            logger.info('tournament_candidate_queue initialized')
            tournament_candidate_queue = deque()
            for agent_idx, _ in enumerate(yaml_dict[MODEL_S3_BUCKET_YAML_KEY]):
                tournament_candidate_queue.append(
                    (yaml_dict[MODEL_S3_BUCKET_YAML_KEY][agent_idx],
                     yaml_dict[MODEL_S3_PREFIX_YAML_KEY][agent_idx],
                     yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY][agent_idx],
                     yaml_dict[METRICS_S3_BUCKET_YAML_KEY][agent_idx],
                     yaml_dict[METRICS_S3_PREFIX_YAML_KEY][agent_idx],
                     yaml_dict[SIMTRACE_S3_BUCKET_YAML_KEY][agent_idx],
                     yaml_dict[SIMTRACE_S3_PREFIX_YAML_KEY][agent_idx],
                     yaml_dict[MP4_S3_BUCKET_YAML_KEY][agent_idx],
                     yaml_dict[MP4_S3_PREFIX_YAML_KEY][agent_idx],
                     yaml_dict[DISPLAY_NAME_YAML_KEY][agent_idx]))
            tournament_report = []

        race_idx = len(tournament_report)
        while len(tournament_candidate_queue) > 1:
            car1 = tournament_candidate_queue.popleft()
            car2 = tournament_candidate_queue.popleft()
            (car1_model_s3_bucket, car1_s3_prefix, car1_model_metadata,
             car1_metrics_bucket, car1_metrics_s3_key, car1_simtrace_bucket,
             car1_simtrace_prefix, car1_mp4_bucket, car1_mp4_prefix,
             car1_display_name) = car1
            (car2_model_s3_bucket, car2_s3_prefix, car2_model_metadata,
             car2_metrics_bucket, car2_metrics_s3_key, car2_simtrace_bucket,
             car2_simtrace_prefix, car2_mp4_bucket, car2_mp4_prefix,
             car2_display_name) = car2

            race_yaml_dict = generate_race_yaml(yaml_dict=yaml_dict,
                                                car1=car1,
                                                car2=car2,
                                                race_idx=race_idx)

            race_car_colors = ["Orange", "Purple"]
            race_model_s3_buckets = [
                car1_model_s3_bucket, car2_model_s3_bucket
            ]
            race_model_metadatas = [car1_model_metadata, car2_model_metadata]

            # List of directories created
            dirs_to_delete = list()
            yaml_dir = os.path.abspath(os.path.join(os.getcwd(),
                                                    str(race_idx)))
            os.makedirs(yaml_dir)

            dirs_to_delete.append(yaml_dir)
            race_yaml_path = os.path.abspath(
                os.path.join(yaml_dir, 'evaluation_params.yaml'))
            with open(race_yaml_path, 'w') as race_yaml_file:
                yaml.dump(race_yaml_dict, race_yaml_file)

            # List of racecar names that should include second camera while launching
            racecars_with_stereo_cameras = list()
            # List of racecar names that should include lidar while launching
            racecars_with_lidars = list()
            # List of SimApp versions
            simapp_versions = list()
            for agent_index, model_s3_bucket in enumerate(
                    race_model_s3_buckets):
                racecar_name = 'racecar_' + str(agent_index)
                # Make a local folder with the racecar name to download the model_metadata.json
                os.makedirs(os.path.join(os.getcwd(), racecar_name))
                dirs_to_delete.append(os.path.join(os.getcwd(), racecar_name))
                local_model_metadata_path = os.path.abspath(
                    os.path.join(os.path.join(os.getcwd(), racecar_name),
                                 'model_metadata.json'))
                json_key = race_model_metadatas[agent_index]
                json_key = json_key.replace('s3://{}/'.format(model_s3_bucket),
                                            '')
                s3_client.download_file(Bucket=model_s3_bucket,
                                        Key=json_key,
                                        Filename=local_model_metadata_path)
                sensors, _, simapp_version = utils_parse_model_metadata.parse_model_metadata(
                    local_model_metadata_path)
                simapp_versions.append(simapp_version)
                if Input.STEREO.value in sensors:
                    racecars_with_stereo_cameras.append(racecar_name)
                if Input.LIDAR.value in sensors or Input.SECTOR_LIDAR.value in sensors:
                    racecars_with_lidars.append(racecar_name)

            cmd = [
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "tournament_race_node.py"),
                str(race_idx), race_yaml_path,
                ','.join(racecars_with_stereo_cameras),
                ','.join(racecars_with_lidars), ','.join(race_car_colors),
                ','.join(simapp_versions)
            ]
            try:
                return_code, _, stderr = run_cmd(cmd_args=cmd,
                                                 shell=False,
                                                 stdout=None,
                                                 stderr=None)
            except KeyboardInterrupt:
                logger.info(
                    "KeyboardInterrupt raised, SimApp must be faulted! exiting..."
                )
                return

            # Retrieve winner and append tournament report
            with open('race_report.pkl', 'rb') as f:
                race_report = pickle.load(f)
            race_report['race_idx'] = race_idx
            winner = car1 if race_report[
                'winner'] == car1_display_name else car2
            logger.info("race {}'s winner: {}".format(race_idx,
                                                      race_report['winner']))

            tournament_candidate_queue.append(winner)
            tournament_report.append(race_report)

            # Clean up directories created
            for dir_to_delete in dirs_to_delete:
                shutil.rmtree(dir_to_delete, ignore_errors=True)
            race_idx += 1

            # Persist latest queue and report to use after job restarts.
            with open(local_queue_pickle_path, 'wb') as f:
                pickle.dump(tournament_candidate_queue, f, protocol=2)
            s3_client.upload_file(Filename=local_queue_pickle_path,
                                  Bucket=s3_bucket,
                                  Key=queue_pickle_s3_key)

            with open(local_report_pickle_path, 'wb') as f:
                pickle.dump(tournament_report, f, protocol=2)
            s3_client.upload_file(Filename=local_report_pickle_path,
                                  Bucket=s3_bucket,
                                  Key=report_pickle_s3_key)

            # If there is more than 1 candidates then restart the simulation job otherwise
            # tournament is finished, persists final report and ends the job.
            if len(tournament_candidate_queue) > 1:
                restart_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
                break
            else:
                # Persist final tournament report in json format
                # and terminate the job by canceling it
                s3_client.put_object(Bucket=s3_bucket,
                                     Key=final_report_s3_key,
                                     Body=json.dumps(tournament_report))

                cancel_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
    except Exception as e:
        log_and_exit(
            "Tournament node failed: s3_bucket: {}, yaml_key: {}, {}".format(
                s3_bucket, yaml_key, e), SIMAPP_SIMULATION_WORKER_EXCEPTION,
            SIMAPP_EVENT_ERROR_CODE_500)
Example #6
0
def main():
    screen.set_use_colors(False)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX",
                                                "sagemaker"))
    parser.add_argument(
        '--num_workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=int(rospy.get_param("NUM_WORKERS", 1)))
    parser.add_argument('--rollout_idx',
                        help="(int) The index of current rollout worker",
                        type=int,
                        default=0)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY",
                                                None))
    # For training job, reset is not allowed. penalty_seconds, off_track_penalty, and
    # collision_penalty will all be 0 be default
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 0.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "TRAINING"))
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY",
                                                      0.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY",
                                                      0.0)))

    args = parser.parse_args()

    logger.info("S3 bucket: %s", args.s3_bucket)
    logger.info("S3 prefix: %s", args.s3_prefix)

    # Download and import reward function
    # TODO: replace 'agent' with name of each agent for multi-agent training
    reward_function_file = RewardFunction(
        bucket=args.s3_bucket,
        s3_key=args.reward_file_s3_key,
        region_name=args.aws_region,
        local_path=REWARD_FUCTION_LOCAL_PATH_FORMAT.format('agent'))
    reward_function = reward_function_file.get_reward_function()

    # Instantiate Cameras
    configure_camera(namespaces=['racecar'])

    preset_file_success, _ = download_custom_files_if_present(
        s3_bucket=args.s3_bucket,
        s3_prefix=args.s3_prefix,
        aws_region=args.aws_region)

    # download model metadata
    # TODO: replace 'agent' with name of each agent
    model_metadata = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=args.model_metadata_s3_key,
        region_name=args.aws_region,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_info = model_metadata.get_model_metadata_info()
    version = model_metadata_info[ModelMetadataKeys.VERSION.value]

    agent_config = {
        'model_metadata': model_metadata,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value:
            LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value:
            VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value:
            STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
            utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
            utils.str2bool(
                rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.MODEL_METADATA.value:
            model_metadata,
            ConfigParams.REWARD.value:
            reward_function,
            ConfigParams.AGENT_NAME.value:
            'racecar',
            ConfigParams.VERSION.value:
            version,
            ConfigParams.NUMBER_OF_RESETS.value:
            args.number_of_resets,
            ConfigParams.PENALTY_SECONDS.value:
            args.penalty_seconds,
            ConfigParams.NUMBER_OF_TRIALS.value:
            None,
            ConfigParams.IS_CONTINUOUS.value:
            args.is_continuous,
            ConfigParams.RACE_TYPE.value:
            args.race_type,
            ConfigParams.COLLISION_PENALTY.value:
            args.collision_penalty,
            ConfigParams.OFF_TRACK_PENALTY.value:
            args.off_track_penalty
        }
    }

    #! TODO each agent should have own s3 bucket
    metrics_key = rospy.get_param('METRICS_S3_OBJECT_KEY')
    if args.num_workers > 1 and args.rollout_idx > 0:
        key_tuple = os.path.splitext(metrics_key)
        metrics_key = "{}_{}{}".format(key_tuple[0], str(args.rollout_idx),
                                       key_tuple[1])
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value:
        rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value: metrics_key,
        MetricsS3Keys.REGION.value: rospy.get_param('AWS_REGION')
    }

    run_phase_subject = RunPhaseSubject()

    agent_list = list()

    #TODO: replace agent for multi agent training
    # checkpoint s3 instance
    # TODO replace agent with agent_0 and so on for multiagent case
    checkpoint = Checkpoint(bucket=args.s3_bucket,
                            s3_prefix=args.s3_prefix,
                            region_name=args.aws_region,
                            agent_name='agent',
                            checkpoint_dir=args.checkpoint_dir)

    agent_list.append(
        create_rollout_agent(
            agent_config,
            TrainingMetrics(
                agent_name='agent',
                s3_dict_metrics=metrics_s3_config,
                deepracer_checkpoint_json=checkpoint.deepracer_checkpoint_json,
                ckpnt_dir=os.path.join(args.checkpoint_dir, 'agent'),
                run_phase_sink=run_phase_subject,
                use_model_picker=(args.rollout_idx == 0)), run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())
    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)

    aws_region = rospy.get_param('AWS_REGION', args.aws_region)
    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET',
                                    None) if args.rollout_idx == 0 else None
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        if args.num_workers > 1:
            simtrace_s3_object_prefix = os.path.join(simtrace_s3_object_prefix,
                                                     str(args.rollout_idx))
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')

    simtrace_video_s3_writers = []
    #TODO: replace 'agent' with 'agent_0' for multi agent training and
    # mp4_s3_object_prefix, mp4_s3_bucket will be a list, so need to access with index
    if simtrace_s3_bucket:
        simtrace_video_s3_writers.append(
            SimtraceVideo(
                upload_type=SimtraceVideoNames.SIMTRACE_TRAINING.value,
                bucket=simtrace_s3_bucket,
                s3_prefix=simtrace_s3_object_prefix,
                region_name=aws_region,
                local_path=SIMTRACE_TRAINING_LOCAL_PATH_FORMAT.format(
                    'agent')))
    if mp4_s3_bucket:
        simtrace_video_s3_writers.extend([
            SimtraceVideo(
                upload_type=SimtraceVideoNames.PIP.value,
                bucket=mp4_s3_bucket,
                s3_prefix=mp4_s3_object_prefix,
                region_name=aws_region,
                local_path=CAMERA_PIP_MP4_LOCAL_PATH_FORMAT.format('agent')),
            SimtraceVideo(
                upload_type=SimtraceVideoNames.DEGREE45.value,
                bucket=mp4_s3_bucket,
                s3_prefix=mp4_s3_object_prefix,
                region_name=aws_region,
                local_path=CAMERA_45DEGREE_LOCAL_PATH_FORMAT.format('agent')),
            SimtraceVideo(
                upload_type=SimtraceVideoNames.TOPVIEW.value,
                bucket=mp4_s3_bucket,
                s3_prefix=mp4_s3_object_prefix,
                region_name=aws_region,
                local_path=CAMERA_TOPVIEW_LOCAL_PATH_FORMAT.format('agent'))
        ])

    # TODO: replace 'agent' with specific agent name for multi agent training
    ip_config = IpConfig(bucket=args.s3_bucket,
                         s3_prefix=args.s3_prefix,
                         region_name=args.aws_region,
                         local_path=IP_ADDRESS_LOCAL_PATH.format('agent'))
    redis_ip = ip_config.get_ip_config()

    # Download hyperparameters from SageMaker shared s3 bucket
    # TODO: replace 'agent' with name of each agent
    hyperparameters = Hyperparameters(
        bucket=args.s3_bucket,
        s3_key=get_s3_key(args.s3_prefix, HYPERPARAMETER_S3_POSTFIX),
        region_name=args.aws_region,
        local_path=HYPERPARAMETER_LOCAL_PATH_FORMAT.format('agent'))
    sm_hyperparams_dict = hyperparameters.get_hyperparameters_dict()

    enable_domain_randomization = utils.str2bool(
        rospy.get_param('ENABLE_DOMAIN_RANDOMIZATION', False))
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=run_phase_subject,
            enable_domain_randomization=enable_domain_randomization,
            pause_physics=pause_physics,
            unpause_physics=unpause_physics)

    # If num_episodes_between_training is smaller than num_workers then cancel worker early.
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    # Reduce number of workers if allocated more than num_episodes_between_training
    if args.num_workers > episode_steps_per_rollout:
        logger.info(
            "Excess worker allocated. Reducing from {} to {}...".format(
                args.num_workers, episode_steps_per_rollout))
        args.num_workers = episode_steps_per_rollout
    if args.rollout_idx >= episode_steps_per_rollout or args.rollout_idx >= args.num_workers:
        err_msg_format = "Exiting excess worker..."
        err_msg_format += "(rollout_idx[{}] >= num_workers[{}] or num_episodes_between_training[{}])"
        logger.info(
            err_msg_format.format(args.rollout_idx, args.num_workers,
                                  episode_steps_per_rollout))
        # Close the down the job
        utils.cancel_simulation_job()

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=6379,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix,
        num_workers=args.num_workers,
        rollout_idx=args.rollout_idx)

    graph_manager.memory_backend_params = memory_backend_params

    checkpoint_dict = {'agent': checkpoint}
    ds_params_instance = S3BotoDataStoreParameters(
        checkpoint_dict=checkpoint_dict)

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   num_workers=args.num_workers,
                   rollout_idx=args.rollout_idx,
                   task_parameters=task_parameters,
                   simtrace_video_s3_writers=simtrace_video_s3_writers,
                   pause_physics=pause_physics,
                   unpause_physics=unpause_physics)
Example #7
0
def evaluation_worker(graph_manager, number_of_trials, task_parameters,
                      simtrace_video_s3_writers, is_continuous, park_positions,
                      race_type, pause_physics, unpause_physics):
    """ Evaluation worker function

    Arguments:
        graph_manager(MultiAgentGraphManager): Graph manager of multiagent graph manager
        number_of_trials(int): Number of trails you want to run the evaluation
        task_parameters(TaskParameters): Information of the checkpoint, gpu/cpu,
            framework etc of rlcoach
        simtrace_video_s3_writers(list): Information to upload to the S3 bucket all the simtrace and mp4
        is_continuous(bool): The termination condition for the car
        park_positions(list of tuple): list of (x, y) for cars to park at
        race_type (str): race type
    """
    # Collect profiler information only IS_PROFILER_ON is true
    with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                        s3_prefix=PROFILER_S3_PREFIX,
                        output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                        enable_profiling=IS_PROFILER_ON):
        subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(
        ), list()
        subscribe_to_save_mp4, unsubscribe_from_save_mp4 = list(), list()
        for agent_param in graph_manager.agents_params:
            racecar_name = 'racecar' if len(agent_param.name.split("_")) == 1 \
                                     else "racecar_{}".format(agent_param.name.split("_")[1])
            subscribe_to_save_mp4_topic.append(
                "/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name))
            unsubscribe_from_save_mp4_topic.append(
                "/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name))
        graph_manager.data_store.wait_for_checkpoints()
        graph_manager.data_store.modify_checkpoint_variables()

        # wait for the required cancel services to become available
        if race_type != RaceType.F1.value:
            # TODO: Since we are not running Grand Prix in RoboMaker,
            # we are opting out from waiting for RoboMaker's cancel job service
            # in case of Grand Prix execution.
            # Otherwise, SimApp will hang as service will never come alive.
            #
            # If we don't depend on RoboMaker anymore in the future,
            # we need to remove below line, or do a better job to figure out
            # whether we are running on RoboMaker or not to decide whether
            # we should wait for below service or not.
            rospy.wait_for_service('/robomaker/job/cancel')

        # Make the clients that will allow us to pause and unpause the physics
        rospy.wait_for_service('/gazebo/pause_physics_dr')
        rospy.wait_for_service('/gazebo/unpause_physics_dr')
        pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
        unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr',
                                              Empty)

        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            rospy.wait_for_service(mp4_sub)
            rospy.wait_for_service(mp4_unsub)
        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            subscribe_to_save_mp4.append(ServiceProxyWrapper(mp4_sub, Empty))
            unsubscribe_from_save_mp4.append(
                Thread(target=ServiceProxyWrapper(mp4_unsub, Empty),
                       args=(EmptyRequest(), )))

        graph_manager.create_graph(task_parameters=task_parameters,
                                   stop_physics=pause_physics,
                                   start_physics=unpause_physics,
                                   empty_service_call=EmptyRequest)
        logger.info(
            "Graph manager successfully created the graph: Unpausing physics")
        unpause_physics(EmptyRequest())

        is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None)
        if is_save_mp4_enabled:
            for subscribe_mp4 in subscribe_to_save_mp4:
                subscribe_mp4(EmptyRequest())

        configure_environment_randomizer()
        track_data = TrackData.get_instance()

        # Before each evaluation episode (single lap for non-continuous race and complete race for
        # continuous race), a new copy of park_positions needs to be loaded into track_data because
        # a park position will be pop from park_positions when a racer car need to be parked.
        if is_continuous:
            track_data.park_positions = park_positions
            graph_manager.evaluate(EnvironmentSteps(1))
        else:
            for _ in range(number_of_trials):
                track_data.park_positions = park_positions
                graph_manager.evaluate(EnvironmentSteps(1))
        if is_save_mp4_enabled:
            for unsubscribe_mp4 in unsubscribe_from_save_mp4:
                unsubscribe_mp4.start()
            for unsubscribe_mp4 in unsubscribe_from_save_mp4:
                unsubscribe_mp4.join()
        # upload simtrace and mp4 into s3 bucket
        for s3_writer in simtrace_video_s3_writers:
            s3_writer.persist(utils.get_s3_kms_extra_args())
        time.sleep(1)
        pause_physics(EmptyRequest())

    if race_type != RaceType.F1.value:
        # Close the down the job
        utils.cancel_simulation_job()
Example #8
0
def evaluation_worker(graph_manager, number_of_trials, task_parameters,
                      simtrace_video_s3_writers, is_continuous,
                      park_positions):
    """ Evaluation worker function

    Arguments:
        graph_manager(MultiAgentGraphManager): Graph manager of multiagent graph manager
        number_of_trials(int): Number of trails you want to run the evaluation
        task_parameters(TaskParameters): Information of the checkpoint, gpu/cpu,
            framework etc of rlcoach
        simtrace_video_s3_writers(list): Information to upload to the S3 bucket all the simtrace and mp4
        is_continuous(bool): The termination condition for the car
        park_positions(list of tuple): list of (x, y) for cars to park at
    """
    # Collect profiler information only IS_PROFILER_ON is true
    with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                        s3_prefix=PROFILER_S3_PREFIX,
                        output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                        enable_profiling=IS_PROFILER_ON):
        subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(
        ), list()
        subscribe_to_save_mp4, unsubscribe_from_save_mp4 = list(), list()
        for agent_param in graph_manager.agents_params:
            racecar_name = 'racecar' if len(agent_param.name.split("_")) == 1 \
                                     else "racecar_{}".format(agent_param.name.split("_")[1])
            subscribe_to_save_mp4_topic.append(
                "/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name))
            unsubscribe_from_save_mp4_topic.append(
                "/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name))
        graph_manager.data_store.wait_for_checkpoints()
        graph_manager.data_store.modify_checkpoint_variables()

        # Make the clients that will allow us to pause and unpause the physics
        rospy.wait_for_service('/gazebo/pause_physics_dr')
        rospy.wait_for_service('/gazebo/unpause_physics_dr')
        pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
        unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr',
                                              Empty)

        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            rospy.wait_for_service(mp4_sub)
            rospy.wait_for_service(mp4_unsub)
        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            subscribe_to_save_mp4.append(ServiceProxyWrapper(mp4_sub, Empty))
            unsubscribe_from_save_mp4.append(
                ServiceProxyWrapper(mp4_unsub, Empty))

        graph_manager.create_graph(task_parameters=task_parameters,
                                   stop_physics=pause_physics,
                                   start_physics=unpause_physics,
                                   empty_service_call=EmptyRequest)
        logger.info(
            "Graph manager successfully created the graph: Unpausing physics")
        unpause_physics(EmptyRequest())

        is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None)
        if is_save_mp4_enabled:
            for subscribe_mp4 in subscribe_to_save_mp4:
                subscribe_mp4(EmptyRequest())

        configure_environment_randomizer()
        track_data = TrackData.get_instance()

        # Before each evaluation episode (single lap for non-continuous race and complete race for
        # continuous race), a new copy of park_positions needs to be loaded into track_data because
        # a park position will be pop from park_positions when a racer car need to be parked.
        if is_continuous:
            track_data.park_positions = park_positions
            graph_manager.evaluate(EnvironmentSteps(1))
        else:
            for _ in range(number_of_trials):
                track_data.park_positions = park_positions
                graph_manager.evaluate(EnvironmentSteps(1))
        if is_save_mp4_enabled:
            for unsubscribe_mp4 in unsubscribe_from_save_mp4:
                unsubscribe_mp4(EmptyRequest())
        # upload simtrace and mp4 into s3 bucket
        for s3_writer in simtrace_video_s3_writers:
            s3_writer.persist(utils.get_s3_kms_extra_args())
        time.sleep(1)
        pause_physics(EmptyRequest())

    # Close the down the job
    utils.cancel_simulation_job(
        os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
        rospy.get_param('AWS_REGION'))
Example #9
0
def main():
    """ Main function for virutal event manager """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--queue_url',
        help='the sqs queue url to receive next racer information',
        type=str,
        default=str(rospy.get_param("SQS_QUEUE_URL", "sqs_queue_url")))
    parser.add_argument('--race_duration',
                        help='the length of the race in seconds.',
                        type=int,
                        default=int(
                            rospy.get_param("RACE_DURATION",
                                            DEFAULT_RACE_DURATION)))
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--number_of_trials',
                        help='(integer) Number of trials',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_TRIALS", 3)))
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 2.0)))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY",
                                                      2.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY",
                                                      5.0)))
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--body_shell_type',
                        help='(string) body shell type',
                        type=str,
                        default=rospy.get_param("BODY_SHELL_TYPE",
                                                "deepracer"))

    args = parser.parse_args()
    manager = VirtualEventManager(queue_url=args.queue_url,
                                  aws_region=args.aws_region,
                                  race_duration=args.race_duration,
                                  number_of_trials=args.number_of_trials,
                                  number_of_resets=args.number_of_resets,
                                  penalty_seconds=args.penalty_seconds,
                                  off_track_penalty=args.off_track_penalty,
                                  collision_penalty=args.collision_penalty,
                                  is_continuous=args.is_continuous,
                                  race_type=args.race_type,
                                  body_shell_type=args.body_shell_type)
    while True:
        # poll for next racer
        if not manager.is_event_end and manager.current_racer is None:
            LOG.info("[virtual event worker] polling for next racer.")
            manager.poll_next_racer()

        # if event end signal received, break out loop and finish the job
        if manager.is_event_end:
            LOG.info("[virtual event worker] received event end.")
            break
        # Setting up the race environment
        if manager.setup_race():
            # proceed with start and finish race only if setup is successful.
            # Start race
            manager.start_race()
            # Finish race
            manager.finish_race()

    utils.cancel_simulation_job(
        os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
        rospy.get_param('AWS_REGION'))