Esempio n. 1
0
def construct_memory_params(json: dict):
    """Build memory-backend parameters from a configuration mapping.

    Only the ``'redispubsub'`` store type is recognised; any other
    ``store_type`` value yields ``None``.

    :param json: mapping that must contain ``store_type`` and, for the Redis
        pub/sub store, ``redis_address``, ``redis_port`` and ``run_type``.
        ``channel`` is optional and falls back to an empty string.
        (The name shadows the stdlib ``json`` module inside this function.)
    :return: a ``RedisPubSubMemoryBackendParameters`` instance, or ``None``
        when the store type is not ``'redispubsub'``.
    :raises KeyError: if a required key is missing from ``json``.
    """
    if json['store_type'] != 'redispubsub':
        return None

    return RedisPubSubMemoryBackendParameters(
        json['redis_address'],
        json['redis_port'],
        channel=json.get('channel', ''),
        run_type=json['run_type'],
    )
Esempio n. 2
0
def handle_distributed_coach_orchestrator(graph_manager, args):
    """Deploy a distributed Coach run on Kubernetes and stream the trainer logs.

    Builds the trainer and rollout-worker command lines from ``sys.argv``,
    configures the memory backend and data store selected in ``args``,
    deploys both roles through the ``Kubernetes`` orchestrator and then
    follows the trainer logs until they end or the user interrupts.

    :param graph_manager: not used by this function; kept so existing callers
        keep working.
    :param args: parsed command-line namespace. Reads ``experiment_name``,
        ``memory_backend``, ``data_store``, ``image``, ``num_workers`` and, for
        the S3 data store, ``s3_end_point``, ``s3_bucket_name`` and
        ``s3_creds_file``.
    :return: ``None``. Returns early, after printing a message, when setup or
        either deployment step reports failure.
    """
    ckpt_inside_container = "/checkpoint"
    forwarded_args = sys.argv[1:]
    rollout_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type',
                       str(RunType.ROLLOUT_WORKER)] + forwarded_args
    trainer_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type',
                       str(RunType.TRAINER)] + forwarded_args

    # Make sure both roles run under the same experiment name, even when the
    # user did not pass one explicitly on the command line.
    if '--experiment_name' not in rollout_command:
        rollout_command = rollout_command + ['--experiment_name', args.experiment_name]

    if '--experiment_name' not in trainer_command:
        trainer_command = trainer_command + ['--experiment_name', args.experiment_name]

    memory_backend_params = None
    if args.memory_backend == "redispubsub":
        memory_backend_params = RedisPubSubMemoryBackendParameters()

    ds_params_instance = None
    if args.data_store == "s3":
        ds_params = DataStoreParameters("s3", "", "")
        ds_params_instance = S3DataStoreParameters(ds_params=ds_params, end_point=args.s3_end_point,
                                                   bucket_name=args.s3_bucket_name,
                                                   creds_file=args.s3_creds_file,
                                                   checkpoint_dir=ckpt_inside_container)
    elif args.data_store == "nfs":
        ds_params = DataStoreParameters("nfs", "kubernetes", "")
        ds_params_instance = NFSDataStoreParameters(ds_params)

    worker_run_type_params = RunTypeParameters(args.image, rollout_command,
                                               run_type=str(RunType.ROLLOUT_WORKER),
                                               num_replicas=args.num_workers)
    trainer_run_type_params = RunTypeParameters(args.image, trainer_command,
                                                run_type=str(RunType.TRAINER))

    orchestration_params = KubernetesParameters([worker_run_type_params, trainer_run_type_params],
                                                kubeconfig='~/.kube/config',
                                                memory_backend_parameters=memory_backend_params,
                                                data_store_params=ds_params_instance)
    orchestrator = Kubernetes(orchestration_params)
    if not orchestrator.setup():
        print("Could not setup.")
        return

    if orchestrator.deploy_trainer():
        print("Successfully deployed trainer.")
    else:
        print("Could not deploy trainer.")
        return

    if orchestrator.deploy_worker():
        print("Successfully deployed rollout worker(s).")
    else:
        print("Could not deploy rollout worker(s).")
        return

    try:
        orchestrator.trainer_logs()
    except KeyboardInterrupt:
        # Ctrl-C only stops log streaming; the deployment is torn down below.
        pass
    finally:
        # Tear down even when log streaming fails with an unexpected error,
        # so a crash here does not leave the deployment running.
        orchestrator.undeploy()
Esempio n. 3
0
def main():
    """Parse the command line, stage files through S3 and start training.

    Steps, in order: sync the model metadata file, optionally fetch a custom
    environment and a custom preset from S3, fall back to the default graph
    manager when no custom preset is usable, publish this host's IP address,
    optionally download a pre-trained model, attach the Redis memory backend
    and the S3 data store to the graph manager, and finally hand over to
    ``training_worker``.
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()

    # (flags, keyword options) for every supported command-line argument.
    option_table = [
        (('-pk', '--preset_s3_key'),
         dict(help="(string) Name of a preset to download from S3",
              type=str,
              required=False)),
        (('-ek', '--environment_s3_key'),
         dict(help="(string) Name of an environment file to download from S3",
              type=str,
              required=False)),
        (('--model_metadata_s3_key',),
         dict(help="(string) Model Metadata File S3 Key",
              type=str,
              required=False)),
        (('-c', '--checkpoint-dir'),
         dict(help='(string) Path to a folder containing a checkpoint to write the model to.',
              type=str,
              default='./checkpoint')),
        (('--pretrained-checkpoint-dir',),
         dict(help='(string) Path to a folder for downloading a pre-trained model',
              type=str,
              default=PRETRAINED_MODEL_DIR)),
        (('--s3_bucket',),
         dict(help='(string) S3 bucket',
              type=str,
              default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))),
        (('--s3_prefix',),
         dict(help='(string) S3 prefix',
              type=str,
              default='sagemaker')),
        (('--framework',),
         dict(help='(string) tensorflow or mxnet',
              type=str,
              default='tensorflow')),
        (('--pretrained_s3_bucket',),
         dict(help='(string) S3 bucket for pre-trained model',
              type=str)),
        (('--pretrained_s3_prefix',),
         dict(help='(string) S3 prefix for pre-trained model',
              type=str,
              default='sagemaker')),
        (('--aws_region',),
         dict(help='(string) AWS region',
              type=str,
              default=os.environ.get("AWS_REGION", "us-east-1"))),
    ]
    for flags, options in option_table:
        parser.add_argument(*flags, **options)

    # Unrecognised arguments are tolerated and dropped.
    args, _ignored = parser.parse_known_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)

    # Fetch the model metadata, mirror it under the run prefix and keep a
    # copy in the model output directory.
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH, 'model_metadata.json')
    load_model_metadata(s3_client, args.model_metadata_s3_key, model_metadata_local_path)
    metadata_s3_key = os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix)
    s3_client.upload_file(metadata_s3_key, model_metadata_local_path)
    shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

    # Optional custom environment: download it, then re-upload it under the
    # run prefix.
    env_ok = False
    if args.environment_s3_key:
        env_local_path = "./markov/environments/deepracer_racetrack_env.py"
        env_ok = s3_client.download_file(s3_key=args.environment_s3_key,
                                         local_path=env_local_path)
        if env_ok:
            env_s3_key = os.path.normpath(
                "%s/environments/deepracer_racetrack_env.py" % args.s3_prefix)
            env_ok = s3_client.upload_file(s3_key=env_s3_key,
                                           local_path=env_local_path)
            if env_ok:
                print("Using environment: %s" % args.environment_s3_key)
        else:
            print("Could not download the environment file. Using the default DeepRacer environment.")

    # Import to register the environment with Gym
    import markov.environments

    # Optional custom preset: download, import its graph manager, re-upload.
    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        success_custom_preset = s3_client.download_file(s3_key=args.preset_s3_key,
                                                        local_path=preset_local_path)
        if success_custom_preset:
            graph_manager = short_dynamic_import("markov.presets.preset:graph_manager",
                                                 ignore_module_case=True)
            preset_s3_key = os.path.normpath("%s/presets/preset.py" % args.s3_prefix)
            success_custom_preset = s3_client.upload_file(s3_key=preset_s3_key,
                                                          local_path=preset_local_path)
            if success_custom_preset:
                print("Using preset: %s" % args.preset_s3_key)
        else:
            print("Could not download the preset file. Using the default DeepRacer preset.")

    # No usable custom preset: build the default graph manager from the
    # hyperparameters found in the SM_TRAINING_ENV environment variable.
    if not success_custom_preset:
        from markov.sagemaker_graph_manager import get_graph_manager
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        sm_hyperparams_dict = json.loads(params_blob)["hyperparameters"] if params_blob else {}
        graph_manager, robomaker_hyperparams_json = get_graph_manager(**sm_hyperparams_dict)
        s3_client.upload_hyperparameters(robomaker_hyperparams_json)
        print("Uploaded hyperparameters.json to S3")

    host_ip_address = get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    print("Uploaded IP address information to S3: %s" % host_ip_address)

    # A pre-trained model is only fetched when both bucket and prefix are set.
    use_pretrained_model = bool(args.pretrained_s3_bucket and args.pretrained_s3_prefix)
    if use_pretrained_model:
        pretrained_client = SageS3Client(bucket=args.pretrained_s3_bucket,
                                         s3_prefix=args.pretrained_s3_prefix,
                                         aws_region=args.aws_region)
        pretrained_client.download_model(args.pretrained_checkpoint_dir)

    # The trainer talks to a Redis instance on localhost; the S3 prefix
    # doubles as the pub/sub channel name.
    redis_params = RedisPubSubMemoryBackendParameters(redis_address="localhost",
                                                      redis_port=6379,
                                                      run_type='trainer',
                                                      channel=args.s3_prefix)
    graph_manager.agent_params.memory.register_var('memory_backend_params', redis_params)

    store_params = S3BotoDataStoreParameters(bucket_name=args.s3_bucket,
                                             checkpoint_dir=args.checkpoint_dir,
                                             aws_region=args.aws_region,
                                             s3_folder=args.s3_prefix)
    graph_manager.data_store_params = store_params

    # The data store and the graph manager hold references to each other.
    data_store = S3BotoDataStore(store_params)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    training_worker(graph_manager=graph_manager,
                    checkpoint_dir=args.checkpoint_dir,
                    use_pretrained_model=use_pretrained_model,
                    framework=args.framework)
Esempio n. 4
0
def handle_distributed_coach_orchestrator(args):
    """Deploy a distributed Coach run on Kubernetes and return its exit code.

    Builds the trainer and rollout-worker command lines from ``sys.argv``
    (dropping any ``--distributed_coach_run_type`` the user passed, since each
    role gets its own), configures the memory backend and data store selected
    in ``args``, deploys both roles and follows the trainer logs.

    :param args: parsed command-line namespace. Reads ``experiment_name``,
        ``experiment_path``, ``memory_backend``, ``data_store``, ``image``,
        ``num_workers``, ``dump_worker_logs`` and, for the S3 data store,
        ``s3_end_point``, ``s3_bucket_name`` and ``s3_creds_file``.
    :return: ``1`` when setup or a deployment step fails or when log
        streaming is interrupted with Ctrl-C; otherwise whatever
        ``orchestrator.trainer_logs()`` returns.
    """
    from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes, \
        RunTypeParameters

    ckpt_inside_container = "/checkpoint"
    arg_list = sys.argv[1:]
    try:
        i = arg_list.index('--distributed_coach_run_type')
    except ValueError:
        pass
    else:
        # Drop the flag and its value. Slice deletion also copes with the
        # flag being the last token, where a second pop() would raise.
        del arg_list[i:i + 2]

    trainer_command = [
        'python3', 'rl_coach/coach.py', '--distributed_coach_run_type',
        str(RunType.TRAINER)
    ] + arg_list
    rollout_command = [
        'python3', 'rl_coach/coach.py', '--distributed_coach_run_type',
        str(RunType.ROLLOUT_WORKER)
    ] + arg_list

    # Make sure both roles run under the same experiment name, even when the
    # user did not pass one explicitly on the command line.
    if '--experiment_name' not in rollout_command:
        rollout_command = rollout_command + [
            '--experiment_name', args.experiment_name
        ]

    if '--experiment_name' not in trainer_command:
        trainer_command = trainer_command + [
            '--experiment_name', args.experiment_name
        ]

    memory_backend_params = None
    if args.memory_backend == "redispubsub":
        memory_backend_params = RedisPubSubMemoryBackendParameters()

    ds_params_instance = None
    if args.data_store == "s3":
        ds_params = DataStoreParameters("s3", "", "")
        ds_params_instance = S3DataStoreParameters(
            ds_params=ds_params,
            end_point=args.s3_end_point,
            bucket_name=args.s3_bucket_name,
            creds_file=args.s3_creds_file,
            checkpoint_dir=ckpt_inside_container,
            expt_dir=args.experiment_path)
    elif args.data_store == "nfs":
        ds_params = DataStoreParameters("nfs", "kubernetes", "")
        ds_params_instance = NFSDataStoreParameters(ds_params)

    worker_run_type_params = RunTypeParameters(args.image,
                                               rollout_command,
                                               run_type=str(
                                                   RunType.ROLLOUT_WORKER),
                                               num_replicas=args.num_workers)
    trainer_run_type_params = RunTypeParameters(args.image,
                                                trainer_command,
                                                run_type=str(RunType.TRAINER))

    orchestration_params = KubernetesParameters(
        [worker_run_type_params, trainer_run_type_params],
        kubeconfig='~/.kube/config',
        memory_backend_parameters=memory_backend_params,
        data_store_params=ds_params_instance)
    orchestrator = Kubernetes(orchestration_params)
    if not orchestrator.setup():
        print("Could not setup.")
        return 1

    if orchestrator.deploy_trainer():
        print("Successfully deployed trainer.")
    else:
        print("Could not deploy trainer.")
        return 1

    if orchestrator.deploy_worker():
        print("Successfully deployed rollout worker(s).")
    else:
        print("Could not deploy rollout worker(s).")
        return 1

    if args.dump_worker_logs:
        screen.log_title("Dumping rollout worker logs in: {}".format(
            args.experiment_path))
        orchestrator.worker_logs(path=args.experiment_path)

    # Stays at 1 (failure) unless trainer_logs() completes and reports a code.
    exit_code = 1
    try:
        exit_code = orchestrator.trainer_logs()
    except KeyboardInterrupt:
        # Ctrl-C only stops log streaming; the deployment is torn down below.
        pass
    finally:
        # Tear down even when log streaming fails with an unexpected error,
        # so a crash here does not leave the deployment running.
        orchestrator.undeploy()
    return exit_code
def main():
    """Parse the command line, stage files through S3 and start training.

    Steps, in order: start the local Redis server, sync the model metadata
    file, try to load a custom preset from S3 and otherwise build the default
    graph manager from the ``SM_TRAINING_ENV`` hyperparameters, publish this
    host's IP address, optionally restore a pre-trained checkpoint from S3,
    attach the Redis memory backend and the S3 data store to the graph
    manager, and finally hand over to ``training_worker``.
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk', '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('-ek', '--environment_s3_key',
                        help="(string) Name of an environment file to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument('-c', '--checkpoint-dir',
                        help='(string) Path to a folder containing a checkpoint to write the model to.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--pretrained-checkpoint-dir',
                        help='(string) Path to a folder for downloading a pre-trained model',
                        type=str,
                        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    start_redis_server()

    # Unrecognised arguments are tolerated and dropped.
    args, _ = parser.parse_known_args()

    s3_client = SageS3Client(bucket=args.s3_bucket, s3_prefix=args.s3_prefix, aws_region=args.aws_region)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH, 'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key, model_metadata_local_path)
    s3_client.upload_file(os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix), model_metadata_local_path)
    shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

    # Sizes handed to training_worker(). They are set by whichever branch
    # below ends up supplying the graph manager, so the final call never
    # depends on a variable that only one of the branches defines.
    user_batch_size, user_episode_per_rollout = None, None

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        success_custom_preset = s3_client.download_file(s3_key=args.preset_s3_key, local_path=preset_local_path)
        if not success_custom_preset:
            logger.info("Could not download the preset file. Using the default DeepRacer preset.")
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location, ignore_module_case=True)
            success_custom_preset = s3_client.upload_file(
                s3_key=os.path.normpath("%s/presets/preset.py" % args.s3_prefix), local_path=preset_local_path)
            if success_custom_preset:
                # No hyperparameters JSON exists on this path, so take the
                # sizes from the preset's own agent parameters.
                # NOTE(review): assumes the preset module exposes
                # `agent_params` with these attributes - confirm.
                agent_params = short_dynamic_import("markov.presets.preset:agent_params",
                                                    ignore_module_case=True)
                user_batch_size = agent_params.network_wrappers['main'].batch_size
                user_episode_per_rollout = agent_params.algorithm.num_consecutive_playing_steps.num_steps
                logger.info("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}

        #! TODO each agent should have own config
        agent_config = {'model_metadata': model_metadata_local_path,
                        'car_ctrl_cnfig': {ConfigParams.LINK_NAME_LIST.value: [],
                                           ConfigParams.VELOCITY_LIST.value : {},
                                           ConfigParams.STEERING_LIST.value : {},
                                           ConfigParams.CHANGE_START.value : None,
                                           ConfigParams.ALT_DIR.value : None,
                                           ConfigParams.ACTION_SPACE_PATH.value : 'custom_files/model_metadata.json',
                                           ConfigParams.REWARD.value : None,
                                           ConfigParams.AGENT_NAME.value : 'racecar'}}

        agent_list = [create_training_agent(agent_config)]

        graph_manager, robomaker_hyperparams_json = get_graph_manager(sm_hyperparams_dict, agent_list)

        s3_client.upload_hyperparameters(robomaker_hyperparams_json)
        logger.info("Uploaded hyperparameters.json to S3")

        # Parse the hyperparameters once and keep the two values needed later.
        robomaker_hyperparams = json.loads(robomaker_hyperparams_json)
        user_batch_size = robomaker_hyperparams["batch_size"]
        user_episode_per_rollout = robomaker_hyperparams["num_episodes_between_training"]

    host_ip_address = utils.get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    logger.info("Uploaded IP address information to S3: %s" % host_ip_address)

    # Truthy only when both the pre-trained bucket and prefix are provided.
    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    if use_pretrained_model:
        # Handle backward compatibility
        _, _, version = parse_model_metadata(model_metadata_local_path)
        if float(version) < float(utils.SIMAPP_VERSION) and \
        not utils.has_current_ckpnt_name(args.pretrained_s3_bucket, args.pretrained_s3_prefix, args.aws_region):
            utils.make_compatible(args.pretrained_s3_bucket, args.pretrained_s3_prefix,
                                  args.aws_region, SyncFiles.TRAINER_READY.value)

        ds_params_instance_pretrained = S3BotoDataStoreParameters(aws_region=args.aws_region,
                                                                  bucket_name=args.pretrained_s3_bucket,
                                                                  checkpoint_dir=args.pretrained_checkpoint_dir,
                                                                  s3_folder=args.pretrained_s3_prefix)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained)
        data_store_pretrained.load_from_store()

    # The trainer talks to the Redis server on localhost; the S3 prefix
    # doubles as the pub/sub channel name.
    memory_backend_params = RedisPubSubMemoryBackendParameters(redis_address="localhost",
                                                               redis_port=6379,
                                                               run_type=str(RunType.TRAINER),
                                                               channel=args.s3_prefix)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(aws_region=args.aws_region,
                                                   bucket_name=args.s3_bucket,
                                                   checkpoint_dir=args.checkpoint_dir,
                                                   s3_folder=args.s3_prefix)
    graph_manager.data_store_params = ds_params_instance

    # The data store and the graph manager hold references to each other.
    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=user_batch_size,
        user_episode_per_rollout=user_episode_per_rollout
    )
def main():
    """Parse the command line, stage files through S3 and start training.

    Steps, in order: start the local Redis server, sync the model metadata
    file, register the Gym environment, try to load a custom preset from S3
    and otherwise build the default graph manager from the
    ``SM_TRAINING_ENV`` hyperparameters, publish this host's IP address,
    optionally download a pre-trained model, attach the S3 data store to the
    graph manager, and finally hand over to ``training_worker``.

    Any exception raised along the way is logged in JSON form and the
    process is shut down through ``utils.simapp_exit_gracefully()``.
    """
    screen.set_use_colors(False)

    try:
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '-pk',
            '--preset_s3_key',
            help="(string) Name of a preset to download from S3",
            type=str,
            required=False)
        parser.add_argument(
            '-ek',
            '--environment_s3_key',
            help="(string) Name of an environment file to download from S3",
            type=str,
            required=False)
        parser.add_argument('--model_metadata_s3_key',
                            help="(string) Model Metadata File S3 Key",
                            type=str,
                            required=False)
        parser.add_argument(
            '-c',
            '--checkpoint-dir',
            help=
            '(string) Path to a folder containing a checkpoint to write the model to.',
            type=str,
            default='./checkpoint')
        parser.add_argument(
            '--pretrained-checkpoint-dir',
            help=
            '(string) Path to a folder for downloading a pre-trained model',
            type=str,
            default=PRETRAINED_MODEL_DIR)
        parser.add_argument('--s3_bucket',
                            help='(string) S3 bucket',
                            type=str,
                            default=os.environ.get(
                                "SAGEMAKER_SHARED_S3_BUCKET_PATH",
                                "gsaur-test"))
        parser.add_argument('--s3_prefix',
                            help='(string) S3 prefix',
                            type=str,
                            default='sagemaker')
        parser.add_argument('--framework',
                            help='(string) tensorflow or mxnet',
                            type=str,
                            default='tensorflow')
        parser.add_argument('--pretrained_s3_bucket',
                            help='(string) S3 bucket for pre-trained model',
                            type=str)
        parser.add_argument('--pretrained_s3_prefix',
                            help='(string) S3 prefix for pre-trained model',
                            type=str,
                            default='sagemaker')
        parser.add_argument('--aws_region',
                            help='(string) AWS region',
                            type=str,
                            default=os.environ.get("AWS_REGION", "us-east-1"))

        # Unrecognised arguments are tolerated and dropped.
        args, _ = parser.parse_known_args()
        start_redis_server()

        s3_client = SageS3Client(bucket=args.s3_bucket,
                                 s3_prefix=args.s3_prefix,
                                 aws_region=args.aws_region)

        # Load the model metadata
        model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                                 'model_metadata.json')
        load_model_metadata(s3_client, args.model_metadata_s3_key,
                            model_metadata_local_path)
        s3_client.upload_file(
            os.path.normpath("%s/model/model_metadata.json" % args.s3_prefix),
            model_metadata_local_path)
        shutil.copy2(model_metadata_local_path, SM_MODEL_OUTPUT_DIR)

        # Register the gym environment, this will give clients the ability
        # to create the environment object
        register(id=defaults.ENV_ID,
                 entry_point=defaults.ENTRY_POINT,
                 max_episode_steps=defaults.MAX_STEPS,
                 reward_threshold=defaults.THRESHOLD)

        # Sizes handed to training_worker(); set by whichever branch below
        # ends up supplying the graph manager.
        user_batch_size, user_episode_per_rollout = None, None
        success_custom_preset = False
        if args.preset_s3_key:
            preset_local_path = "./markov/presets/preset.py"
            success_custom_preset = s3_client.download_file(
                s3_key=args.preset_s3_key, local_path=preset_local_path)
            if not success_custom_preset:
                logger.info(
                    "Could not download the preset file. Using the default DeepRacer preset."
                )
            else:
                preset_location = "markov.presets.preset:graph_manager"
                graph_manager = short_dynamic_import(preset_location,
                                                     ignore_module_case=True)
                success_custom_preset = s3_client.upload_file(
                    s3_key=os.path.normpath("%s/presets/preset.py" %
                                            args.s3_prefix),
                    local_path=preset_local_path)
                if success_custom_preset:
                    # With a custom preset the sizes come from the preset's
                    # own agent parameters.
                    agent_param_loc = "markov.presets.preset:agent_params"
                    agent_params = short_dynamic_import(
                        agent_param_loc, ignore_module_case=True)
                    user_batch_size = agent_params.network_wrappers[
                        'main'].batch_size
                    user_episode_per_rollout = agent_params.algorithm.num_consecutive_playing_steps.num_steps
                    logger.info("Using preset: %s" % args.preset_s3_key)

        if not success_custom_preset:
            from markov.sagemaker_graph_manager import get_graph_manager
            params_blob = os.environ.get('SM_TRAINING_ENV', '')
            if params_blob:
                params = json.loads(params_blob)
                sm_hyperparams_dict = params["hyperparameters"]
            else:
                sm_hyperparams_dict = {}
            graph_manager, robomaker_hyperparams_json = get_graph_manager(
                **sm_hyperparams_dict)
            s3_client.upload_hyperparameters(robomaker_hyperparams_json)
            logger.info("Uploaded hyperparameters.json to S3")

            # The sizes can only be read once get_graph_manager() has
            # produced the hyperparameters JSON. Parse it once and store
            # plain values, not a 1-tuple.
            robomaker_hyperparams = json.loads(robomaker_hyperparams_json)
            user_batch_size = robomaker_hyperparams["batch_size"]
            user_episode_per_rollout = robomaker_hyperparams[
                "num_episodes_between_training"]

        host_ip_address = get_ip_from_host()
        s3_client.write_ip_config(host_ip_address)
        logger.info("Uploaded IP address information to S3: %s" %
                    host_ip_address)

        # Truthy only when both the pre-trained bucket and prefix are given.
        use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
        if use_pretrained_model:
            s3_client_pretrained = SageS3Client(
                bucket=args.pretrained_s3_bucket,
                s3_prefix=args.pretrained_s3_prefix,
                aws_region=args.aws_region)
            s3_client_pretrained.download_model(args.pretrained_checkpoint_dir)

        # The trainer talks to the Redis server on localhost; the S3 prefix
        # doubles as the pub/sub channel name.
        memory_backend_params = RedisPubSubMemoryBackendParameters(
            redis_address="localhost",
            redis_port=6379,
            run_type='trainer',
            channel=args.s3_prefix)

        ds_params_instance = S3BotoDataStoreParameters(
            bucket_name=args.s3_bucket,
            checkpoint_dir=args.checkpoint_dir,
            aws_region=args.aws_region,
            s3_folder=args.s3_prefix)
        graph_manager.data_store_params = ds_params_instance

        # The data store and the graph manager hold references to each other.
        data_store = S3BotoDataStore(ds_params_instance)
        data_store.graph_manager = graph_manager
        graph_manager.data_store = data_store

        training_worker(graph_manager=graph_manager,
                        checkpoint_dir=args.checkpoint_dir,
                        use_pretrained_model=use_pretrained_model,
                        framework=args.framework,
                        memory_backend_params=memory_backend_params,
                        user_batch_size=user_batch_size,
                        user_episode_per_rollout=user_episode_per_rollout)
    except Exception as ex:
        # Top-level boundary: report the failure in JSON form, then exit
        # through the graceful-exit helper.
        utils.json_format_logger(
            "Training worker exited with exception: {}".format(ex),
            **utils.build_system_error_dict(
                utils.SIMAPP_TRAINING_WORKER_EXCEPTION,
                utils.SIMAPP_EVENT_ERROR_CODE_500))
        utils.simapp_exit_gracefully()
Esempio n. 7
0
def main():
    """Parse the command line, pull run configuration from S3 and start a rollout worker.

    Steps, in order: load the model metadata, read the Redis host address
    published in S3, download the hyperparameters file if present, pick the
    custom or default environment and preset, attach the Redis memory backend
    and the S3 data store to the graph manager, and finally hand over to
    ``rollout_worker``.

    :raises ValueError: when no custom environment file was downloaded and no
        reward-function S3 key was supplied.
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()

    # (flags, keyword options) for every supported command-line argument.
    # NOTE(review): --redis_ip and --redis_port are parsed but never read
    # below; the address comes from S3 and the port is fixed at 6379.
    option_table = [
        (('-c', '--checkpoint_dir'),
         dict(help='(string) Path to a folder containing a checkpoint to restore the model from.',
              type=str,
              default='./checkpoint')),
        (('--s3_bucket',),
         dict(help='(string) S3 bucket',
              type=str,
              default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET", "gsaur-test"))),
        (('--s3_prefix',),
         dict(help='(string) S3 prefix',
              type=str,
              default=os.environ.get("SAGEMAKER_SHARED_S3_PREFIX", "sagemaker"))),
        (('--num-workers',),
         dict(help="(int) The number of workers started in this pool",
              type=int,
              default=1)),
        (('-r', '--redis_ip'),
         dict(help="(string) IP or host for the redis server",
              default='localhost',
              type=str)),
        (('-rp', '--redis_port'),
         dict(help="(int) Port of the redis server",
              default=6379,
              type=int)),
        (('--aws_region',),
         dict(help='(string) AWS region',
              type=str,
              default=os.environ.get("APP_REGION", "us-east-1"))),
        (('--reward_file_s3_key',),
         dict(help='(string) Reward File S3 Key',
              type=str,
              default=os.environ.get("REWARD_FILE_S3_KEY", None))),
        (('--model_metadata_s3_key',),
         dict(help='(string) Model Metadata File S3 Key',
              type=str,
              default=os.environ.get("MODEL_METADATA_FILE_S3_KEY", None))),
    ]
    for flags, options in option_table:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)
    print("S3 bucket: %s" % args.s3_bucket)
    print("S3 prefix: %s" % args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH, 'model_metadata.json')
    load_model_metadata(s3_client, args.model_metadata_s3_key, model_metadata_local_path)

    # The Redis host address is read from S3 through the client.
    redis_ip = s3_client.get_ip()
    print("Received IP from SageMaker successfully: %s" % redis_ip)

    # Download hyperparameters from SageMaker; fall back to an empty dict
    # when the file is not available.
    hyperparams_s3_key = os.path.normpath(args.s3_prefix + "/ip/hyperparameters.json")
    got_hyperparams = s3_client.download_file(s3_key=hyperparams_s3_key,
                                              local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if got_hyperparams:
        print("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        print("SageMaker hyperparameters not found.")

    preset_file_success, environment_file_success = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    if environment_file_success:
        register_custom_environments()
        print("Using custom environment!")
    else:
        # Download reward function if environment file is not downloaded
        if not args.reward_file_s3_key:
            raise ValueError("Customer reward S3 key not supplied!")
        download_customer_reward_function(s3_client, args.reward_file_s3_key)
        import markov.environments
        print("Using default environment!")

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py") + ":graph_manager"
        graph_manager = short_dynamic_import(preset_location, ignore_module_case=True)
        print("Using custom preset file!")
    else:
        from markov.sagemaker_graph_manager import get_graph_manager
        graph_manager, _ = get_graph_manager(**sm_hyperparams_dict)

    # The S3 prefix doubles as the Redis pub/sub channel name.
    redis_params = RedisPubSubMemoryBackendParameters(redis_address=redis_ip,
                                                      redis_port=6379,
                                                      run_type='worker',
                                                      channel=args.s3_prefix)
    graph_manager.agent_params.memory.register_var('memory_backend_params', redis_params)

    store_params = S3BotoDataStoreParameters(bucket_name=args.s3_bucket,
                                             checkpoint_dir=args.checkpoint_dir,
                                             aws_region=args.aws_region,
                                             s3_folder=args.s3_prefix)

    # The data store and the graph manager hold references to each other.
    data_store = S3BotoDataStore(store_params)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    rollout_worker(
        graph_manager=graph_manager,
        checkpoint_dir=args.checkpoint_dir,
        data_store=data_store,
        num_workers=args.num_workers,
    )
def main():
    """Run a rollout worker that pairs with a SageMaker training job.

    Steps, in order:
      1. Parse the command line (several defaults come from environment
         variables).
      2. Call ``load_model_metadata`` and ``download_customer_reward_function``
         with the S3 client; exit through ``utils.simapp_exit_gracefully`` if
         no reward-function key was supplied.
      3. Register the gym environment described by ``defaults``.
      4. Obtain the redis host from ``s3_client.get_ip()`` and try to download
         ``<s3_prefix>/ip/hyperparameters.json``.
      5. Build the graph manager: a downloaded custom ``preset.py`` wins,
         otherwise ``get_graph_manager`` is called with the hyperparameters.
      6. Create the redis memory-backend parameters and the S3 data store and
         hand everything to ``rollout_worker``.

    Note: ``--redis_ip`` is accepted, but the redis address actually used is
    the one returned by ``s3_client.get_ip()``.
    """
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET",
                                               "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_PREFIX",
                                               "sagemaker"))
    parser.add_argument(
        '--num-workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=1)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("APP_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=os.environ.get("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=os.environ.get("MODEL_METADATA_FILE_S3_KEY",
                                               None))
    # The help text used to be a copy of the --aws_region one.
    parser.add_argument('--aws_endpoint_url',
                        help='(string) AWS endpoint URL',
                        type=str,
                        default=os.environ.get("AWS_ENDPOINT_URL", None))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region,
                             endpoint_url=args.aws_endpoint_url)
    logger.info("S3 bucket: %s", args.s3_bucket)
    logger.info("S3 prefix: %s", args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    load_model_metadata(s3_client, args.model_metadata_s3_key,
                        model_metadata_local_path)

    # Download reward function
    if not args.reward_file_s3_key:
        utils.json_format_logger(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket, args.s3_prefix),
            **utils.build_system_error_dict(
                utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
                utils.SIMAPP_EVENT_ERROR_CODE_500))
        # NOTE(review): no exception is being handled at this point, so this
        # call has no real traceback to print; kept to preserve the output.
        traceback.print_exc()
        utils.simapp_exit_gracefully()
    download_customer_reward_function(s3_client, args.reward_file_s3_key)

    # Register the gym environment; this gives clients the ability to create
    # the environment object.
    register(id=defaults.ENV_ID,
             entry_point=defaults.ENTRY_POINT,
             max_episode_steps=defaults.MAX_STEPS,
             reward_threshold=defaults.THRESHOLD)

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s", redis_ip)

    # Download hyperparameters from SageMaker
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        logger.info("SageMaker hyperparameters not found.")

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        from markov.sagemaker_graph_manager import get_graph_manager
        graph_manager, _ = get_graph_manager(**sm_hyperparams_dict)

    logger.info("Connecting to redis at %s:%d", redis_ip, args.redis_port)
    # Use the parsed --redis_port (default 6379) so the connection matches the
    # port that was just logged; it used to be hard-coded to 6379.
    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=args.redis_port,
        run_type='worker',
        channel=args.s3_prefix)

    logger.info("Connecting to s3 boto data store at %s",
                args.aws_endpoint_url)
    ds_params_instance = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        aws_region=args.aws_region,
        s3_folder=args.s3_prefix,
        aws_endpoint_url=args.aws_endpoint_url)

    # The data store and the graph manager hold references to each other.
    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    rollout_worker(graph_manager=graph_manager,
                   checkpoint_dir=args.checkpoint_dir,
                   data_store=data_store,
                   num_workers=args.num_workers,
                   memory_backend_params=memory_backend_params)
Esempio n. 9
0
def main():
    """Launch a rollout worker whose graph manager comes from a preset file.

    The preset is taken from the files downloaded by the data store when
    ``download_presets_if_present`` succeeds; otherwise it is looked up in the
    ``presets`` directory of the installed ``markov`` package. The worker
    talks to the trainer's redis (host obtained from ``data_store.get_ip()``)
    and waits for a checkpoint before starting rollouts.

    Raises:
        ValueError: if no preset was downloaded and no preset file name was
            given.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--markov-preset-file',
        type=str,
        default=os.environ.get("MARKOV_PRESET_FILE", "object_tracker.py"),
        help="(string) Name of a preset file to run in Markov's preset directory.")
    cli.add_argument(
        '-c', '--local-model-directory',
        type=str,
        default=os.environ.get("LOCAL_MODEL_DIRECTORY", "./checkpoint"),
        help='(string) Path to a folder containing a checkpoint to restore the model from.')
    cli.add_argument(
        '-n', '--num-rollout-workers',
        type=int,
        default=os.environ.get("NUMBER_OF_ROLLOUT_WORKERS", 1),
        help="(int) Number of workers for multi-process based agents, e.g. A3C")
    cli.add_argument(
        '--model-s3-bucket',
        type=str,
        default=os.environ.get("MODEL_S3_BUCKET"),
        help='(string) S3 bucket where trained models are stored. It contains model checkpoints.')
    cli.add_argument(
        '--model-s3-prefix',
        type=str,
        default=os.environ.get("MODEL_S3_PREFIX"),
        help='(string) S3 prefix where trained models are stored. It contains model checkpoints.')
    cli.add_argument(
        '--aws-region',
        type=str,
        default=os.environ.get("ROS_AWS_REGION", "us-west-2"),
        help='(string) AWS region')
    args = cli.parse_args()

    store_params = S3BotoDataStoreParameters(
        bucket_name=args.model_s3_bucket,
        s3_folder=args.model_s3_prefix,
        checkpoint_dir=args.local_model_directory,
        aws_region=args.aws_region)
    data_store = S3BotoDataStore(store_params)

    # The trainer machine publishes its IP; redis is reached at that address.
    redis_host = data_store.get_ip()
    print("Received IP from SageMaker successfully: %s" % redis_host)

    custom_preset_found = data_store.download_presets_if_present(PRESET_LOCAL_PATH)

    # Neither a downloaded preset nor a named Markov preset: nothing to run.
    if not custom_preset_found and not args.markov_preset_file:
        raise ValueError("Unable to determine preset file")

    if custom_preset_found:
        # Custom environments are fetched before the preset module is loaded.
        custom_env_found = data_store.download_environments_if_present(ENVIRONMENT_LOCAL_PATH)
        import_spec = PRESET_LOCAL_PATH + args.markov_preset_file + ":graph_manager"
        graph_manager = short_dynamic_import(import_spec, ignore_module_case=True)
        if custom_env_found:
            import robomaker.environments
        print("Using custom preset file!")
    else:
        markov_root = imp.find_module("markov")[1]
        import_spec = os.path.join(markov_root, "presets", args.markov_preset_file) + ":graph_manager"
        graph_manager = short_dynamic_import(import_spec, ignore_module_case=True)
        print("Using custom preset file from Markov presets directory!")

    backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_host,
        redis_port=TRAINER_REDIS_PORT,
        run_type='worker',
        channel=args.model_s3_prefix)
    graph_manager.agent_params.memory.register_var('memory_backend_params', backend_params)
    graph_manager.data_store_params = store_params
    graph_manager.data_store = data_store

    utils.wait_for_checkpoint(checkpoint_dir=args.local_model_directory, data_store=data_store)
    rollout_worker(graph_manager=graph_manager,
                   checkpoint_dir=args.local_model_directory,
                   data_store=data_store,
                   num_workers=args.num_rollout_workers)
def main():
    """Start the training worker.

    Parses the command line (unrecognised options are tolerated), loads the
    ``graph_manager`` from ``robomaker.presets.<RLCOACH_PRESET>``, publishes
    this host's IP through the S3 client, optionally downloads a pre-trained
    model, attaches the redis memory backend and the S3 data store to the
    graph manager, and finally calls ``training_worker``.
    """
    screen.set_use_colors(False)

    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-c', '--checkpoint-dir',
        type=str,
        default='./checkpoint',
        help='(string) Path to a local folder containing a checkpoint to write the model to.')
    # NOTE(review): this option is parsed but never read below; the
    # pre-trained model is downloaded to PRETRAINED_MODEL_DIR directly.
    cli.add_argument(
        '--pretrained-checkpoint-dir',
        type=str,
        default=PRETRAINED_MODEL_DIR,
        help='(string) Path to a local folder for downloading a pre-trained model')
    cli.add_argument(
        '--s3_bucket',
        type=str,
        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"),
        help='(string) S3 bucket')
    cli.add_argument(
        '--s3_prefix',
        type=str,
        default='sagemaker',
        help='(string) S3 prefix')
    cli.add_argument(
        '--framework',
        type=str,
        default='tensorflow',
        help='(string) tensorflow or mxnet')
    cli.add_argument(
        '--pretrained_s3_bucket',
        type=str,
        help='(string) S3 bucket for pre-trained model')
    cli.add_argument(
        '--pretrained_s3_prefix',
        type=str,
        default='sagemaker',
        help='(string) S3 prefix for pre-trained model')
    cli.add_argument(
        '--RLCOACH_PRESET',
        type=str,
        default='object_tracker',
        help='(string) Default preset to use')
    cli.add_argument(
        '--aws_region',
        type=str,
        required=True,
        help='(string) AWS region')

    # Options this script does not know about are accepted and discarded.
    args, _unrecognized = cli.parse_known_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)

    # Imported for its side effect: registers the environment with Gym.
    import robomaker.environments

    graph_manager = short_dynamic_import(
        "robomaker.presets.%s:graph_manager" % args.RLCOACH_PRESET,
        ignore_module_case=True)

    # Publish this host's address so that other workers can find it.
    host_ip_address = get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    print("Uploaded IP address information to S3: %s" % host_ip_address)

    # A pre-trained model is used only when both its bucket and prefix are set.
    use_pretrained_model = bool(args.pretrained_s3_bucket
                                and args.pretrained_s3_prefix)
    if use_pretrained_model:
        pretrained_client = SageS3Client(bucket=args.pretrained_s3_bucket,
                                         s3_prefix=args.pretrained_s3_prefix,
                                         aws_region=args.aws_region)
        pretrained_client.download_model(PRETRAINED_MODEL_DIR)

    backend_params = RedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type='trainer',
        channel=args.s3_prefix)
    graph_manager.agent_params.memory.register_var('memory_backend_params',
                                                   backend_params)

    store_params = S3BotoDataStoreParameters(
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        s3_folder=args.s3_prefix,
        aws_region=args.aws_region)
    graph_manager.data_store_params = store_params

    # The data store and the graph manager hold references to each other.
    data_store = S3BotoDataStore(store_params)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    training_worker(graph_manager=graph_manager,
                    checkpoint_dir=args.checkpoint_dir,
                    use_pretrained_model=use_pretrained_model,
                    framework=args.framework)
def main():
    """Run a rollout worker configured from ROS parameters and the CLI.

    Steps, in order:
      1. Parse the command line; most defaults are read with
         ``rospy.get_param``.
      2. Load the model metadata and the customer reward function, exiting
         through ``log_and_exit`` when the reward key is missing or the
         reward module cannot be imported.
      3. Configure the cameras and build the agent list (rollout agent,
         obstacles agent, bot-cars agent).
      4. Signal that the markov packages are ready and attach a
         ``PhaseObserver``.
      5. Assemble the ``S3Writer`` job list from the optional simtrace / mp4
         bucket parameters.
      6. Fetch the redis host and the hyperparameters, build the graph
         manager, attach the redis memory backend and S3 data store, and call
         ``rollout_worker``.

    Note: ``--redis_ip`` is accepted, but the redis address actually used is
    the one returned by ``s3_client.get_ip()``.
    """
    screen.set_use_colors(False)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX",
                                                "sagemaker"))
    parser.add_argument('--s3_endpoint_url',
                        help='(string) S3 endpoint URL',
                        type=str,
                        default=rospy.get_param("S3_ENDPOINT_URL", None))
    parser.add_argument(
        '--num-workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=1)
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY",
                                                None))
    # For training job, reset is not allowed. penalty_seconds, off_track_penalty, and
    # collision_penalty will all be 0 by default
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty second',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 0.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "TRAINING"))
    # type=bool would turn every non-empty string (including "False") into
    # True, so command-line values go through the same str2bool helper that
    # is already applied to the default.
    parser.add_argument('--is_continuous',
                        help='(boolean) is continous after lap completion',
                        type=utils.str2bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty second',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY",
                                                      0.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty second',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY",
                                                      0.0)))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region,
                             s3_endpoint_url=args.s3_endpoint_url)
    logger.info("S3 bucket: %s", args.s3_bucket)
    logger.info("S3 prefix: %s", args.s3_prefix)
    logger.info("S3 endpoint URL: %s", args.s3_endpoint_url)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key,
                              model_metadata_local_path)

    # Download and import reward function
    if not args.reward_file_s3_key:
        log_and_exit(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket, args.s3_prefix),
            SIMAPP_SIMULATION_WORKER_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500)
    download_customer_reward_function(s3_client, args.reward_file_s3_key)

    # Any failure while importing user-supplied code is reported as a 400.
    try:
        from custom_files.customer_reward_function import reward_function
    except Exception as e:
        log_and_exit("Failed to import user's reward_function: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)

    # Instantiate Cameras
    configure_camera(namespaces=['racecar'])

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    #! TODO each agent should have own config
    _, _, version = utils_parse_model_metadata.parse_model_metadata(
        model_metadata_local_path)
    agent_config = {
        'model_metadata': model_metadata_local_path,
        ConfigParams.CAR_CTRL_CONFIG.value: {
            ConfigParams.LINK_NAME_LIST.value:
            LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value:
            VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value:
            STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
            utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
            utils.str2bool(
                rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.ACTION_SPACE_PATH.value:
            'custom_files/model_metadata.json',
            ConfigParams.REWARD.value:
            reward_function,
            ConfigParams.AGENT_NAME.value:
            'racecar',
            ConfigParams.VERSION.value:
            version,
            ConfigParams.NUMBER_OF_RESETS.value:
            args.number_of_resets,
            ConfigParams.PENALTY_SECONDS.value:
            args.penalty_seconds,
            ConfigParams.NUMBER_OF_TRIALS.value:
            None,
            ConfigParams.IS_CONTINUOUS.value:
            args.is_continuous,
            ConfigParams.RACE_TYPE.value:
            args.race_type,
            ConfigParams.COLLISION_PENALTY.value:
            args.collision_penalty,
            ConfigParams.OFF_TRACK_PENALTY.value:
            args.off_track_penalty
        }
    }

    #! TODO each agent should have own s3 bucket
    # These ROS parameters are read without a default, so a missing one makes
    # rospy.get_param raise here.
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value:
        rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value:
        rospy.get_param('METRICS_S3_OBJECT_KEY'),
        MetricsS3Keys.ENDPOINT_URL.value:
        rospy.get_param('S3_ENDPOINT_URL', None),
        MetricsS3Keys.REGION.value:
        rospy.get_param('AWS_REGION'),
        MetricsS3Keys.STEP_BUCKET.value:
        rospy.get_param('SAGEMAKER_SHARED_S3_BUCKET'),
        MetricsS3Keys.STEP_KEY.value:
        os.path.join(rospy.get_param('SAGEMAKER_SHARED_S3_PREFIX'),
                     TRAINING_SIMTRACE_DATA_S3_OBJECT_KEY)
    }
    metrics_s3_model_cfg = {
        MetricsS3Keys.METRICS_BUCKET.value:
        args.s3_bucket,
        MetricsS3Keys.METRICS_KEY.value:
        os.path.join(args.s3_prefix, DEEPRACER_CHKPNT_KEY_SUFFIX),
        MetricsS3Keys.REGION.value:
        args.aws_region
    }
    run_phase_subject = RunPhaseSubject()

    agent_list = [
        create_rollout_agent(
            agent_config,
            TrainingMetrics('agent', metrics_s3_config, metrics_s3_model_cfg,
                            args.checkpoint_dir, run_phase_subject),
            run_phase_subject),
        create_obstacles_agent(),
        create_bot_cars_agent(),
    ]
    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)

    aws_region = rospy.get_param('AWS_REGION', args.aws_region)
    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET', None)

    # Each optional bucket contributes upload jobs only when it is configured;
    # its prefix parameter is then mandatory (no default is passed).
    s3_writer_job_info = []
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        s3_writer_job_info.append(
            IterationData(
                'simtrace', simtrace_s3_bucket, simtrace_s3_object_prefix,
                aws_region,
                os.path.join(
                    ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                    IterationDataLocalFileNames.SIM_TRACE_TRAINING_LOCAL_FILE.
                    value)))
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')
        # One job per camera view; they differ only in name and local file.
        mp4_local_files = (
            ('pip', IterationDataLocalFileNames.
             CAMERA_PIP_MP4_VALIDATION_LOCAL_PATH),
            ('45degree', IterationDataLocalFileNames.
             CAMERA_45DEGREE_MP4_VALIDATION_LOCAL_PATH),
            ('topview', IterationDataLocalFileNames.
             CAMERA_TOPVIEW_MP4_VALIDATION_LOCAL_PATH),
        )
        s3_writer_job_info.extend(
            IterationData(
                job_name, mp4_s3_bucket, mp4_s3_object_prefix, aws_region,
                os.path.join(ITERATION_DATA_LOCAL_FILE_PATH, 'agent',
                             local_file.value))
            for job_name, local_file in mp4_local_files)

    s3_writer = S3Writer(job_info=s3_writer_job_info,
                         s3_endpoint_url=args.s3_endpoint_url)

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s", redis_ip)

    # Download hyperparameters from SageMaker
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as filepointer:
            sm_hyperparams_dict = json.load(filepointer)
    else:
        logger.info("SageMaker hyperparameters not found.")

    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=run_phase_subject)

    # Use the parsed --redis_port (default 6379); it used to be hard-coded to
    # 6379, which silently ignored the option.
    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=args.redis_port,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_names={'agent': args.s3_bucket},
        base_checkpoint_dir=args.checkpoint_dir,
        s3_folders={'agent': args.s3_prefix},
        s3_endpoint_url=args.s3_endpoint_url)

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   num_workers=args.num_workers,
                   task_parameters=task_parameters,
                   s3_writer=s3_writer)
Esempio n. 12
0
def main():
    """Entry point of the rollout worker process.

    Parses the command-line options (several defaults are read from ROS
    parameters), then:

    1. creates the S3 client and loads the model metadata,
    2. downloads and imports the customer reward function,
    3. configures the cameras and fetches the redis address through the
       S3 client,
    4. downloads the SageMaker hyperparameters and any custom preset files,
    5. builds the agent list and the graph manager (custom preset if one was
       downloaded, otherwise ``get_graph_manager``),
    6. attaches the redis pub/sub memory backend parameters and the S3 data
       store to the graph manager, and
    7. hands control to ``rollout_worker``.
    """
    screen.set_use_colors(False)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_BUCKET",
                                                "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default=rospy.get_param("SAGEMAKER_SHARED_S3_PREFIX",
                                                "sagemaker"))
    # argparse maps the dash to an underscore, so this is read back as
    # ``args.num_workers``.
    parser.add_argument(
        '--num-workers',
        help="(int) The number of workers started in this pool",
        type=int,
        default=1)
    # NOTE(review): --redis_ip is parsed but not consumed below; the redis
    # address actually used comes from ``s3_client.get_ip()``. Confirm whether
    # the option is still needed.
    parser.add_argument('-r',
                        '--redis_ip',
                        help="(string) IP or host for the redis server",
                        default='localhost',
                        type=str)
    parser.add_argument('-rp',
                        '--redis_port',
                        help="(int) Port of the redis server",
                        default=6379,
                        type=int)
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--reward_file_s3_key',
                        help='(string) Reward File S3 Key',
                        type=str,
                        default=rospy.get_param("REWARD_FILE_S3_KEY", None))
    parser.add_argument('--model_metadata_s3_key',
                        help='(string) Model Metadata File S3 Key',
                        type=str,
                        default=rospy.get_param("MODEL_METADATA_FILE_S3_KEY",
                                                None))

    args = parser.parse_args()

    s3_client = SageS3Client(bucket=args.s3_bucket,
                             s3_prefix=args.s3_prefix,
                             aws_region=args.aws_region)
    logger.info("S3 bucket: %s" % args.s3_bucket)
    logger.info("S3 prefix: %s" % args.s3_prefix)

    # Load the model metadata
    model_metadata_local_path = os.path.join(CUSTOM_FILES_PATH,
                                             'model_metadata.json')
    utils.load_model_metadata(s3_client, args.model_metadata_s3_key,
                              model_metadata_local_path)

    # Download and import reward function
    if not args.reward_file_s3_key:
        utils.log_and_exit(
            "Reward function code S3 key not available for S3 bucket {} and prefix {}"
            .format(args.s3_bucket,
                    args.s3_prefix), utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
            utils.SIMAPP_EVENT_ERROR_CODE_500)
    download_customer_reward_function(s3_client, args.reward_file_s3_key)

    # The reward function is user-supplied code, so any failure while
    # importing it is reported through log_and_exit instead of propagating.
    try:
        from custom_files.customer_reward_function import reward_function
    except Exception as e:
        utils.log_and_exit(
            "Failed to import user's reward_function: {}".format(e),
            utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
            utils.SIMAPP_EVENT_ERROR_CODE_400)

    # Instantiate Cameras
    configure_camera()

    redis_ip = s3_client.get_ip()
    logger.info("Received IP from SageMaker successfully: %s" % redis_ip)

    # Download hyperparameters from SageMaker; fall back to an empty dict
    # when the file is not available.
    hyperparams_s3_key = os.path.normpath(args.s3_prefix +
                                          "/ip/hyperparameters.json")
    hyperparameters_file_success = s3_client.download_file(
        s3_key=hyperparams_s3_key, local_path="hyperparameters.json")
    sm_hyperparams_dict = {}
    if hyperparameters_file_success:
        logger.info("Received Sagemaker hyperparameters successfully!")
        with open("hyperparameters.json") as fp:
            sm_hyperparams_dict = json.load(fp)
    else:
        logger.info("SageMaker hyperparameters not found.")

    preset_file_success, _ = download_custom_files_if_present(
        s3_client, args.s3_prefix)

    #! TODO each agent should have own config
    _, _, version = utils_parse_model_metadata.parse_model_metadata(
        model_metadata_local_path)
    agent_config = {
        'model_metadata': model_metadata_local_path,
        'car_ctrl_cnfig': {
            ConfigParams.LINK_NAME_LIST.value:
            LINK_NAMES,
            ConfigParams.VELOCITY_LIST.value:
            VELOCITY_TOPICS,
            ConfigParams.STEERING_LIST.value:
            STEERING_TOPICS,
            ConfigParams.CHANGE_START.value:
            utils.str2bool(rospy.get_param('CHANGE_START_POSITION', True)),
            ConfigParams.ALT_DIR.value:
            utils.str2bool(
                rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
            ConfigParams.ACTION_SPACE_PATH.value:
            'custom_files/model_metadata.json',
            ConfigParams.REWARD.value:
            reward_function,
            ConfigParams.AGENT_NAME.value:
            'racecar',
            ConfigParams.VERSION.value:
            version
        }
    }

    #! TODO each agent should have own s3 bucket
    # These ROS parameters are read without defaults.
    metrics_s3_config = {
        MetricsS3Keys.METRICS_BUCKET.value:
        rospy.get_param('METRICS_S3_BUCKET'),
        MetricsS3Keys.METRICS_KEY.value:
        rospy.get_param('METRICS_S3_OBJECT_KEY'),
        MetricsS3Keys.REGION.value:
        rospy.get_param('AWS_REGION'),
        MetricsS3Keys.STEP_BUCKET.value:
        rospy.get_param('SAGEMAKER_SHARED_S3_BUCKET'),
        MetricsS3Keys.STEP_KEY.value:
        os.path.join(rospy.get_param('SAGEMAKER_SHARED_S3_PREFIX'),
                     TRAINING_SIMTRACE_DATA_S3_OBJECT_KEY)
    }

    agent_list = list()
    agent_list.append(
        create_rollout_agent(agent_config, TrainingMetrics(metrics_s3_config)))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())

    # A downloaded custom preset takes precedence over the graph manager
    # built from the SageMaker hyperparameters.
    if preset_file_success:
        preset_location = os.path.join(CUSTOM_FILES_PATH, "preset.py")
        preset_location += ":graph_manager"
        graph_manager = short_dynamic_import(preset_location,
                                             ignore_module_case=True)
        logger.info("Using custom preset file!")
    else:
        graph_manager, _ = get_graph_manager(sm_hyperparams_dict, agent_list)

    # Honour --redis_port instead of a hard-coded literal; the option's
    # default (6379) keeps the previous behaviour when it is not supplied.
    memory_backend_params = RedisPubSubMemoryBackendParameters(
        redis_address=redis_ip,
        redis_port=args.redis_port,
        run_type=str(RunType.ROLLOUT_WORKER),
        channel=args.s3_prefix)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(
        aws_region=args.aws_region,
        bucket_name=args.s3_bucket,
        checkpoint_dir=args.checkpoint_dir,
        s3_folder=args.s3_prefix)

    # The data store and the graph manager hold references to each other.
    data_store = S3BotoDataStore(ds_params_instance)
    data_store.graph_manager = graph_manager
    graph_manager.data_store = data_store

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.checkpoint_dir

    rollout_worker(graph_manager=graph_manager,
                   data_store=data_store,
                   num_workers=args.num_workers,
                   task_parameters=task_parameters)