Ejemplo n.º 1
0
def handle_distributed_coach_orchestrator(graph_manager, args):
    ckpt_inside_container = "/checkpoint"
    rollout_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type', str(RunType.ROLLOUT_WORKER)] + sys.argv[1:]
    trainer_command = ['python3', 'rl_coach/coach.py', '--distributed_coach_run_type', str(RunType.TRAINER)] + sys.argv[1:]

    if '--experiment_name' not in rollout_command:
        rollout_command = rollout_command + ['--experiment_name', args.experiment_name]

    if '--experiment_name' not in trainer_command:
        trainer_command = trainer_command + ['--experiment_name', args.experiment_name]

    memory_backend_params = None
    if args.memory_backend == "redispubsub":
        memory_backend_params = RedisPubSubMemoryBackendParameters()

    ds_params_instance = None
    if args.data_store == "s3":
        ds_params = DataStoreParameters("s3", "", "")
        ds_params_instance = S3DataStoreParameters(ds_params=ds_params, end_point=args.s3_end_point, bucket_name=args.s3_bucket_name,
                                                   creds_file=args.s3_creds_file, checkpoint_dir=ckpt_inside_container)
    elif args.data_store == "nfs":
        ds_params = DataStoreParameters("nfs", "kubernetes", "")
        ds_params_instance = NFSDataStoreParameters(ds_params)

    worker_run_type_params = RunTypeParameters(args.image, rollout_command, run_type=str(RunType.ROLLOUT_WORKER), num_replicas=args.num_workers)
    trainer_run_type_params = RunTypeParameters(args.image, trainer_command, run_type=str(RunType.TRAINER))

    orchestration_params = KubernetesParameters([worker_run_type_params, trainer_run_type_params],
                                                kubeconfig='~/.kube/config',
                                                memory_backend_parameters=memory_backend_params,
                                                data_store_params=ds_params_instance)
    orchestrator = Kubernetes(orchestration_params)
    if not orchestrator.setup():
        print("Could not setup.")
        return

    if orchestrator.deploy_trainer():
        print("Successfully deployed trainer.")
    else:
        print("Could not deploy trainer.")
        return

    if orchestrator.deploy_worker():
        print("Successfully deployed rollout worker(s).")
    else:
        print("Could not deploy rollout worker(s).")
        return

    try:
        orchestrator.trainer_logs()
    except KeyboardInterrupt:
        pass

    orchestrator.undeploy()
Ejemplo n.º 2
0
def handle_distributed_coach_orchestrator(args):
    from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, Kubernetes, \
        RunTypeParameters

    ckpt_inside_container = "/checkpoint"
    arg_list = sys.argv[1:]
    try:
        i = arg_list.index('--distributed_coach_run_type')
        arg_list.pop(i)
        arg_list.pop(i)
    except ValueError:
        pass

    trainer_command = [
        'python3', 'rl_coach/coach.py', '--distributed_coach_run_type',
        str(RunType.TRAINER)
    ] + arg_list
    rollout_command = [
        'python3', 'rl_coach/coach.py', '--distributed_coach_run_type',
        str(RunType.ROLLOUT_WORKER)
    ] + arg_list

    if '--experiment_name' not in rollout_command:
        rollout_command = rollout_command + [
            '--experiment_name', args.experiment_name
        ]

    if '--experiment_name' not in trainer_command:
        trainer_command = trainer_command + [
            '--experiment_name', args.experiment_name
        ]

    memory_backend_params = None
    if args.memory_backend == "redispubsub":
        memory_backend_params = RedisPubSubMemoryBackendParameters()

    ds_params_instance = None
    if args.data_store == "s3":
        ds_params = DataStoreParameters("s3", "", "")
        ds_params_instance = S3DataStoreParameters(
            ds_params=ds_params,
            end_point=args.s3_end_point,
            bucket_name=args.s3_bucket_name,
            creds_file=args.s3_creds_file,
            checkpoint_dir=ckpt_inside_container,
            expt_dir=args.experiment_path)
    elif args.data_store == "nfs":
        ds_params = DataStoreParameters("nfs", "kubernetes", "")
        ds_params_instance = NFSDataStoreParameters(ds_params)

    worker_run_type_params = RunTypeParameters(args.image,
                                               rollout_command,
                                               run_type=str(
                                                   RunType.ROLLOUT_WORKER),
                                               num_replicas=args.num_workers)
    trainer_run_type_params = RunTypeParameters(args.image,
                                                trainer_command,
                                                run_type=str(RunType.TRAINER))

    orchestration_params = KubernetesParameters(
        [worker_run_type_params, trainer_run_type_params],
        kubeconfig='~/.kube/config',
        memory_backend_parameters=memory_backend_params,
        data_store_params=ds_params_instance)
    orchestrator = Kubernetes(orchestration_params)
    if not orchestrator.setup():
        print("Could not setup.")
        return 1

    if orchestrator.deploy_trainer():
        print("Successfully deployed trainer.")
    else:
        print("Could not deploy trainer.")
        return 1

    if orchestrator.deploy_worker():
        print("Successfully deployed rollout worker(s).")
    else:
        print("Could not deploy rollout worker(s).")
        return 1

    if args.dump_worker_logs:
        screen.log_title("Dumping rollout worker logs in: {}".format(
            args.experiment_path))
        orchestrator.worker_logs(path=args.experiment_path)

    exit_code = 1
    try:
        exit_code = orchestrator.trainer_logs()
    except KeyboardInterrupt:
        pass

    orchestrator.undeploy()
    return exit_code