def train_copos(args):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(args.log_path)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(args.log_path, format_strs=[])

    workerseed = args.seed + 10000 * rank  # rank is 0 when MPI is unavailable

    def policy_fn(name, ob_space, ac_space):
        return CompatibleMlpPolicy(name=name,
                                   ob_space=ob_space,
                                   ac_space=ac_space,
                                   hid_size=32,
                                   num_hid_layers=2)

    set_global_seeds(workerseed)
    env = build_env(args, normalize_ob=True)
    #env = gym.make(args.env)
    #env.seed(workerseed)

    timesteps_per_batch = 10000
    #timesteps_per_batch=2048
    beta = -1  # any beta < 0 requests the automatic choice computed below
    if beta < 0:
        nr_episodes = int(args.num_timesteps) // timesteps_per_batch
        # Automatically compute beta based on initial entropy and number of iterations
        tmp_pi = policy_fn("tmp_pi", env.observation_space, env.action_space)

        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob})
        beta = 2 * entropy / nr_episodes
        print("Initial entropy: " + str(entropy) + ", episodes: " +
              str(nr_episodes))
        print("Automatically set beta: " + str(beta))
    copos_mpi.learn(env,
                    policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    epsilon=0.01,
                    beta=beta,
                    cg_iters=10,
                    cg_damping=0.1,
                    max_timesteps=int(args.num_timesteps),
                    gamma=0.99,
                    lam=0.98,
                    vf_iters=5,
                    vf_stepsize=1e-3)
    env.close()
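
# A minimal, self-contained sketch of the beta heuristic used above: beta is
# set to twice the initial policy entropy divided by the number of batches.
# The unit-std diagonal-Gaussian entropy and the action dimension below are
# assumptions for illustration only, not values from the original run.
import numpy as np

action_dim = 6                                       # assumed action dimension
H0 = 0.5 * action_dim * np.log(2.0 * np.pi * np.e)   # entropy of a unit Gaussian, ~8.51 nats
nr_episodes = int(1e6) // 10000                      # num_timesteps // timesteps_per_batch
beta = 2 * H0 / nr_episodes                          # ~0.17 for this example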
Example 2
def main(args):
    # configure logger, disable logging in child MPI processes (with rank > 0)

    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args(args)
    extra_args = parse_cmdline_kwargs(unknown_args)
    print(args)

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        #rank = 0
        #logger.configure()
        #logger.configure(dir=extra_args['logdir'])
        rank = 0
        configure_logger(args.log_path)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(args.log_path, format_strs=[])

    model, env = train(args, extra_args)

    if args.save_path is not None and rank == 0:
        save_path = osp.expanduser(args.save_path)
        model.save(save_path)

    if args.play:
        logger.log("Running trained model")
        obs = env.reset()

        state = model.initial_state if hasattr(model,
                                               'initial_state') else None
        dones = np.zeros((1, ))

        episode_rew = 0
        while True:
            if state is not None:
                actions, _, state, _ = model.step(obs, S=state, M=dones)
            else:
                actions, _, _, _ = model.step(obs)

            obs, rew, done, _ = env.step(actions)
            episode_rew += rew[0] if isinstance(env, VecEnv) else rew
            env.render()
            done = done.any() if isinstance(done, np.ndarray) else done
            if done:
                print('episode_rew={}'.format(episode_rew))
                episode_rew = 0
                obs = env.reset()
    env.close()
    return model
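
# One way to launch this entry point, assuming a baselines-style CLI (the
# exact flags depend on what common_arg_parser exposes; these are typical):
#
#   python run.py --alg=ppo2 --env=CartPole-v1 --num_timesteps=1e5 --play
#
# and a matching __main__ guard that forwards sys.argv, as baselines does:
if __name__ == '__main__':
    import sys
    main(sys.argv)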
Example 3
def nm_main(env, env_type, seed, alg, num_timesteps, network, gamestate,
            num_env, reward_scale, save_path, save_video_interval,
            save_video_length, play, log_path, env_args, alg_args):

    bl_args = bl_arg_class()
    bl_args.env = env
    bl_args.env_type = env_type
    bl_args.seed = seed
    bl_args.alg = alg
    bl_args.num_timesteps = num_timesteps
    bl_args.network = network
    bl_args.gamestate = gamestate
    bl_args.num_env = num_env
    bl_args.reward_scale = reward_scale
    bl_args.save_path = save_path
    bl_args.save_video_interval = save_video_interval
    bl_args.save_video_length = save_video_length
    bl_args.log_path = log_path
    bl_args.play = play
    bl_args.env_args = env_args

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        blr.configure_logger(bl_args.log_path)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        blr.configure_logger(bl_args.log_path, format_strs=[])

    model, env = blr.train(bl_args, alg_args)

    #if save_path is not None and rank == 0:
    #    save_path = os.path.expanduser(save_path)
    #    model.save(save_path)

    env.close()

    print('\nThe very, very, very last line...\n')
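
# A hedged example call; every argument value below is an assumption chosen
# only to show the expected shapes (bl_arg_class and blr come from the
# surrounding module and are not defined here):
nm_main(env='CartPole-v1', env_type=None, seed=0, alg='ppo2',
        num_timesteps=1e5, network='mlp', gamestate=None, num_env=1,
        reward_scale=1.0, save_path=None, save_video_interval=0,
        save_video_length=200, play=False, log_path='/tmp/nm_logs',
        env_args=None, alg_args={})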
Example 4
def main(args):
    # print("\n\n\n\n\nXXX")
    # print(sys.path)
    # import baselines
    # print(baselines.__file__())
    # for varname in ['PMI_RANK', 'OMPI_COMM_WORLD_RANK']:
    #     if varname in os.environ:
    #         print(varname, int(os.environ[varname]))
    # print("parsing args...")

    arg_parser = init_arg_parser()
    args, unknown_args = arg_parser.parse_known_args(args)

    # if args.num_cpu > 1:
    if args.allow_run_as_root:
        whoami = mpi_fork_run_as_root(args.num_cpu,
                                      bind_to_core=args.bind_to_core)
    else:
        whoami = mpi_fork(args.num_cpu, bind_to_core=args.bind_to_core)
    if whoami == 'parent':
        print('parent exiting with code 0...')
        sys.exit(0)

    U.single_threaded_session().__enter__()

    rank = MPI.COMM_WORLD.Get_rank()

    # assert MPI.COMM_WORLD.Get_size() == args.num_cpu, MPI.COMM_WORLD.Get_size()

    # configure logger
    # rank = MPI.COMM_WORLD.Get_rank()  # FIXME: how to log when rank != 0??
    # if rank == 0:
    configure_logger(args.log_path, format_strs=[])
    logger.info(f"main: {rank} / {MPI.COMM_WORLD.Get_size()}")
    logger.info(f"logger dir: {logger.get_dir()}")

    extra_args = parse_cmdline_kwargs(unknown_args)
    logger.info(args, extra_args)

    # else:
    #     configure_logger(log_path=None)  # or still args.log_path?

    # raise RuntimeError(f"tf session: {tf.get_default_session()}, {MPI.COMM_WORLD.Get_rank()} / {MPI.COMM_WORLD.Get_size()}")

    def make_wrapped_env():
        env = gym.make(args.env)
        if args.env_type == 'maze':
            pass
        elif args.env_type == 'robotics':
            from baselines.envs.goal_sampler_env_wrapper import GoalSamplerEnvWrapper
            env = GoalSamplerEnvWrapper(env)
        elif args.env_type == 'ant':
            env = GoalExplorationEnv(env=env,
                                     only_feasible=True,
                                     extend_dist_rew=0,
                                     inner_weight=0,
                                     goal_weight=1)
        else:
            raise NotImplementedError(args.env_type)
        # FIXME: if resample space is feasible, can set only_feasible = False to avoid unnecessary computation
        return env

    venv_kwargs = dict(
        make_wrapped_env=make_wrapped_env,
        seed=args.seed,
        reward_scale=args.reward_scale,
        flatten_dict_observations=False,
        mpi_rank=rank,
        monitor_log_dir=args.log_path,  # FIXME
    )
    venv = make_vec_env(num_env=args.num_env, **venv_kwargs)
    eval_venv = make_vec_env(num_env=args.num_env, **venv_kwargs)
    if args.debug:
        plotter_venv = make_vec_env(num_env=1, **venv_kwargs)
    else:
        plotter_venv = None

    # Seed everything.
    rank_seed = args.seed + 1000000 * rank if args.seed is not None else None
    set_global_seeds(rank_seed)
    logger.info(f'setting global seed: {rank_seed}')

    # Prepare params.
    params = dict()
    params.update(config.DEFAULT_PARAMS)
    params.update(config.DEFAULT_ENV_PARAMS.get(args.env, {}))  # no-op for envs without per-env defaults
    params.update(**extra_args)  # makes it possible to override any parameter

    # if args.debug:
    #     params['n_cycles'] = 2
    #     params['n_batches'] = 2
    #     params['ve_n_batches'] = 2
    #     params['size_ensemble'] = 2

    # env settings
    params['env_name'] = args.env
    params['num_cpu'] = args.num_cpu
    params['rollout_batch_size'] = args.num_env
    params['timesteps_per_cpu'] = int(args.num_timesteps)

    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)

    params['make_env'] = make_wrapped_env

    learn_fun_return = learn(
        venv=venv,
        eval_venv=eval_venv,
        plotter_venv=plotter_venv,
        params=params,
        save_path=args.log_path,
        save_interval=args.save_interval,
    )

    if rank == 0:
        save_path = os.path.expanduser(logger.get_dir())
        for k, v in learn_fun_return.items():
            v.save(os.path.join(save_path, f"final-{k}.joblib"))

    venv.close()
    eval_venv.close()
    if plotter_venv is not None:
        plotter_venv.close()
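
# A hedged way to invoke this entry point; the flag names are inferred from
# the attributes read off `args` above (init_arg_parser may spell them
# differently), and the environment id is only a placeholder. mpi_fork
# re-launches the script under MPI when num_cpu > 1, the parent process exits
# immediately, and each worker derives its own seed from its rank
# (seed + 1000000 * rank); with num_cpu=1 no re-launch happens.
main(['--env', 'FetchReach-v1',
      '--env_type', 'robotics',
      '--num_cpu', '1',
      '--num_env', '1',
      '--num_timesteps', '1e5',
      '--log_path', '/tmp/gher_logs'])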