Example #1
def train(logdir, env, expert_path, seed, batch_size, lr, traj_limitation):
    env_id = env
    logdir = logdir + '/bc/' + env_id + '/s-{}/l-{}-b-{}/seed-{}'.format(
        traj_limitation, lr, batch_size, seed)
    print(logdir, env, expert_path, seed)
    logger.configure(logdir,
                     format_strs=['stdout', 'log', 'json', 'tensorboard'])
    expert = MADataSet(expert_path,
                       ret_threshold=-10,
                       traj_limitation=traj_limitation)

    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir()
                                and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    env = SubprocVecEnv([create_env(i) for i in range(1)], is_multi_agent=True)

    policy_fn = CategoricalPolicy
    learn(policy_fn, env, expert, seed, int(2e7), batch_size=batch_size, lr=lr)
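A minimal invocation sketch for the behavior-cloning trainer above; the environment name, expert-trajectory path, and hyperparameter values are hypothetical placeholders that simply fill the parameters in the signature.

if __name__ == '__main__':
    # All values below are illustrative placeholders.
    train(logdir='./results',
          env='simple_spread',
          expert_path='./expert_trajs.pkl',
          seed=1,
          batch_size=1000,
          lr=1e-4,
          traj_limitation=200)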
Example #2
def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed,
          num_cpu):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir()
                                and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    logger.configure(logdir,
                     format_strs=['stdout', 'log', 'json', 'tensorboard'])

    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)],
                        is_multi_agent=True)
    policy_fn = CategoricalPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu,
          nsteps=timesteps_per_batch // num_cpu,
          lr=lr,
          ent_coef=0.00,
          identical=make_env.get_identical(env_id))
    env.close()
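The create_env(rank) closure above exists so that SubprocVecEnv receives callables rather than live environments: each worker process calls its thunk to build and seed its own copy. A stripped-down sketch of that pattern, assuming the classic gym env.seed API used in these examples and leaving out this repository's make_env and Monitor wrappers:

import gym

def create_env(env_id, seed, rank):
    def _thunk():
        # Runs inside the worker process, so each process gets its own
        # environment instance with a distinct seed.
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _thunk

# The vectorized wrapper is handed thunks; environment construction is
# deferred until each subprocess invokes the callable it was given.
env_fns = [create_env('CartPole-v1', seed=0, rank=i) for i in range(4)]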
Example #3
def train(logdir,
          env_id,
          num_timesteps,
          lr,
          timesteps_per_batch,
          seed,
          num_cpu,
          expert_path,
          traj_limitation,
          ret_threshold,
          dis_lr,
          disc_type='decentralized',
          bc_iters=500,
          l2=0.1,
          d_iters=1,
          rew_scale=0.1):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir()
                                and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    logger.configure(logdir,
                     format_strs=['stdout', 'log', 'json', 'tensorboard'])

    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)],
                        is_multi_agent=True)
    print(num_cpu)
    policy_fn = CategoricalPolicy
    expert = MADataSet(expert_path,
                       ret_threshold=ret_threshold,
                       traj_limitation=traj_limitation,
                       nobs_flag=True)
    learn(policy_fn,
          expert,
          env,
          env_id,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu,
          nsteps=timesteps_per_batch // num_cpu,
          lr=lr,
          ent_coef=0.0,
          dis_lr=dis_lr,
          disc_type=disc_type,
          bc_iters=bc_iters,
          identical=make_env.get_identical(env_id),
          l2=l2,
          d_iters=d_iters,
          rew_scale=rew_scale)
    env.close()
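A hedged usage sketch for the adversarial imitation trainer above; every value is a placeholder chosen only to exercise the required parameters of the signature, with the optional arguments left at the defaults shown.

if __name__ == '__main__':
    # Placeholder values; optional arguments keep the defaults above.
    train(logdir='./results',
          env_id='simple_spread',
          num_timesteps=5e7,
          lr=0.1,
          timesteps_per_batch=1000,
          seed=1,
          num_cpu=8,
          expert_path='./expert_trajs.pkl',
          traj_limitation=200,
          ret_threshold=-10,
          dis_lr=0.1)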
def main():
    parser = mujoco_arg_parser()
    parser.add_argument('--cpu', type=int, default=1)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--batch', type=int, default=2048)
    args = parser.parse_args()
    logdir = './results/mappo/' + args.env + '/l-{}-b-{}/seed-{}'.format(
        args.lr, args.batch, args.seed)
    try:
        logger.configure(logdir,
                         format_strs=['stdout', 'log', 'json', 'tensorboard'])
    except Exception:
        logger.configure()
    train(args.env,
          num_timesteps=1e7,
          seed=args.seed,
          num_cpu=args.cpu,
          batch=args.batch,
          lr=args.lr)
Example #5
def train(logdir,
          env_id,
          lr,
          num_timesteps,
          seed,
          timesteps_per_batch,
          cont=False):
    from sandbox.ppo_sgd import mlp_policy
    from sandbox.ppo_sgd import pposgd_simple
    from rl import logger
    from rl.common import set_global_seeds, tf_util as U
    from rl import bench

    from gym.envs.registration import register
    import multiagent
    import make_env

    logger.configure(logdir, format_strs=['log', 'json', 'tensorboard'])
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = make_env.make_env(env_id)

    def policy_fn(name, ob_space, ac_space, id):
        pi = mlp_policy.MlpPolicy(name=name,
                                  ob_space=ob_space,
                                  ac_space=ac_space,
                                  hid_size=64,
                                  num_hid_layers=2,
                                  id=id)
        return pi

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=timesteps_per_batch,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=lr,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        cont=cont)
    env.close()
    return None
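An illustrative call for the single-environment PPO trainer above; the environment id and hyperparameters are placeholders, and cont is left at its default of False.

if __name__ == '__main__':
    # Placeholder values; cont keeps its default of False.
    train(logdir='./results',
          env_id='simple',
          lr=3e-4,
          num_timesteps=int(1e7),
          seed=1,
          timesteps_per_batch=2048)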
Example #6
def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed,
          num_cpu, max_episode_len):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id, max_episode_len=max_episode_len)
            env.discrete_action_input = True
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir()
                                and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    logger.configure(logdir, format_strs=['json'])

    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)],
                        is_multi_agent=True)
    policy_fn = CategoricalPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu,
          nsteps=timesteps_per_batch // num_cpu,
          lr=lr,
          ent_coef=0.00,
          identical=make_env.get_identical(env_id),
          log_interval=50,
          save_interval=int(num_timesteps / timesteps_per_batch),
          max_episode_len=max_episode_len)
    logger.Logger.CURRENT.close()
    env.close()
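As with the earlier variants, a hypothetical invocation; max_episode_len is forwarded to make_env above, and the remaining values are placeholders.

if __name__ == '__main__':
    # Placeholder values only.
    train(logdir='./results',
          env_id='simple_spread',
          num_timesteps=1e7,
          lr=1e-4,
          timesteps_per_batch=1000,
          seed=1,
          num_cpu=8,
          max_episode_len=25)

With these placeholder numbers, the save_interval computed above evaluates to int(1e7 / 1000) = 10000.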
Example #7
def train(logdir, env, expert_path, seed, max_episode_len):
    print(logdir, env, expert_path, seed, max_episode_len)
    logger.configure(logdir,
                     format_strs=['stdout', 'log', 'json', 'tensorboard'])
    expert = MADataSet(expert_path, ret_threshold=-10, traj_limitation=200)
    env_id = env

    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id, max_episode_len=max_episode_len)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir()
                                and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    env = SubprocVecEnv([create_env(i) for i in range(1)], is_multi_agent=True)

    policy_fn = CategoricalPolicy
    learn(policy_fn, env, expert, seed)
def train_trpo(game, num_timesteps, eval_episodes, seed, horizon, out_dir='.', load_path=None, checkpoint_path_in=None,
               gamma=0.99, timesteps_per_batch=500, num_layers=0, num_hidden=32, checkpoint_freq=20, max_kl=0.01):
    start_time = time.time()
    clip = None
    dir = 'game'
    game_params = {}

    # Accept a custom grid if the environment requires it
    # (args.grid is expected to come from the script's CLI parser)
    if game == 'Taxi' or game == 'TaxiEasy':
        game_params['grid'] = args.grid
        game_params['box'] = True
    if game in ['RaceStrategy-v0', 'Cliff-v0']:
        game_params['horizon'] = horizon

    # env = Race(gamma=gamma, horizon=horizon, )
    # env_eval = Race(gamma=gamma, horizon=horizon)
    env = make_game(game, game_params)
    env_eval = make_game(game, game_params)
    directory_output = (dir + '/trpo_' + str(num_layers) + '_' +
                        str(num_hidden) + '_' + str(max_kl) + '/')

    def eval_policy_closure(**kwargs):
        return eval_policy(env=env_eval, gamma=gamma, **kwargs)

    tf.set_random_seed(seed)
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    time_str = str(start_time)
    if rank == 0:
        logger.configure(dir=out_dir + '/' + directory_output + '/logs',
                         format_strs=['stdout', 'csv'], suffix=time_str)
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    network = mlp(num_hidden=num_hidden, num_layers=num_layers)

    optimized_policy = trpo_mpi.learn(
        network=network,
        env=env,
        eval_policy=eval_policy_closure,
        timesteps_per_batch=timesteps_per_batch,
        max_kl=max_kl,
        cg_iters=10,
        cg_damping=1e-3,
        total_timesteps=num_timesteps,
        gamma=gamma,
        lam=1.0,
        vf_iters=3,
        vf_stepsize=1e-4,
        checkpoint_freq=checkpoint_freq,
        checkpoint_dir_out=out_dir + '/' + directory_output + '/models/' + time_str + '/',
        load_path=load_path,
        checkpoint_path_in=checkpoint_path_in,
        eval_episodes=eval_episodes,
        init_std=1,
        trainable_variance=True,
        trainable_bias=True,
        clip=clip)

    # Sweep scalar states from 0 to env.dim[0] in increments of delta_state,
    # query the learned policy's action at each point, and plot the resulting
    # action as a function of state.
    s = env.reset()
    done = False

    states = []
    actions = []
    s = 0
    delta_state = 0.2
    while s < env.dim[0]:
        a, _, _, _ = optimized_policy.step([s])
        states.append(s)
        actions.append(a[0])
        s += delta_state
    s = env.reset()
    plt.plot(states, actions)
    plt.show()
    print('TOTAL TIME:', time.time() - start_time)
    print("Time taken: %f sec" % (time.time() - start_time))
    print("Time taken: %f hours" % ((time.time() - start_time) / 3600))

    env.close()
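A hypothetical entry point for train_trpo; note that the Taxi branch above reads grid settings from a module-level args object, so a parser along these lines is assumed to exist in the surrounding script. All argument names added here (--game, --grid) and all values are placeholders.

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--game', type=str, default='Cliff-v0')
    parser.add_argument('--grid', type=str, default=None)
    args = parser.parse_args()

    # Placeholder hyperparameters; the remaining keyword arguments keep
    # the defaults declared in the signature above.
    train_trpo(game=args.game,
               num_timesteps=int(1e6),
               eval_episodes=10,
               seed=1,
               horizon=100,
               out_dir='.')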