def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
    savedir_fname = learn(env,
                          policy_fn,
                          dataset,
                          max_iters=args.BC_max_iter,
                          ckpt_dir=args.checkpoint_dir,
                          log_dir=args.log_dir,
                          task_name=task_name,
                          verbose=True)
    avg_len, avg_ret = runner(env,
                              policy_fn,
                              savedir_fname,
                              timesteps_per_batch=1024,
                              number_trajs=10,
                              stochastic_policy=args.stochastic_policy,
                              save=args.save_sample,
                              reuse=True)
Example #2
def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1):
    print("Running Acer Simple")
    print(locals())
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes) # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack,
                  num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
                  trust_region=trust_region, alpha=alpha, delta=delta)

    runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs*nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()
    for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    env.close()
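The training loop above interleaves one on-policy batch with a Poisson-distributed number of replay updates (`np.random.poisson(replay_ratio)`). A tiny standalone check of that expectation, assuming only numpy, shows the average number of off-policy calls per on-policy batch:

import numpy as np

replay_ratio = 4
draws = np.random.poisson(replay_ratio, size=100000)
print(draws.mean())  # close to replay_ratio: roughly 4 off-policy calls per on-policy batch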
def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()
    def make_env():
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env
    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
        lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
        ent_coef=0.0,
        lr=3e-4,
        cliprange=0.2,
        total_timesteps=num_timesteps)
Example #4
def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes) # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)

    nbatch = nenvs*nsteps
    tstart = time.time()
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
    env.close()
Example #5
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()
    def make_thunk(rank):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])
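A hedged usage sketch for the helper above, assuming the baselines.common.cmd_util-style imports it relies on (make_env, SubprocVecEnv, DummyVecEnv, MPI, logger) are in scope; this just builds four parallel workers and steps them with random actions:

venv = make_vec_env('HalfCheetah-v2', 'mujoco', num_env=4, seed=0)
obs = venv.reset()
for _ in range(10):
    actions = [venv.action_space.sample() for _ in range(venv.num_envs)]
    obs, rewards, dones, infos = venv.step(actions)
venv.close()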
def train(env_id, num_timesteps=300, seed=0, num_env=2, renderer='tiny'):

  def make_env(rank):

    def _thunk():
      if env_id == "TestEnv":
        env = TestEnv(renderer=renderer)  #gym.make(env_id)
      else:
        env = gym.make(env_id)
      env.seed(seed + rank)
      env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
      gym.logger.setLevel(logging.WARN)
      # only clip rewards when not evaluating
      return env

    return _thunk

  set_global_seeds(seed)
  env = SubprocVecEnv([make_env(i) for i in range(num_env)])

  env.reset()
  start = time.time()
  for i in range(num_timesteps):
    action = [env.action_space.sample() for _ in range(num_env)]
    env.step(action)
  stop = time.time()
  duration = (stop - start)
  if (duration):
    fps = num_timesteps / duration
  else:
    fps = 0
  env.close()
  return num_env, fps
Example #7
def train(env, seed, policy_fn, reward_giver, dataset, algo,
          g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter,
          checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None):

    pretrained_weight = None
    if pretrained and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.gail import behavior_clone
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=BC_max_iter)

    if algo == 'trpo':
        from baselines.gail import trpo_mpi
        # Set up for MPI seed
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank,
                       pretrained=pretrained, pretrained_weight=pretrained_weight,
                       g_step=g_step, d_step=d_step,
                       entcoeff=policy_entcoeff,
                       max_timesteps=num_timesteps,
                       ckpt_dir=checkpoint_dir, log_dir=log_dir,
                       save_per_iter=save_per_iter,
                       timesteps_per_batch=1024,
                       max_kl=0.01, cg_iters=10, cg_damping=0.1,
                       gamma=0.995, lam=0.97,
                       vf_iters=5, vf_stepsize=1e-3,
                       task_name=task_name)
    else:
        raise NotImplementedError
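The "Set up for MPI seed" block above gives each MPI worker its own deterministic stream via workerseed = seed + 10000 * rank; for example, with seed=0 and four workers the per-worker seeds are 0, 10000, 20000 and 30000, so rollouts differ across workers while the run as a whole stays reproducible.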
Example #8
def train(env_id, num_frames, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)


    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json"%rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
        max_timesteps=num_timesteps, gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()
Example #9
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)
    env = make_atari(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=int(num_timesteps * 1.1),
        timesteps_per_actorbatch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
def train(num_timesteps, seed):
    num_cpus = 1
    num_casks = 1
    num_cpus += num_casks

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=num_cpus,
                            inter_op_parallelism_threads=num_cpus)
    tf.Session(config=config).__enter__()

    gamma = 0.995

    env = RemoteVecEnv([make_env] * num_cpus)
    env = VecNormalize(env, ret=True, gamma=gamma)

    set_global_seeds(seed)
    policy = policies.MlpPolicy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=128,
               nminibatches=num_cpus-num_casks,
               lam=0.95,
               gamma=gamma,
               noptepochs=4,
               log_interval=1,
               vf_coef=0.5,
               ent_coef=0.0,
               lr=3e-4,
               cliprange=0.2,
               save_interval=2,
               load_path="./logs/course_6/00244",
               total_timesteps=num_timesteps,
               num_casks=num_casks)
Example #11
def train(env_id, num_frames, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
Example #12
def make_mujoco_env(env_id, seed):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = Monitor(env, logger.get_dir())
    env.seed(seed)
    return env
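A minimal sketch of how this helper is typically consumed, assuming a working MuJoCo install and that Monitor and logger are imported as in the snippet above; after wrapping, the env exposes the standard gym API:

env = make_mujoco_env('Hopper-v2', seed=0)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())  # standard gym step after wrapping
env.close()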
Example #13
def make_mujoco_env(env_id, seed):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed + 10000 * rank)
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(logger.get_dir(), str(rank)))
    env.seed(seed)
    return env
Example #14
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for goal-based robotics tasks.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    env = Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
        info_keywords=('is_success',))
    env.seed(seed)
    return env
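A short usage sketch, assuming gym's robotics environments are installed; FlattenDictWrapper concatenates the selected dict keys ('observation', 'desired_goal') into one flat observation vector, and the Monitor records the 'is_success' info key:

env = make_robotics_env('FetchReach-v1', seed=0, rank=0)
obs = env.reset()                                   # flat array: observation followed by desired_goal
obs, reward, done, info = env.step(env.action_space.sample())
print(info.get('is_success'))
env.close()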
Example #15
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    print('Evaluating {}'.format(args.env))
    bc_log = evaluate_env(args.env, args.seed, args.policy_hidden_size,
                          args.stochastic_policy, False, 'BC')
    print('Evaluation for {}'.format(args.env))
    print(bc_log)
    gail_log = evaluate_env(args.env, args.seed, args.policy_hidden_size,
                            args.stochastic_policy, True, 'gail')
    print('Evaluation for {}'.format(args.env))
    print(gail_log)
    plot(args.env, bc_log, gail_log, args.stochastic_policy)
Example #16
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}
    def make_env(rank): # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)
        return _thunk
    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
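A usage sketch for the Atari helper above, assuming the surrounding baselines imports (make_atari, wrap_deepmind, Monitor, SubprocVecEnv) are available; wrapper_kwargs is forwarded to wrap_deepmind, so options such as frame stacking can be switched on there:

venv = make_atari_env('BreakoutNoFrameskip-v4', num_env=4, seed=0,
                      wrapper_kwargs={'frame_stack': True})
obs = venv.reset()          # one DeepMind-preprocessed 84x84 frame stack per worker
venv.close()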
Example #17
def make_mujoco_env(env_id, seed, reward_scale=1.0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    myseed = seed + 1000 * rank if seed is not None else None
    set_global_seeds(myseed)
    env = gym.make(env_id)
    logger_path = None if logger.get_dir() is None else os.path.join(logger.get_dir(), str(rank))
    env = Monitor(env, logger_path, allow_early_resets=True)
    env.seed(seed)
    if reward_scale != 1.0:
        from baselines.common.retro_wrappers import RewardScaler
        env = RewardScaler(env, reward_scale)
    return env
Example #18
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    else:
        raise NotImplementedError
    env.close()
Example #19
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, save_interval=None, lrschedule='linear'):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs,
                               nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip, lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    nbatch = nenvs*nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    env.close()
Example #20
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1) 
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
    env.close()
Example #21
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

    deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )

    env.close()
Example #22
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn, 
            max_timesteps=num_timesteps,
            timesteps_per_batch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
        )
    env.close()
Example #23
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]
    
    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
Example #24
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1) 
    # divide by 4 due to frameskip, then add a little extra so episodes can end
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and 
                os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule)
    env.close()
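As the comment in the function above notes, a frame budget is converted into environment steps by dividing by the frameskip of 4 and padding by 10% so the final episodes can finish; for example, num_frames = 40e6 gives int(40e6 / 4 * 1.1) = 11,000,000 timesteps.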
Example #25
def train(env_id, num_timesteps, seed):
    env=gym.make(env_id)
    if logger.get_dir():
        env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
    set_global_seeds(seed)
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    with tf.Session(config=tf.ConfigProto()) as session:
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf,
            gamma=0.99, lam=0.97, timesteps_per_batch=2500,
            desired_kl=0.002,
            num_timesteps=num_timesteps, animate=False)

        env.close()
Example #26
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
            hid_size=32, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #27
def test(args):
    import filelock
    with filelock.FileLock('/tmp/robotstify.lock'):
        import gym
        import sys
        try:
            import goexplore_py.complex_fetch_env
        except Exception:
            print('Could not import complex_fetch_env, is goexplore_py in PYTHONPATH?')

    import tensorflow as tf
    import horovod.tensorflow as hvd
    hvd.init()
    print('initialized worker %d' % hvd.rank(), flush=True)
    from baselines.common import set_global_seeds
    set_global_seeds(hvd.rank())
    from baselines import bench
    from atari_reset.wrappers import VecFrameStack, VideoWriter, my_wrapper,\
        EpsGreedyEnv, StickyActionEnv, NoopResetEnv, SubprocVecEnv, PreventSlugEnv, FetchSaveEnv, TanhWrap
    from atari_reset.ppo import learn
    from atari_reset.policies import CnnPolicy, GRUPolicy, FFPolicy

    set_global_seeds(hvd.rank())
    ncpu = 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.Session(config=config).__enter__()

    max_noops = 30 if args.noops else 0
    print('SAVE PATH', args.save_path)

    def make_env(rank):
        def env_fn():
            if args.game == 'fetch':
                assert args.fetch_target_location is not None, 'For now, we require a target location for fetch'
                kwargs = {}
                dargs = vars(args)
                for attr in dargs:
                    if attr.startswith('fetch_'):
                        if attr == 'fetch_type':
                            kwargs['model_file'] = f'teleOp_{args.fetch_type}.xml'
                        elif attr != 'fetch_total_timestep':
                            kwargs[attr[len('fetch_'):]] = dargs[attr]

                env = goexplore_py.complex_fetch_env.ComplexFetchEnv(
                    **kwargs
                )
            elif args.game == 'fetch_dumb':
                env = goexplore_py.dumb_fetch_env.ComplexFetchEnv()
            else:
                env = gym.make(args.game + 'NoFrameskip-v4')
                if args.seed_env:
                    env.seed(0)
                # if args.unlimited_score:
                #     # This removes the TimeLimit wrapper around the env
                #     env = env.env
                # env = PreventSlugEnv(env)
            # change for long runs
            # env._max_episode_steps *= 1000
            env = bench.Monitor(env, "{}.monitor.json".format(rank), allow_early_resets=True)
            if False and rank%nenvs == 0 and hvd.local_rank()==0:
                os.makedirs(args.save_path + '/vids/' + args.game, exist_ok=True)
                videofile_prefix = args.save_path + '/vids/' + args.game
                env = VideoWriter(env, videofile_prefix)
            if 'fetch' not in args.game:
                if args.noops:
                    os.makedirs(args.save_path, exist_ok=True)
                    env = NoopResetEnv(env, 30, nenvs, args.save_path, num_per_noop=args.num_per_noop, unlimited_score=args.unlimited_score)
                    env = my_wrapper(env, clip_rewards=True, sticky=args.sticky)
                if args.epsgreedy:
                    env = EpsGreedyEnv(env)
            else:
                os.makedirs(f'{args.save_path}', exist_ok=True)
                env = FetchSaveEnv(env, rank=rank, n_ranks=nenvs, save_path=f'{args.save_path}/', demo_path=args.demo)
                env = TanhWrap(env)
            # def print_rec(e):
            #     print(e.__class__.__name__)
            #     if hasattr(e, 'env'):
            #         print_rec(e.env)
            # import time
            # import random
            # time.sleep(random.random() * 10)
            # print('\tSHOWING STUFF')
            # print_rec(env)
            # print('\n\n\n')
            return env
        return env_fn

    nenvs = args.nenvs
    env = SubprocVecEnv([make_env(i + nenvs * hvd.rank()) for i in range(nenvs)])
    env = VecFrameStack(env, 1 if 'fetch' in args.game else 4)

    if 'fetch' in args.game:
        print('Fetch environment, using the feedforward policy.')
        args.policy = FFPolicy
    else:
        args.policy = {'cnn': CnnPolicy, 'gru': GRUPolicy}[args.policy]

    args.sil_pg_weight_by_value = False
    args.sil_vf_relu = False
    args.sil_vf_coef = 0
    args.sil_coef = 0
    args.sil_ent_coef = 0
    args.ent_coef = 0
    args.vf_coef = 0
    args.cliprange = 1
    args.l2_coef = 0
    args.adam_epsilon = 1e-8
    args.gamma = 0.99
    args.lam = 0.10
    args.scale_rewards = 1.0
    args.sil_weight_success_rate = True
    args.norm_adv = 1.0
    args.log_interval = 1
    args.save_interval = 100
    args.subtract_rew_avg = True
    args.clip_rewards = False
    learn(env, args, True)
Example #28
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          scale=255.0,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          grad_norm_clip=10,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models
        (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which
        will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.
    load_path: str
        path to load the model from. (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------
    actor: Agent
        the trained agent; its save() and load() methods are used above for checkpointing.
    """
    # Create all the functions necessary to train the model

    set_global_seeds(seed)

    input_shape = (1, ) + env.observation_space.shape
    actions = env.action_space.n

    q_model = build_q_model(network,
                            input_shape=input_shape,
                            actions=actions,
                            **network_kwargs)

    actor = Agent(q_model, env.action_space, lr=lr, scale=scale)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model.pth")
        model_saved = False

        if os.path.exists(model_file):
            actor.load(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        now = time.time()
        for t in range(total_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                epsilon = exploration.value(t)
                update_param_noise_threshold = 0.
                action = actor.act(obs, epsilon)
            else:
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))

                action = actor.act_with_param_noise(
                    obs,
                    update_param_noise_threshold=update_param_noise_threshold,
                    update_param_noise_scale=True)

            # action = act(np.array(obs), update_eps=update_eps, **kwargs)
            reset = False
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None

                inputs = (obses_t, actions, rewards, obses_tp1, dones, weights)
                td_errors = actor.update(inputs,
                                         double_q=True,
                                         gamma=gamma,
                                         grad_norm_clip=grad_norm_clip)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                actor.sync()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("Speed", t / (time.time() - now))
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    actor.save(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            actor.load(model_file)

    return actor
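A hedged call sketch for the variant above, assuming its Agent and build_q_model helpers are importable from the same module, using a small discrete-action Gym task:

import gym

env = gym.make('CartPole-v0')
actor = learn(env, network='mlp', lr=1e-3, total_timesteps=100000, print_freq=10)

# Greedy rollout with the returned agent; passing epsilon=0.0 to act() as in the
# training loop above requests an exploration-free action.
obs, done = env.reset(), False
while not done:
    obs, rew, done, _ = env.step(actor.act(obs, 0.0))
env.close()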
import copy
import sys

import gym
import tensorflow as tf

# Use algorithms from baselines
from baselines.acktr.acktr_cont import learn
from baselines.acktr.policies import GaussianMlpPolicy
from baselines.acktr.value_functions import NeuralNetValueFunction
from baselines.common import set_global_seeds

env = gym.make('GazeboModularScara3DOF-v3')
initial_observation = env.reset()
print("Initial observation: ", initial_observation)
env.render()

seed=0
set_global_seeds(seed)
env.seed(seed)

with tf.Session(config=tf.ConfigProto()) as session:
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    with tf.variable_scope("vf"):
        vf = NeuralNetValueFunction(ob_dim, ac_dim)
    with tf.variable_scope("pi"):
        policy = GaussianMlpPolicy(ob_dim, ac_dim)

    learn(env,
        policy=policy, vf=vf,
        gamma=0.99,
        lam=0.97,
        timesteps_per_batch=2500,
        desired_kl=0.002,
        # the remaining arguments are cut off in the source listing; they are closed here
        # following the matching train() call in Example #25 (num_timesteps is an assumed budget)
        num_timesteps=int(1e6),
        animate=False)
Example #30
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs
            ):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models
        (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which
        will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.
    load_path: str
        path to load the model from. (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = get_session()
    set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    observation_space = env.observation_space
    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True
        elif load_path is not None:
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))


        for t in range(total_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_variables(model_file)

    return act
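A minimal call sketch for the baselines-style learn() above, assuming the usual deepq machinery (get_session, build_q_func, ActWrapper, ObservationInput) is importable as in baselines.deepq; the returned ActWrapper is callable on a batch of observations:

import gym

env = gym.make('CartPole-v0')
act = learn(env, network='mlp', lr=1e-3, total_timesteps=100000,
            exploration_fraction=0.1, exploration_final_eps=0.02, print_freq=10)

# Greedy rollout with the returned act wrapper.
obs, done = env.reset(), False
while not done:
    obs, rew, done, _ = env.step(act(obs[None])[0])
env.close()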
Example #31
def train(env, policy, n_episodes, horizon, seed, njobs=1, **alg_args):

    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab.(\w+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            _env = Rllab2GymWrapper(env_rllab)
            return _env

        # Used later
        env_type = 'rllab'
    else:
        # Normal gym, get if Atari or not.
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari, custom env creation
            def make_env():
                _env = make_atari(env)
                return wrap_deepmind(_env)
        else:
            # Not atari, standard env creation
            def make_env():
                env_rllab = gym.make(env)
                return env_rllab

    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    if policy == 'linear' or policy == 'nn':

        def make_policy(name, ob_space, ac_space):
            return MlpPolicy(
                name=name,
                ob_space=ob_space,
                ac_space=ac_space,
                hid_size=hid_size,
                num_hid_layers=num_hid_layers,
                gaussian_fixed_var=True,
                use_bias=False,
                use_critic=False,
                hidden_W_init=tf.contrib.layers.xavier_initializer(),
                output_W_init=tf.contrib.layers.xavier_initializer())
    elif policy == 'cnn':

        def make_policy(name, ob_space, ac_space):
            return CnnPolicy(
                name=name,
                ob_space=ob_space,
                ac_space=ac_space,
                gaussian_fixed_var=True,
                use_bias=False,
                use_critic=False,
                hidden_W_init=tf.contrib.layers.xavier_initializer(),
                output_W_init=tf.contrib.layers.xavier_initializer())
    else:
        raise Exception('Unrecognized policy type.')

    sampler = ParallelSampler(make_policy,
                              make_env,
                              n_episodes,
                              horizon,
                              True,
                              n_workers=njobs,
                              seed=seed)

    try:
        affinity = len(os.sched_getaffinity(0))
    except (AttributeError, OSError):  # os.sched_getaffinity is not available on all platforms
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)

    gym.logger.setLevel(logging.WARN)

    pois.learn(make_env,
               make_policy,
               n_episodes=n_episodes,
               horizon=horizon,
               sampler=sampler,
               **alg_args)

    sampler.close()
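A hedged usage sketch for the train() entry point above; the environment id and hyperparameters are placeholders, and train() is assumed to be in scope together with its POIS-specific **alg_args defaults:

if __name__ == '__main__':
    train(env='CartPole-v1',   # any registered gym id, or 'rllab.<EnvName>'
          policy='nn',         # 'linear', 'nn', or 'cnn'
          n_episodes=100,      # episodes per batch handed to the sampler
          horizon=500,         # maximum episode length
          seed=0,
          njobs=4)             # parallel sampling workers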
Beispiel #32
0
def learn(policy,
          env,
          seed,
          n_steps=20,
          n_stack=4,
          total_timesteps=int(80e6),
          q_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=10,
          learning_rate=7e-4,
          lr_schedule='linear',
          rprop_epsilon=1e-5,
          rprop_alpha=0.99,
          gamma=0.99,
          log_interval=100,
          buffer_size=50000,
          replay_ratio=4,
          replay_start=10000,
          correction_term=10.0,
          trust_region=True,
          alpha=0.99,
          delta=1):
    """
    Train an ACER model.

    :param policy: (ACERPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param env: (Gym environment) The environment to learn from
    :param seed: (int) The initial seed for training
    :param n_steps: (int) The number of steps to run for each environment
    :param n_stack: (int) The number of stacked frames
    :param total_timesteps: (int) The total number of samples
    :param q_coef: (float) Q function coefficient for the loss calculation
    :param ent_coef: (float) Entropy coefficient for the loss calculation
    :param max_grad_norm: (float) The maximum value for the gradient clipping
    :param learning_rate: (float) The learning rate
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param rprop_epsilon: (float) RMS prop optimizer epsilon
    :param rprop_alpha: (float) RMS prop optimizer decay
    :param gamma: (float) Discount factor
    :param log_interval: (int) The number of updates between logging events
    :param buffer_size: (int) The buffer size in number of steps
    :param replay_ratio: (float) The average number of off-policy (replay) updates per on-policy update;
                                 the number each iteration is drawn from a Poisson distribution
    :param replay_start: (int) The minimum number of steps in the buffer, before learning replay
    :param correction_term: (float) The correction term for the weights
    :param trust_region: (bool) Enable Trust region policy optimization loss
    :param alpha: (float) The decay rate for the Exponential moving average of the parameters
    :param delta: (float) trust region delta value
    """
    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)

    n_envs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  n_envs=n_envs,
                  n_steps=n_steps,
                  n_stack=n_stack,
                  num_procs=num_procs,
                  ent_coef=ent_coef,
                  q_coef=q_coef,
                  gamma=gamma,
                  max_grad_norm=max_grad_norm,
                  learning_rate=learning_rate,
                  rprop_alpha=rprop_alpha,
                  rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps,
                  lr_schedule=lr_schedule,
                  correction_term=correction_term,
                  trust_region=trust_region,
                  alpha=alpha,
                  delta=delta)

    runner = Runner(env=env, model=model, n_steps=n_steps, n_stack=n_stack)
    if replay_ratio > 0:
        buffer = Buffer(env=env,
                        n_steps=n_steps,
                        n_stack=n_stack,
                        size=buffer_size)
    else:
        buffer = None
    n_batch = n_envs * n_steps
    acer = Acer(runner, model, buffer, log_interval)
    acer.t_start = time.time()
    for acer.steps in range(
            0, total_timesteps, n_batch
    ):  # n_batch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            samples_number = np.random.poisson(replay_ratio)
            for _ in range(samples_number):
                acer.call(on_policy=False)  # no simulation steps in this

    env.close()
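A usage sketch for the learn() function above. CnnPolicy and SubprocVecEnv stand for the policy class and vectorized-environment wrapper of the same codebase; they are passed in as arguments here because the exact import paths are assumptions, not verified APIs:

def run_acer_example(CnnPolicy, SubprocVecEnv, make_single_env, n_envs=16):
    # learn() expects a vectorized env exposing num_envs, observation_space,
    # action_space and (because of the num_procs HACK above) .remotes, which
    # SubprocVecEnv-style wrappers provide; learn() closes the env itself.
    env = SubprocVecEnv([make_single_env(rank) for rank in range(n_envs)])
    learn(CnnPolicy, env, seed=0, n_steps=20, n_stack=4,
          replay_ratio=4, buffer_size=50000, total_timesteps=int(1e6))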
Beispiel #33
0
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))
    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False)
    eval_env = build_env(args, normalize_ob=False, is_eval=True)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)
    beta = -1
    if beta < 0:
        #print(alg_kwargs)
        nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch']
        # Automatically compute beta based on initial entropy and number of iterations
        policy = build_policy(
            env,
            alg_kwargs['network'],
            value_network='copy',
            normalize_observations=alg_kwargs['normalize_observations'],
            copos=True)
        ob = observation_placeholder(env.observation_space)

        sess = U.single_threaded_session()
        sess.__enter__()
        with tf.variable_scope("tmp_pi"):
            tmp_pi = policy(observ_placeholder=ob)
        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob})
        #beta = 2 * entropy / nr_episodes
        beta = 0
        print("Initial entropy: " + str(entropy) + ", episodes: " +
              str(nr_episodes))
        print("Constantly set beta: " + str(beta))

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))
    iters = 0
    for model in learn(env=env,
                       env_id=env_id,
                       eval_env=eval_env,
                       make_eval_env=lambda: build_env(
                           args, normalize_ob=False, is_eval=True),
                       seed=seed,
                       beta=beta,
                       total_timesteps=total_timesteps,
                       sil_update=args.sil_update,
                       sil_loss=args.sil_loss,
                       **alg_kwargs):
        if args.store_ckpt:
            save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters))
            model.save(save_path)
            if isinstance(env, VecNormalize):
                rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters))
                with open(rms_path, 'wb') as f:
                    rms = (env.ob_rms, env.ret_rms)
                    pickle.dump(rms, f)
            logger.log('Save {} model'.format(iters + 1))
        iters += 1

    return model, env
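The loop above checkpoints both the model and the VecNormalize running statistics, because a restored policy is only meaningful together with the observation/return normalization it was trained with. A small sketch of the matching save/restore pair (file naming is illustrative):

import os.path as osp
import pickle

def save_rms(env, directory, iteration):
    # Persist the observation/return running statistics next to the model checkpoint.
    path = osp.join(directory, 'rms-{}'.format(iteration))
    with open(path, 'wb') as f:
        pickle.dump((env.ob_rms, env.ret_rms), f)
    return path

def load_rms(env, path):
    # Restore the statistics into a freshly built VecNormalize wrapper before evaluation.
    with open(path, 'rb') as f:
        env.ob_rms, env.ret_rms = pickle.load(f)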
Beispiel #34
0
    def learn(self,
              total_timesteps,
              noptepochs=4,
              seed=None,
              log_interval=10,
              save_interval=10):

        set_global_seeds(seed)
        total_timesteps = int(total_timesteps)

        # Determine whether this process is the MPI root (rank 0)
        is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

        epinfobuf = deque(maxlen=100)

        # Start total timer
        tfirststart = time.perf_counter()

        for update in range(1, total_timesteps):
            assert self.nbatch % self.nminibatches == 0
            # Start timer
            tstart = time.perf_counter()
            frac = 1.0 - (update - 1.0) / total_timesteps
            # Calculate the learning rate
            lrnow = self.ppo_model.lr(frac)
            # Calculate the cliprange
            cliprangenow = self.ppo_model.cliprange(frac)

            if update % log_interval == 0 and is_mpi_root:
                logger.info('Stepping environment...')

            # Get minibatch
            obs, returns, actions, values, neglogpacs, epinfos = self.ppo_model.runner.run(
            )  # pylint: disable=E0632

            if update % log_interval == 0 and is_mpi_root:
                logger.info('Done.')

            epinfobuf.extend(epinfos)

            # For each minibatch, compute the loss and collect it.
            mblossvals = []
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(self.nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, self.nbatch, self.nbatch_train):
                    end = start + self.nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, actions,
                                                      values, neglogpacs))
                    mblossvals.append(
                        self.ppo_model.train(lrnow, cliprangenow, *slices))

            # TODO: recurrent version
            # else:  # recurrent version
            #     assert self.nenvs % self.nminibatches == 0
            #     envsperbatch = self.nenvs // self.nminibatches
            #     envinds = np.arange(self.nenvs)
            #     flatinds = np.arange(self.nenvs * self.nsteps).reshape(self.nenvs, self.nsteps)
            #     for _ in range(self.noptepochs):
            #         np.random.shuffle(envinds)
            #         for start in range(0, self.nenvs, envsperbatch):
            #             end = start + envsperbatch
            #             mbenvinds = envinds[start:end]
            #             mbflatinds = flatinds[mbenvinds].ravel()
            #             slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
            #             mbstates = states[mbenvinds]
            #             mblossvals.append(self.train(lrnow, cliprangenow, *slices, mbstates))

            # Feedforward --> get losses --> update
            lossvals = np.mean(mblossvals, axis=0)
            # End timer
            tnow = time.perf_counter()
            # Calculate the fps (frame per second)
            fps = int(self.nbatch / (tnow - tstart))

            if update % log_interval == 0 or update == 1:
                # Check whether the value function is a good predictor of the returns (ev close to 1)
                # or whether it's worse than predicting nothing (ev <= 0)
                ev = explained_variance(values, returns)
                logger.record_tabular("fps", fps)
                logger.record_tabular(
                    'eprewmean',
                    safe_mean([epinfo['r'] for epinfo in epinfobuf]))
                logger.record_tabular(
                    'eplenmean',
                    safe_mean([epinfo['l'] for epinfo in epinfobuf]))
                logger.record_tabular("misc/serial_timesteps",
                                      update * self.nsteps)
                logger.record_tabular("misc/nupdates", update)
                logger.record_tabular("misc/total_timesteps",
                                      update * self.nbatch)
                logger.record_tabular("misc/explained_variance", float(ev))
                logger.record_tabular('misc/time_elapsed', tnow - tfirststart)
                for (lossval, lossname) in zip(lossvals,
                                               self.ppo_model.loss_names):
                    logger.record_tabular('ppo_loss/' + lossname, lossval)

                if is_mpi_root:
                    logger.dump_tabular()

            if save_interval and (update % save_interval == 0
                                  or update == 1) and is_mpi_root:
                file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S',
                                          time.localtime(time.time()))
                model_save_path = self.ppo_model.def_path_pre + file_name
                self.ppo_model.save(model_save_path)

        return self
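Both the learning rate and the clip range above are annealed with the same progress fraction frac = 1 - (update - 1) / total_updates. A standalone sketch of such a schedule helper (the names are illustrative, not the ppo_model.lr/cliprange implementation):

def linear_schedule(initial_value):
    # Returns a schedule f(frac), where frac goes from 1.0 at the start of
    # training down to ~0.0 at the end, matching how lrnow and cliprangenow
    # are computed in the loop above.
    def schedule(frac):
        return initial_value * frac
    return schedule

# Example: anneal lr from 3e-4 and the PPO clip range from 0.2 over 100 updates.
lr = linear_schedule(3e-4)
cliprange = linear_schedule(0.2)
total_updates = 100
for update in range(1, total_updates + 1):
    frac = 1.0 - (update - 1.0) / total_updates
    lrnow, cliprangenow = lr(frac), cliprange(frac)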
Beispiel #35
0
def run_hoof_all(
        network,
        env,
        total_timesteps,
        timesteps_per_batch,  # what to train on
        kl_range,
        gamma_range,
        lam_range,  # advantage estimation
        num_kl,
        num_gamma_lam,
        cg_iters=10,
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        **network_kwargs):
    '''
    learn a policy function with TRPO algorithm
    Parameters:
    ----------
    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets
    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)
    timesteps_per_batch     timesteps per gradient estimation batch
    kl_range                range of candidate KL bounds sampled each iteration (HOOF hyperparameter search)
    gamma_range, lam_range  ranges of candidate discount factors and GAE lambdas sampled each iteration
    num_kl, num_gamma_lam   number of candidate KL bounds and (gamma, lambda) pairs sampled per iteration
    ent_coef                coefficient of policy entropy term in the optimization objective
    cg_iters                number of iterations of conjugate gradient algorithm
    cg_damping              conjugate gradient damping
    vf_stepsize             learning rate for the Adam optimizer used to optimize the value function loss
    vf_iters                number of iterations of value function optimization iterations per each policy optimization step
    total_timesteps           max number of timesteps
    max_episodes            max number of episodes
    max_iters               maximum number of policy optimization iterations
    callback                function to be called with (locals(), globals()) each policy optimization step
    load_path               str, path to load the model from (default: None, i.e. no model is loaded)
    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
    Returns:
    -------
    learnt model
    '''

    MPI = None
    nworkers = 1
    rank = 0

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    # +2 for gamma, lambda
    ob = tf.placeholder(shape=(None, env.observation_space.shape[0] + 2),
                        dtype=env.observation_space.dtype,
                        name='Ob')
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_ratio = U.function(
        [ob, ac, atarg], ratio)  # IS ratio - used for computing IS weights

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)

        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator_with_gl(pi,
                                             env,
                                             timesteps_per_batch,
                                             stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    kl_range = np.atleast_1d(kl_range)
    gamma_range = np.atleast_1d(gamma_range)
    lam_range = np.atleast_1d(lam_range)

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()

        thbefore = get_flat()

        rand_gamma = gamma_range[0] + (
            gamma_range[-1] - gamma_range[0]) * np.random.rand(num_gamma_lam)
        rand_lam = lam_range[0] + (
            lam_range[-1] - lam_range[0]) * np.random.rand(num_gamma_lam)
        rand_kl = kl_range[0] + (kl_range[-1] -
                                 kl_range[0]) * np.random.rand(num_kl)

        opt_polval = -10**8
        est_polval = np.zeros((num_gamma_lam, num_kl))
        ob_gam_lam = []
        tdlamret = []
        vpred = []

        for gl in range(num_gamma_lam):
            obgl, vpredbefore, atarg, tdlr = add_vtarg_and_adv_with_gl(
                pi, seg, rand_gamma[gl], rand_lam[gl])
            ob_gam_lam += [obgl]
            tdlamret += [tdlr]
            vpred += [vpredbefore]
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            pol_ob = np.concatenate(
                (seg['ob'], np.zeros(seg['ob'].shape[:-1] + (2, ))), axis=-1)
            args = pol_ob, seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            def fisher_vector_product(p):
                return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=False)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                surrbefore = lossbefore[0]

                for m, kl in enumerate(rand_kl):
                    lm = np.sqrt(shs / kl)
                    fullstep = stepdir / lm
                    thnew = thbefore + fullstep
                    set_from_flat(thnew)

                    # compute the IS estimates
                    lik_ratio = compute_ratio(*args)
                    est_polval[gl, m] = wis_estimate(seg, lik_ratio)

                    # update best policy found so far
                    if est_polval[gl, m] > opt_polval:
                        opt_polval = est_polval[gl, m]
                        opt_th = thnew
                        opt_kl = kl
                        opt_gamma = rand_gamma[gl]
                        opt_lam = rand_lam[gl]
                        opt_vpredbefore = vpredbefore
                        opt_tdlr = tdlr
                        meanlosses = surr, kl, *_ = allmean(
                            np.array(compute_losses(*args)))
                        improve = surr - surrbefore
                        expectedimprove = g.dot(fullstep)
                    set_from_flat(thbefore)
        logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
        set_from_flat(opt_th)

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        ob_gam_lam = np.concatenate(ob_gam_lam, axis=0)
        tdlamret = np.concatenate(tdlamret, axis=0)
        vpred = np.concatenate(vpred, axis=0)
        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (ob_gam_lam, tdlamret),
                        include_final_partial_batch=False,
                        batch_size=num_gamma_lam * 64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpred, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]

        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        logger.record_tabular("Opt_KL", opt_kl)
        logger.record_tabular("gamma", opt_gamma)
        logger.record_tabular("lam", opt_lam)

        if rank == 0:
            logger.dump_tabular()

    return pi
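Each candidate (gamma, lambda, KL) setting above is scored with wis_estimate(seg, lik_ratio), a weighted importance sampling estimate of the candidate policy's value computed from behaviour-policy rollouts. A generic sketch of such an estimator (this is the standard self-normalized WIS formula, not necessarily the exact function used above):

import numpy as np

def weighted_is_estimate(episode_returns, episode_lik_ratios):
    # episode_lik_ratios[i] = product over timesteps of pi_new(a|s) / pi_old(a|s)
    # for episode i; episode_returns[i] is that episode's return.
    w = np.asarray(episode_lik_ratios, dtype=np.float64)
    r = np.asarray(episode_returns, dtype=np.float64)
    # Self-normalized (weighted) importance sampling: sum(w_i * R_i) / sum(w_i).
    return float(np.sum(w * r) / max(np.sum(w), 1e-8))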
Beispiel #36
0
def learn(*,
          network,
          env,
          total_timesteps,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          update_fn=None,
          init_fn=None,
          mpi_rank_weight=1,
          comm=None,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of updates between model checkpoints (0 disables saving)

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the number of environments
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)
    logger.log(tf.trainable_variables())

    # Load VGG-m Conv Layer parameters
    load_vggm_conv('checkpoint/ACT1-4.ckpt')

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    # Dylan, for tensorboard
    writer = tf.summary.FileWriter(logger.get_dir(),
                                   tf.get_default_session().graph)
    ep_stats = stats(
        ["Total_timesteps", "EpRewMean", "EpLenMean", "FraRewMean"])

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # For each minibatch, compute the loss and collect it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or whether it's worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)

            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update
                              == 1) and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

        if is_mpi_root:
            EpRewMean = safemean([epinfo['r'] for epinfo in epinfobuf])
            EpLenMean = safemean([epinfo['l'] for epinfo in epinfobuf])
            ep_stats.add_all_summary(
                writer,
                [update * nbatch, EpRewMean, EpLenMean, EpRewMean / EpLenMean],
                update)

    return model
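A hedged usage sketch for the learn() function above. The env factory is an assumption (any baselines-style VecEnv works), and note that learn() hard-codes the VGG-M checkpoint path, so 'checkpoint/ACT1-4.ckpt' must exist before calling it:

def run_ppo_example(make_vec_env):
    # make_vec_env() should return a vectorized environment, e.g. a DummyVecEnv.
    env = make_vec_env()
    model = learn(network='cnn', env=env,
                  total_timesteps=int(1e6),
                  nsteps=2048, nminibatches=4, noptepochs=4,
                  lr=3e-4, cliprange=0.2,
                  log_interval=10, save_interval=50)
    return model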
Beispiel #37
0
def learn(
        *,
        network,
        env,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        **network_kwargs):
    #last_perfm=[]
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the Adam optimizer used to optimize the value function loss

    vf_iters                number of iterations of value function optimization iterations per each policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "Entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)

        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)
    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "rms"):
            pi.rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]

        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("AverageReturn", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        #if ((total_timesteps % timesteps_per_batch==0) and (total_timesteps//timesteps_per_batch-iters_so_far<50)) or ((total_timesteps % timesteps_per_batch!=0) and (total_timesteps // timesteps_per_batch +1 - iters_so_far < 50)):
        #            last_perfm.append(np.mean(rewbuffer))

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
    '''logger.record_tabular("EpLenMean", np.mean(lenbuffer))
    logger.record_tabular("EpRewMean", np.mean(rewbuffer))
    logger.record_tabular("AverageReturn", np.mean(np.mean(last_perfm)))
    logger.record_tabular("EpThisIter", len(lens))
    logger.record_tabular("EpisodesSoFar", episodes_so_far)
    logger.record_tabular("TimestepsSoFar", timesteps_so_far)
    logger.record_tabular("TimeElapsed", time.time() - tstart)
    if rank==0:
        logger.dump_tabular()'''
    return pi
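The inner loop above is a standard backtracking line search: the full natural-gradient step is repeatedly halved until the surrogate objective improves without violating the KL constraint. A self-contained sketch of that acceptance rule (names and the evaluate() callback are illustrative):

import numpy as np

def backtracking_line_search(evaluate, theta_old, fullstep, max_kl, max_backtracks=10):
    # evaluate(theta) must return (surrogate, kl) for the candidate parameters theta.
    surr_old, _ = evaluate(theta_old)
    stepsize = 1.0
    for _ in range(max_backtracks):
        theta_new = theta_old + stepsize * fullstep
        surr, kl = evaluate(theta_new)
        improve = surr - surr_old
        if not np.isfinite([surr, kl]).all():
            pass                     # non-finite losses: shrink the step
        elif kl > 1.5 * max_kl:
            pass                     # KL constraint violated: shrink the step
        elif improve < 0:
            pass                     # surrogate got worse: shrink the step
        else:
            return theta_new         # accept this step size
        stepsize *= 0.5
    return theta_old                 # no acceptable step found: keep old parameters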
Beispiel #38
0
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    env.env.assist_timeout = 100.0

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                                 hid_size=64, num_hid_layers=3, gmm_comp=1,
                                                 mirror_loss=True,
                                                 observation_permutation=np.array(
                                                     [0.0001, -1, 2, -3, -4, -11, 12, -13, 14, 15, 16, -5, 6, -7, 8, 9,
                                                      10, -17, 18, -19, -24, 25, -26, 27, -20, 21, -22, 23, \
                                                      28, 29, -30, 31, -32, -33, -40, 41, -42, 43, 44, 45, -34, 35, -36,
                                                      37, 38, 39, -46, 47, -48, -53, 54, -55, 56, -49, 50, -51, 52, 58,
                                                      57, 59]),
                                                 action_permutation=np.array(
                                                     [-6, 7, -8, 9, 10, 11, -0.001, 1, -2, 3, 4, 5, -12, 13, -14, -19,
                                                      20, -21, 22, -15, 16, -17, 18]))

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    previous_params = None
    iter_num = 0
    last_iter = False

    # if initialize from previous runs
    #previous_params = joblib.load('')
    #env.env.env.assist_schedule = []

    joblib.dump(str(env.env.env.__dict__),
                logger.get_dir() + '/env_specs.pkl',
                compress=True)

    reward_threshold = None
    while True:
        if not last_iter:
            rollout_length_thershold = env.env.env.assist_schedule[2][
                0] / env.env.env.dt
        else:
            rollout_length_thershold = None
        opt_pi, rew = pposgd_mirror.learn(
            env,
            policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_batch=int(2500),
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
            callback=callback,
            sym_loss_weight=4.0,
            positive_rew_enforce=False,
            init_policy_params=previous_params,
            reward_drop_bound=500,
            rollout_length_thershold=rollout_length_thershold,
            policy_scope='pi' + str(iter_num),
            return_threshold=reward_threshold,
        )
        if iter_num == 0:
            reward_threshold = 0.7 * rew
        if last_iter:
            reward_threshold = None
        iter_num += 1

        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val
        # update the assist schedule
        for s in range(len(env.env.env.assist_schedule) - 1):
            env.env.env.assist_schedule[s][1] = np.copy(
                env.env.env.assist_schedule[s + 1][1])
        env.env.env.assist_schedule[-1][1][0] *= 0.75
        env.env.env.assist_schedule[-1][1][1] *= 0.75
        if env.env.env.assist_schedule[-1][1][0] < 5.0:
            env.env.env.assist_schedule[-1][1][0] = 0.0
        if env.env.env.assist_schedule[-1][1][1] < 5.0:
            env.env.env.assist_schedule[-1][1][1] = 0.0
        zero_assist = True
        for s in range(len(env.env.env.assist_schedule) - 1):
            for v in env.env.env.assist_schedule[s][1]:
                if v != 0.0:
                    zero_assist = False
        print('Current Schedule: ', env.env.env.assist_schedule)
        if zero_assist:
            last_iter = True
            print('Entering Last Iteration!')

    env.close()
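The observation_permutation and action_permutation vectors above encode mirror symmetry as signed indices: the magnitude selects the source dimension and the sign flips it, with tiny values such as 0.0001 and -0.001 standing in for "index 0 with an explicit sign" (a plain 0 cannot carry one). A small sketch of applying such a vector to mirror a state; this is the assumed semantics, not the MlpMirrorPolicy implementation:

import numpy as np

def mirror_vector(x, signed_perm):
    # signed_perm[i] = +/- j means "mirrored x[i] comes from x[j], possibly negated".
    x = np.asarray(x, dtype=np.float64)
    out = np.empty(len(signed_perm))
    for i, p in enumerate(signed_perm):
        src = int(round(abs(p)))          # 0.0001 / -0.001 both round to index 0
        sign = 1.0 if p > 0 else -1.0
        out[i] = sign * x[src]
    return out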
Beispiel #39
0
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs):

    '''
    Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf)
    Train an agent with given network architecture on a given environment using ACER.

    Parameters:
    ----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

    env:                environment. Needs to be vectorized for parallel environment simulation.
                        The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel) (default: 20)

    nstack:             int, size of the frame stack, i.e. number of frames passed to the step model. Frames are stacked along the channel dimension
                        (last image dimension). In this version nstack is read from the VecFrameStack wrapper (the env is wrapped with nstack=1 if it is not already stacked) rather than passed as an argument.

    total_timesteps:    int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

    q_coef:             float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods)

    ent_coef:           float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm:      float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10),

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                        returns fraction of the learning rate (specified as lr) as output

    rprop_epsilon:      float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

    rprop_alpha:        float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting factor (default: 0.99)

    log_interval:       int, number of updates between logging events (default: 100)

    buffer_size:        int, size of the replay buffer (default: 50k)

    replay_ratio:       int, how many (on average) batches of data to sample from the replay buffer for each batch collected from the environment (default: 4)

    replay_start:       int, sampling from the replay buffer does not start until the replay buffer has at least this many samples (default: 10k)

    c:                  float, importance weight clipping factor (default: 10)

    trust_region:       bool, whether or not the algorithm estimates the gradient of the KL divergence between the old and updated policy and uses it to determine the step size (default: True)

    delta:              float, max KL divergence between the old policy and updated policy (default: 1)

    alpha:              float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99)

    load_path:          str, path to load the model from (default: None)

    **network_kwargs:               keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                    For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)
    if not isinstance(env, VecFrameStack):
        env = VecFrameStack(env, 1)

    policy = build_policy(env, network, estimate_q=True, **network_kwargs)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

    nstack = env.nstack
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
                  trust_region=trust_region, alpha=alpha, delta=delta)

    runner = Runner(env=env, model=model, nsteps=nsteps)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs*nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()

    for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    return model
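

# A minimal usage sketch for the learn() above, assuming baselines' make_vec_env
# and VecFrameStack helpers; the Atari env id and hyperparameters are
# illustrative only.
if __name__ == '__main__':
    from baselines.common.cmd_util import make_vec_env
    from baselines.common.vec_env.vec_frame_stack import VecFrameStack

    # Four parallel Atari workers with the standard DeepMind wrappers, stacked to 4 frames.
    venv = VecFrameStack(make_vec_env('PongNoFrameskip-v4', 'atari', num_env=4, seed=0), 4)
    model = learn(network='cnn', env=venv, seed=0, total_timesteps=int(1e6))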
Beispiel #40
def learn(
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        nb_eval_steps=100,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        save_interval=100,
        load_path=None,
        **network_kwargs):

    set_global_seeds(seed)

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    # print(env.action_space.shape)
    # print(env.action_space.n)
    # nb_actions = env.action_space.shape[-1]
    continuous_ctrl = not isinstance(env.action_space, spaces.Discrete)
    nb_actions = env.action_space.shape[
        -1] if continuous_ctrl else env.action_space.n
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    action_shape = env.action_space.shape if continuous_ctrl else (
        nb_actions, )

    # print(env.action_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
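    # noise_type is a comma-separated spec; each token is either 'none' or
    # '<kind>_<stddev>', e.g. 'adaptive-param_0.2', 'normal_0.1', 'ou_0.2'.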
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # max_action = env.action_space.high
    max_action = 1
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)

    if load_path is not None:
        load_path = "{}/{}/checkpoints/checkpoints-final".format(
            Config.results_dir, load_path)
        load_path = osp.expanduser(load_path)
        agent.load(load_path)

    sess.graph.finalize()

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar

    epoch = 0

    start_time = time.time()

    # Running totals over the whole run; these are divided by episode/step counts
    # when the stats are logged below.
    epoch_episode_rewards = 0.0
    epoch_episode_steps = 0.0
    epoch_actions = 0.0
    epoch_qs = 0.0
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                # print(action)
                # print(np.argmax(max_action * action, axis=1))

                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                action_step = action if continuous_ctrl else np.argmax(
                    max_action * action, axis=1)
                new_obs, r, done, info = env.step(
                    action_step
                )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # note these outputs are batched from vecenv

                # print(new_obs)
                # print(r)

                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                # epoch_actions.append(action_step)
                epoch_actions += sum(action_step)
                epoch_qs += q
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.

                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards += episode_reward[d]
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps += episode_step[d]
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    eval_action_step = eval_action if continuous_ctrl else np.argmax(
                        max_action * eval_action, axis=1)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        eval_action_step
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        if MPI is not None:
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            mpi_size = 1

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats[
            Config.tensorboard_rootdir +
            'rollout/return'] = epoch_episode_rewards / float(episodes)
        combined_stats[Config.tensorboard_rootdir +
                       'rollout/return_history'] = np.mean(
                           episode_rewards_history)
        combined_stats[
            Config.tensorboard_rootdir +
            'rollout/episode_steps'] = epoch_episode_steps / float(episodes)
        combined_stats[Config.tensorboard_rootdir +
                       'rollout/actions_mean'] = epoch_actions / float(t)
        combined_stats[Config.tensorboard_rootdir +
                       'rollout/Q_mean'] = epoch_qs / float(t)
        combined_stats[Config.tensorboard_rootdir +
                       'train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats[Config.tensorboard_rootdir +
                       'train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats[Config.tensorboard_rootdir +
                       'train/param_noise_distance'] = np.mean(
                           epoch_adaptive_distances)
        combined_stats[Config.tensorboard_rootdir +
                       'total/duration'] = duration
        combined_stats[Config.tensorboard_rootdir +
                       'total/steps_per_second'] = float(t) / float(duration)
        combined_stats[Config.tensorboard_rootdir +
                       'total/episodes'] = episodes
        combined_stats[Config.tensorboard_rootdir +
                       'rollout/episodes'] = epoch_episodes
        # combined_stats[Config.tensorboard_rootdir+'rollout/actions_std'] = np.std(epoch_actions)
        # Evaluation statistics.
        if eval_env is not None:
            combined_stats[Config.tensorboard_rootdir +
                           'eval/return'] = eval_episode_rewards
            combined_stats[Config.tensorboard_rootdir +
                           'eval/return_history'] = np.mean(
                               eval_episode_rewards_history)
            combined_stats[Config.tensorboard_rootdir + 'eval/Q'] = eval_qs
            combined_stats[Config.tensorboard_rootdir +
                           'eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        # Take the first element of each stat, sum across MPI workers, and divide
        # by the worker count below to report per-worker means.
        combined_stats_sums = np.array(
            [np.array(x).flatten()[0] for x in combined_stats.values()])
        if MPI is not None:
            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)

        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats[Config.tensorboard_rootdir + 'total/epochs'] = epoch + 1
        combined_stats[Config.tensorboard_rootdir + 'total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        # logdir = logger.get_dir()
        # if rank == 0 and logdir:
        #     if hasattr(env, 'get_state'):
        #         with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
        #             pickle.dump(env.get_state(), f)
        #     if eval_env and hasattr(eval_env, 'get_state'):
        #         with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
        #             pickle.dump(eval_env.get_state(), f)

        if save_interval and (epoch % save_interval == 0
                              or epoch == 1) and logger.get_dir() and (
                                  MPI is None
                                  or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % epoch)
            print('Saving to', savepath)
            agent.save(savepath)

    return agent
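

# A minimal usage sketch for the DDPG learn() above, assuming a vectorized
# continuous-control env; the env id, step counts and noise spec are
# illustrative only.
if __name__ == '__main__':
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    # Single-worker vectorized env so observations/rewards arrive batched.
    venv = DummyVecEnv([lambda: gym.make('Pendulum-v0')])
    agent = learn(network='mlp', env=venv, seed=0, total_timesteps=20000,
                  noise_type='ou_0.2', nb_epoch_cycles=5, nb_rollout_steps=100)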
Beispiel #41
try:
    logdir = '/tmp/ros2learn/' + defaults['env_name'] + '/ppo2_lstm_results/'
finally:
    logger.configure(os.path.abspath(logdir))
    csvdir = logdir + "/csv/"

csv_files = [csvdir + "det_obs.csv", csvdir + "det_acs.csv", csvdir + "det_rew.csv" ]
if not os.path.exists(csvdir):
    os.makedirs(csvdir)
else:
    for f in csv_files:
        if os.path.isfile(f):
            os.remove(f)

env = DummyVecEnv([make_env])

set_global_seeds(defaults['seed'])

if isinstance(defaults['lr'], float):
    defaults['lr'] = constfn(defaults['lr'])
else:
    assert callable(defaults['lr'])
if isinstance(defaults['cliprange'], float):
    defaults['cliprange'] = constfn(defaults['cliprange'])
else:
    assert callable(defaults['cliprange'])

alg_kwargs = {'nlstm': defaults['nlstm'], 'layer_norm': defaults['layer_norm']}
policy = build_policy(env, defaults['network'], **alg_kwargs)

nenvs = env.num_envs
ob_space = env.observation_space
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_path=None,
          begin_iter=40, model_train=4, random_initial_ratio=0, exponent=1.0,
          K=0, beta=1, alpha=0.1, index_type='min', reward_freq=40, r_ex_coef=1,
          **network_kwargs):


    '''
    K=0: Original PPO; K=1: Single Surprise.
    index_type: one of 'min', 'max', 'ens', 'avg'.

    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    np.set_printoptions(precision=3)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    # Note: unlike stock ppo2 there is no build_policy call for the policy itself;
    # `network` is passed straight through, so it appears to be expected to already
    # be a policy constructor rather than an architecture name.
    policy = network

    dynamics = build_dynamics(env, 'mlp', **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm)
    model = make_model()
    if load_path is not None:
        model.load(load_path)

    # Instantiate the dynamics class object
    dynamics_class = Dynamics(dynamics=dynamics, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps,K=K)

    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, dynamics=dynamics_class,
                    index_type=index_type, ex_coef=r_ex_coef, beta=beta, reward_freq=reward_freq)
    if eval_env is not None:
        eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam, dynamics=dynamics_class,
                             index_type=index_type, ex_coef=r_ex_coef, beta=beta, reward_freq=reward_freq)


    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    for _ in range(begin_iter):
        runner.run_begin()


    # Start total timer
    tfirststart = time.time()

    nupdates = total_timesteps//nbatch

    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0

        logger.log("********** Iteration %i ************" % update)

        ### random_initial_ratio = 0 -> no random selection
        ### 0 <= random_initial_ratio < 1
        if update >= int(random_initial_ratio*nupdates):
            runner.flag = False


        ### Store old parameters of every model
        runner.old_weights = runner.weights

        ### Store the old parameters of every model; each model is restored to them
        ### after its new weights are staged, and they are reused later for the
        ### 1-step surprise calculation
        model_old_weights_list = []
        for index in range(K):
            param_temp = dynamics_class.MDP_get_flat[index]()
            model_old_weights_list.append(param_temp)

        ### Store new parameters of every model
        model_new_weights_list = []

        ### Store training sets for test
        #dyn_batch_list = []

        for index in range(K): ## We do not have to randomly order
            dyn_batch = runner.replay_memory.sample(batch_size=nbatch) ## Every model should train with different batches
            #dyn_batch_list.append(dyn_batch)
            dynamics_inds = np.arange(nbatch)
            for _ in range(model_train):
                # Randomize the indexes
                np.random.shuffle(dynamics_inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train): ## nbatch_train = 64
                    end = start + nbatch_train
                    dynamics_mbinds = dynamics_inds[start:end]
                    dynamics_slices = (arrr[dynamics_mbinds] for arrr in (dyn_batch['obs0'], dyn_batch['actions'], dyn_batch['obs1']))
                    dynamics_class.dynamics_train(index, *dynamics_slices, runner.weights, alpha)
            Param_Temp = dynamics_class.MDP_get_flat[index]()
            model_new_weights_list.append(Param_Temp)
            dynamics_class.MDP_set_from_flat[index](model_old_weights_list[index])


        if K >= 2:  # K = 2, 3, 4, ...
            q_batch = runner.replay_memory.sample(batch_size=nbatch)
            dynamics_inds = np.arange(nbatch)
            for _ in range(model_train):
                # Randomize the indexes
                np.random.shuffle(dynamics_inds)
                # 0 to batch_size with batch_train_size step
                for start2 in range(0, nbatch, nbatch_train):  ## nbatch_train = 64
                    end2 = start2 + nbatch_train
                    dynamics_mbinds = dynamics_inds[start2:end2]
                    q_slices = (arrry[dynamics_mbinds] for arrry in
                                (q_batch['obs0'], q_batch['actions'], q_batch['obs1']))
                    runner.weights = dynamics_class.weight_train(runner.weights, K, *q_slices, nbatch_train,
                                                                 update, nupdates, exponent)

        ### Setting new parameters simultaneously
        for index in range(K):
            dynamics_class.MDP_set_from_flat[index](model_new_weights_list[index])
            dynamics_class.old_MDP_set_from_flat[index](model_old_weights_list[index]) ### To make actual 1-step surprise

        # Start timer
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
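        # Non-recurrent policies shuffle individual transitions below; recurrent
        # policies shuffle whole environments so each minibatch keeps contiguous
        # rollouts (and their LSTM states) together.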
        if states is None: # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    # print("slices", slices)
                    # print("*slices", *slices)
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else: # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.time()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev close to 1)
            # or whether it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            #print("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) )
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) )
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and MPI.COMM_WORLD.Get_rank() == 0:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
Beispiel #43
def train_mirror(args, num_timesteps):
    from baselines.ppo1 import mlp_mirror_policy, mlp_mirror_norms_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env)

    env.env._seed(args.seed + MPI.COMM_WORLD.Get_rank())
    env.env.init_params(args)

    U.ALREADY_INITIALIZED = set()
    U.ALREADY_INITIALIZED.update(set(tf.global_variables()))

    # Mirror-symmetry permutation of the observation: |value| gives the source
    # index and a negative sign flips that component; 0.0001 appears to stand in
    # for index 0 so that its sign can still be expressed.
    obs_per = np.array([
        0.0001, -1, 2, -3, -4, 11, 12, 13, 14, 15, 16, 5, 6, 7, 8, 9, 10, -17,
        18, -19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, -30, 31, -32, -33, 40,
        41, 42, 43, 44, 45, 34, 35, 36, 37, 38, 39, -46, 47, -48, 53, 54, 55,
        56, 49, 50, 51, 52
    ])

    if env.env.include_additional_info:
        obs_per = np.concatenate((obs_per, np.array([58, 57])))
        obs_per = np.concatenate((obs_per, np.array([59])))
        obs_per = np.concatenate((obs_per, np.array([63, 64, -65, 60, 61,
                                                     -62])))
        obs_per = np.concatenate((obs_per, np.array([66, 67, -68])))
        obs_per = np.concatenate((obs_per, np.array([72, 73, -74, 69, 70,
                                                     -71])))
        obs_per = np.concatenate((obs_per, np.array([75, 76, -77])))
        obs_per = np.concatenate((obs_per, np.array([78, 79, -80])))
        assert env.env.obs_dim == (57 + 3 + 3 * 6 + 3)
        assert env.env.act_dim == 21  # change action/state permutation if change action/state in env

    def policy_fn(name, ob_space, ac_space):
        if env.env.env.state_self_standardize:
            return mlp_mirror_norms_policy.MlpMirrorNormsPolicy(
                name=name,
                ob_space=ob_space,
                ac_space=ac_space,
                hid_size=args.hsize,
                num_hid_layers=args.layers,
                gmm_comp=1,
                mirror_loss=True,
                observation_permutation=obs_per,
                action_permutation=np.array([
                    5, 6, 7, 8, 9, 0.0001, 1, 2, 3, 4, -10, 11, -12, 17, 18,
                    19, 20, 13, 14, 15, 16
                ]))
        else:
            return mlp_mirror_policy.MlpMirrorPolicy(
                name=name,
                ob_space=ob_space,
                ac_space=ac_space,
                hid_size=args.hsize,
                num_hid_layers=args.layers,
                gmm_comp=1,
                mirror_loss=True,
                observation_permutation=obs_per,
                action_permutation=np.array([
                    5, 6, 7, 8, 9, 0.0001, 1, 2, 3, 4, -10, 11, -12, 17, 18,
                    19, 20, 13, 14, 15, 16
                ]))

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    joblib.dump(str(env.env.env.__dict__),
                logger.get_dir() + '/env_specs.pkl',
                compress=True)
    with open(logger.get_dir() + '/env_specs.txt', 'w') as f:
        pprint.pprint(env.env.env.__dict__, f)
    shutil.copyfile(env.env.env.model_file_name,
                    logger.get_dir() + '/using_model.skel')

    cur_sym_loss = 1.0
    iter_num = 0
    previous_params = None
    # previous_params = joblib.load('')
    reward_threshold = None
    rollout_length_threshold = None
    pposgd_mirror.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=int(2000),
        clip_param=args.clip,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        callback=callback,
        sym_loss_weight=cur_sym_loss,
        init_policy_params=previous_params,
        reward_drop_bound=None,
        rollout_length_threshold=rollout_length_threshold,
        policy_scope='pi' + str(iter_num),
        return_threshold=reward_threshold,
    )

    env.close()
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartWalker3d-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument(
        '--init_policy',
        help='Initial Policy',
        default=
        'data/ppo_DartWalker3d-v1303_energy04_vel15_mirror4_velrew3_asinput_curriculum/policy_params.pkl'
    )
    parser.add_argument('--init_curriculum',
                        help='Initial Curriculum',
                        nargs='+',
                        default=[2000.0, 1000])
    parser.add_argument(
        '--ref_policy',
        help='Reference Policy',
        default=
        'data/ppo_DartWalker3d-v1303_energy04_vel15_mirror4_velrew3_asinput_curriculum/policy_params.pkl'
    )
    parser.add_argument('--ref_curriculum',
                        help='Reference Curriculum',
                        nargs='+',
                        default=[2000.0, 1000])
    parser.add_argument('--anc_thres',
                        help='Anchor Threshold',
                        type=float,
                        default=0.85)
    parser.add_argument('--prog_thres',
                        help='Progress Threshold',
                        type=float,
                        default=0.6)
    parser.add_argument('--batch_size',
                        help='Batch Size',
                        type=int,
                        default=2500)
    parser.add_argument('--max_iter',
                        help='Maximum Iteration',
                        type=int,
                        default=2000)
    parser.add_argument('--use_reftraj',
                        help='Use reference trajectory',
                        type=int,
                        default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_curriculum_150eachit_vel15_runningavg3_e04_' +
                     args.env + '_' + str(args.seed) + '_' +
                     str(args.anc_thres) + '_' + str(args.prog_thres) + '_' +
                     str(args.batch_size))

    sess = U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env)

    ob_space = env.observation_space
    ac_space = env.action_space

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=3,
            gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array([
                0.0001, -1, 2, -3, -4, -5, -6, 7, 14, -15, -16, 17, 18, -19, 8,
                -9, -10, 11, 12, -13, 20, 21, -22, 23, -24, -25, -26, -27, 28,
                35, -36, -37, 38, 39, -40, 29, -30, -31, 32, 33, -34, 42, 41,
                43
            ]),
            action_permutation=np.array([
                -0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8
            ]))

    policy = policy_fn('policy', ob_space, ac_space)
    init_curriculum = np.array(args.init_curriculum)
    ref_policy = policy_fn('ref_policy', ob_space, ac_space)
    ref_curriculum = np.array(args.ref_curriculum)

    policy_params = joblib.load(args.init_policy)
    ref_policy_params = joblib.load(args.ref_policy)
    U.initialize()
    cur_scope = policy.get_variables()[0].name[
        :policy.get_variables()[0].name.find('/')]
    orig_scope = list(policy_params.keys())[0][
        :list(policy_params.keys())[0].find('/')]
    ref_scope = list(ref_policy_params.keys())[0][
        :list(ref_policy_params.keys())[0].find('/')]
    for i in range(len(policy.get_variables())):
        assign_op = policy.get_variables()[i].assign(
            policy_params[policy.get_variables()[i].name.replace(
                cur_scope, orig_scope, 1)])
        sess.run(assign_op)
        assign_op = ref_policy.get_variables()[i].assign(
            ref_policy_params[ref_policy.get_variables()[i].name.replace(
                'ref_' + cur_scope, ref_scope, 1)])
        sess.run(assign_op)

    anchor_threshold = args.anc_thres
    progress_threshold = args.prog_thres

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    curriculum_evolution = []

    env.env.env.anchor_kp = ref_curriculum
    ref_score = None
    ref_max_score = None
    reference_trajectory = None
    if MPI.COMM_WORLD.Get_rank() == 0:
        if args.use_reftraj == 1:
            reference_trajectory = gen_reftraj(env, ref_policy, 299)
            env.env.reference_trajectory = reference_trajectory
        ref_score, ref_max_score = evaluate_policy(env, ref_policy, 20)
    ref_score = MPI.COMM_WORLD.bcast(ref_score, root=0)
    ref_max_score = MPI.COMM_WORLD.bcast(ref_max_score, root=0)
    reference_score = ref_score * progress_threshold
    reference_anchor_score = ref_score * anchor_threshold
    reference_max_score = ref_max_score * 0.9
    env.env.env.anchor_kp = init_curriculum
    reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0)
    env.env.reference_trajectory = reference_trajectory

    current_curriculum = np.copy(init_curriculum)
    print('reference scores: ', reference_score, reference_anchor_score,
          reference_max_score)

    previous_params = policy_params
    for iter in range(args.max_iter):
        print('curriculum iter ', iter)
        print('ref score: ', reference_anchor_score)

        opt_pi, final_rew = pposgd_mirror.learn(
            env,
            policy_fn,
            max_timesteps=args.batch_size * MPI.COMM_WORLD.Get_size() * 150,
            timesteps_per_batch=int(args.batch_size),
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
            callback=callback,
            sym_loss_weight=2.0,
            return_threshold=reference_anchor_score,
            init_policy_params=previous_params,
            policy_scope='pi' + str(iter),
            min_iters=0,
            reward_drop_bound=True,
            #max_threshold = reference_max_score,
        )
        print('one learning iteration done')
        if np.linalg.norm(current_curriculum) >= 0.0001:
            # re-compute reference trajectory
            if MPI.COMM_WORLD.Get_rank() == 0 and args.use_reftraj == 1:
                print('recompute reference traj')
                reference_trajectory = gen_reftraj(env, opt_pi, 299)
            reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory,
                                                        root=0)
            env.env.reference_trajectory = reference_trajectory

            if final_rew < reference_anchor_score * 0.95:
                print('update reference scores')
                reference_score = reference_score / reference_anchor_score * final_rew
                reference_anchor_score = final_rew

            closest_candidate = None
            if MPI.COMM_WORLD.Get_rank() == 0:
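                # Probe the two negative axes, the direction back toward zero
                # assist, and two interpolated directions; keep the candidate
                # returned by binary_search_curriculum that lies closest to the
                # origin.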
                directions = [
                    np.array([-1, 0]),
                    np.array([0, -1]),
                    -current_curriculum / np.linalg.norm(current_curriculum)
                ]
                int_d1 = directions[0] + directions[2]
                int_d2 = directions[1] + directions[2]
                directions.append(int_d1 / np.linalg.norm(int_d1))
                directions.append(int_d2 / np.linalg.norm(int_d2))

                #directions = [np.array([0.0, -1.0])] # only search in one direction
                candidate_next_anchors = []
                for direction in directions:
                    found_point, perf = binary_search_curriculum(
                        env, opt_pi, current_curriculum, direction,
                        reference_score, reference_max_score, 6)
                    print(direction, found_point, perf)
                    candidate_next_anchors.append(found_point)
                    if closest_candidate is None:
                        closest_candidate = np.copy(found_point)
                    elif np.linalg.norm(closest_candidate) > np.linalg.norm(
                            found_point):
                        closest_candidate = np.copy(found_point)
                if np.linalg.norm(closest_candidate) < 0.5:
                    closest_candidate = np.array([0, 0])
                if np.abs(closest_candidate[0]) < 0.5:
                    closest_candidate[0] = 0.0
                if np.abs(closest_candidate[1]) < 0.5:
                    closest_candidate[1] = 0.0
            closest_candidate = MPI.COMM_WORLD.bcast(closest_candidate, root=0)

            current_curriculum = np.copy(closest_candidate)
        env.env.env.anchor_kp = current_curriculum
        '''print('Update Init Pose Distributions')
        update_init_poses(env, opt_pi)
        if MPI.COMM_WORLD.Get_rank() == 0:
            joblib.dump([env.env.env.init_qs, env.env.env.init_dqs], logger.get_dir()+'/init_poses_'+np.array2string(current_curriculum, separator=',')+'.pkl', compress=True)
            joblib.dump([env.env.env.init_qs, env.env.env.init_dqs], logger.get_dir() + '/init_poses.pkl', compress=True)'''

        curriculum_evolution.append(current_curriculum)
        print('Current curriculum: ', current_curriculum)
        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val
        if np.linalg.norm(current_curriculum) < 0.0001:
            if reference_anchor_score < ref_score:
                reference_anchor_score = ref_score
            else:
                break

    env.close()
Beispiel #45
def launch(env_name,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           override_params={},
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            +
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            +
            'are looking to reproduce those results, be aware of this. Please also refer to '
            +
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies)
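
# A minimal usage sketch for launch() above, mirroring the usual HER defaults;
# the env id and settings are illustrative assumptions.
if __name__ == '__main__':
    launch(env_name='FetchReach-v1', logdir=None, n_epochs=50, num_cpu=1,
           seed=0, replay_strategy='future', policy_save_interval=5,
           clip_return=1)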
Beispiel #46
def launch(env, trial_id, n_epochs, num_cpu, seed, policy_save_interval, clip_return, normalize_obs,
           structure, task_selection, goal_selection, goal_replay, task_replay, perturb, save_policies=True):

    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        save_dir = find_save_path('./save/' + env + "/", trial_id)
        logger.configure(dir=save_dir)
    else:
        save_dir = None

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params, add main function arguments and log all parameters
    if structure == 'curious' or structure == 'task_experts':
        params = config.MULTI_TASK_PARAMS
    else:
        params = config.DEFAULT_PARAMS

    time = str(datetime.datetime.now())
    params['time'] = time
    params['env_name'] = env
    params['task_selection'] = task_selection
    params['goal_selection'] = goal_selection
    params['task_replay'] = task_replay
    params['goal_replay'] = goal_replay
    params['structure'] = structure
    params['normalize_obs'] = normalize_obs
    params['num_cpu'] = num_cpu
    params['clip_return'] = clip_return
    params['trial_id'] = trial_id
    params['seed'] = seed
    if rank == 0:
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(params, f)
    params = config.prepare_params(params)
    params['ddpg_params']['normalize_obs'] = normalize_obs
    if rank == 0:
        config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Colas et al. (2018, https://arxiv.org/abs/1810.06284) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)

    buffers = config.configure_buffer(dims=dims, params=params)

    # creates several policies with shared buffers in the task-experts structure, otherwise use just one policy
    if structure == 'task_experts':
        policy = [config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return, t_id=i) for i in range(params['nb_tasks'])]
    else:
        policy = config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return)


    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'structure': structure,
        'task_selection': task_selection,
        'goal_selection': goal_selection,
        'queue_length': params['queue_length'],
        'eval': False,
        'eps_task': params['eps_task']
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'structure' : structure,
        'task_selection': task_selection,
        'goal_selection' : goal_selection,
        'queue_length': params['queue_length'],
        'eval': True,
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    if structure == 'task_experts':
        # create one rollout worker per policy/task
        rollout_worker = [RolloutWorker(params['make_env'], policy[i], dims, logger, unique_task=i, **rollout_params) for i in range(params['nb_tasks'])]
        for i in range(params['nb_tasks']):
            rollout_worker[i].seed(rank_seed + i)
    else:
        rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed + 100)

    train(logdir=save_dir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], perturbation_study=perturb,
          policy_save_interval=policy_save_interval, save_policies=save_policies, structure=structure, task_selection=task_selection, params=params)
Beispiel #47
def train(sess, env_id, num_timesteps, timesteps_per_actor, autoencoders,
          seed):
    from baselines.ppo1 import pposgd_novelty, mlp_policy_novelty, pposgd_novelty_projection, mlp_policy_mirror_novelty, \
        pposgd_mirror_novelty, pposgd_mirror_novelty_projection

    rank = MPI.COMM_WORLD.Get_rank()

    workerseed = seed * 50 + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)

    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return mlp_policy_mirror_novelty.MlpPolicyMirrorNovelty(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=3,
            mirror_loss=True,
            observation_permutation=[
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 11, 12, 13,
                14, 23, 24, 25, 26, 19, 20, 21, 22
            ],
            action_permutation=[4, 5, 6, 7, 0, 1, 2, 3])

    env.env.novel_autoencoders = autoencoders

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), str(rank)))

    env.seed(seed + rank)
    if len(autoencoders) == 0:
        print("NO AUTOENCODER!!")
        model = pposgd_novelty.learn(
            env,
            policy_fn,
            max_timesteps=num_timesteps,
            # max_iters=30,
            timesteps_per_actorbatch=timesteps_per_actor,
            clip_param=0.2,
            entcoeff=0,
            optim_epochs=3,
            optim_stepsize=1e-3,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
            callback=callback,
            # Following params are kwargs
            session=sess,
            gap=5,
            sym_loss_weight=1,
        )
    else:
        print("AUTOENCODER LENGTH: ", len(autoencoders))

        model = pposgd_novelty_projection.learn(
            env,
            policy_fn,
            max_timesteps=num_timesteps,
            # max_iters=30,
            timesteps_per_actorbatch=timesteps_per_actor,
            clip_param=0.2,
            entcoeff=0,
            optim_epochs=3,
            optim_stepsize=1e-3,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
            callback=callback,
            # Following params are kwargs
            session=sess,
            gap=5,
            sym_loss_weight=1,
        )
    env.close()
    return model
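
A hedged usage sketch for the `train` above, assuming TF1-style session handling, that the module-level `callback` referenced in the snippet is defined elsewhere in the file, and a placeholder env id whose observation/action dimensions match the mirror permutations hard-coded in `policy_fn`.

import tensorflow as tf

with tf.Session() as sess:
    # 'CustomWalker3d-v1' is a placeholder env id, not a real registered env.
    # An empty autoencoder list exercises the plain pposgd_novelty branch.
    model = train(sess, env_id='CustomWalker3d-v1', num_timesteps=int(1e6),
                  timesteps_per_actor=2048, autoencoders=[], seed=0)
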
Example #48
def main():
    args = setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    seed = int(time.time()) % 10000
    utils.mpi_print(seed * 100 + rank)
    set_global_seeds(seed * 100 + rank)

    # Use wandb to visualize result curves
    config = Config.get_args_dict()
    config['global_seed'] = seed
    wandb.init(name=config["run_id"],
               project="coinrun",
               notes=" GARL generate seed",
               tags=["try"],
               config=config)

    utils.setup_mpi_gpus()
    utils.mpi_print('Set up gpu', args)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    eval_limit = Config.EVAL_STEP * 10**6
    phase_eval_limit = int(eval_limit // Config.TRAIN_ITER)
    total_timesteps = int(Config.TOTAL_STEP * 10**6)
    phase_timesteps = int((total_timesteps - eval_limit) // Config.TRAIN_ITER)

    with tf.Session(config=config):
        sess = tf.get_default_session()

        # init env
        nenv = Config.NUM_ENVS
        env = make_general_env(nenv, rand_seed=seed)
        utils.mpi_print('Set up env')

        policy = policies_back.get_policy()
        utils.mpi_print('Set up policy')

        optimizer = SeedOptimizer(env=env,
                                  logdir=Config.LOGDIR,
                                  spare_size=Config.SPA_LEVELS,
                                  ini_size=Config.INI_LEVELS,
                                  eval_limit=phase_eval_limit,
                                  train_set_limit=Config.NUM_LEVELS,
                                  load_seed=Config.LOAD_SEED,
                                  rand_seed=seed,
                                  rep=1,
                                  log=True)

        step_elapsed = 0
        t = 0

        if args.restore_id is not None:
            datapoints = Config.get_load_data('default')['datapoints']
            step_elapsed = datapoints[-1][0]
            optimizer.load()
            seed = optimizer.hist[-1]
            env.set_seed(seed)
            t = 16
            print('loadrestore')
            Config.RESTORE_ID = Config.get_load_data(
                'default')['args']['run_id']
            Config.RUN_ID = Config.get_load_data(
                'default')['args']['run_id'].replace('-', '_')

        while (step_elapsed < (Config.TOTAL_STEP - 1) * 10**6):
            # ============ GARL =================
            # optimize policy
            mean_rewards, datapoints = learn_func(
                sess=sess,
                policy=policy,
                env=env,
                log_interval=args.log_interval,
                save_interval=args.save_interval,
                nsteps=Config.NUM_STEPS,
                nminibatches=Config.NUM_MINIBATCHES,
                lam=Config.GAE_LAMBDA,
                gamma=Config.GAMMA,
                noptepochs=Config.PPO_EPOCHS,
                ent_coef=Config.ENTROPY_COEFF,
                vf_coef=Config.VF_COEFF,
                max_grad_norm=Config.MAX_GRAD_NORM,
                lr=lambda f: f * Config.LEARNING_RATE,
                cliprange=lambda f: f * Config.CLIP_RANGE,
                start_timesteps=step_elapsed,
                total_timesteps=phase_timesteps,
                index=t)

            # test catastrophic forgetting
            if 'Forget' in Config.RUN_ID:
                last_set = list(env.get_seed_set())
                if t > 0:
                    curr_set = list(env.get_seed_set())
                    last_scores, _ = eval_test(sess,
                                               nenv,
                                               last_set,
                                               train=True,
                                               idx=None,
                                               rep_count=len(last_set))
                    curr_scores, _ = eval_test(sess,
                                               nenv,
                                               curr_set,
                                               train=True,
                                               idx=None,
                                               rep_count=len(curr_set))
                    tmp = set(curr_set).difference(set(last_set))
                    mpi_print("Forgetting Exp")
                    mpi_print("Last setsize", len(last_set))
                    mpi_print("Last scores", np.mean(last_scores),
                              "Curr scores", np.mean(curr_scores))
                    mpi_print("Replace count", len(tmp))

            # optimize env
            step_elapsed = datapoints[-1][0]
            if t < Config.TRAIN_ITER:
                best_rew_mean = max(mean_rewards)
                env, step_elapsed = optimizer.run(sess, env, step_elapsed,
                                                  best_rew_mean)
            t += 1

        save_final_test = True
        if save_final_test:
            final_test = {}
            final_test['step_elapsed'] = step_elapsed
            train_set = env.get_seed()
            final_test['train_set_size'] = len(train_set)
            eval_log = eval_test(sess,
                                 nenv,
                                 train_set,
                                 train=True,
                                 is_high=False,
                                 rep_count=1000,
                                 log=True)
            final_test['Train_set'] = eval_log

            eval_log = eval_test(sess,
                                 nenv,
                                 None,
                                 train=False,
                                 is_high=True,
                                 rep_count=1000,
                                 log=True)
            final_test['Test_set'] = eval_log
            joblib.dump(final_test, setup_utils.file_to_path('final_test'))

    env.close()
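
For concreteness, a small worked example of the step-budget split computed in `main` above, using assumed (not actual) config values.

# assumed illustrative values: eval budget and total budget in millions of steps,
# split evenly across TRAIN_ITER phases
EVAL_STEP, TOTAL_STEP, TRAIN_ITER = 2, 32, 8

eval_limit = EVAL_STEP * 10**6                                       # 2,000,000 eval steps total
phase_eval_limit = int(eval_limit // TRAIN_ITER)                     # 250,000 eval steps per phase
total_timesteps = int(TOTAL_STEP * 10**6)                            # 32,000,000 steps overall
phase_timesteps = int((total_timesteps - eval_limit) // TRAIN_ITER)  # 3,750,000 train steps per phase
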
Example #49
def learn(network, env,
          seed=None,
          total_timesteps=None,
          nb_epochs=None, # with default settings, perform 1M steps total
          nb_epoch_cycles=20,
          nb_rollout_steps=100,
          reward_scale=1.0,
          render=False,
          render_eval=False,
          noise_type='adaptive-param_0.2',
          normalize_returns=False,
          normalize_observations=True,
          actor_l2_reg=0.0,
          critic_l2_reg=1e-2,
          actor_lr=1e-4,
          critic_lr=1e-3,
          popart=False,
          gamma=0.99,
          clip_norm=None,
          nb_train_steps=50, # per epoch cycle and MPI worker,
          nb_eval_steps=1000,
          batch_size=64, # per MPI worker
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          rb_size=1e6,
          save_interval=5,  # reduce memory footprint
          bc_epochs=0,
          load_path=None,
          demos_path=None,
          bc_teacher_lambda=0.0,
          use_qfilter=False,
          **network_kwargs):
    """Learns policy using imitation (maybe DAgger) w/vectorized environments.

    Any keyword arguments not listed here are passed through as network_kwargs.

    Parameters
    ----------
    noise_type: noise added to the behavior policy. This does NOT use the noise
        type from the paper but 'AdaptiveParamNoiseSpec'; using the OU process
        should give action noise rather than parameter noise. Pass it in the
        `<name>_<stddev>` convention, since the code splits the argument at the
        underscores.

    actor_lr: 1e-4  (matches paper)

    critic_lr: 1e-3  (matches paper)

    critic_l2: 1e-2  (matches paper)

    gamma: 0.99  (matches paper)

    batch_size: 64  (matches paper for lower-dim env obs/states)

    tau: 0.01 for soft target updates of actor and critic nets. Paper used 0.001.

    nb_epoch_cycles: number of times we go through this cycle of: (1) get
        rollouts with noise added to policy and apply to replay buffer, (2)
        gradient updates for actor/critic, (3) evaluation rollouts (if any).
        AFTER all of these cycles happen, THEN we log statistics.

    nb_rollout_steps: number of steps taken in each parallel env with the
        exploration policy, without training; this just populates the replay
        buffer. More parallel envs *should* mean more samples in the buffer
        between gradient updates of the network, so this might need to be
        environment *and* machine (# of CPUs) specific.

    nb_train_steps: after doing `nb_rollout_steps` in each parallel env, we do
        this many updates; each involves sampling from the replay buffer and
        updating the actor and critic (via lagged target updates).

    nb_eval_steps: 1000, changed from the default of 100. Using 1000 ensures
        that fixed-length envs like Ant-v2 can complete one full episode
        (assuming one parallel env) during evaluation stages.

    eval_env: A separate environment for evaluation only, where no noise is
        applied, similar to how rlkit does it.

    save_interval: interval (in cycles) between model checkpoint saves.
    """
    set_global_seeds(seed)

    # Daniel: NOTE/TODO testing, if I can.
    USE_KERAS = False
    # Daniel: should be False unless I'm doing some testing.
    do_valid_tests = False

    # Daniel: this helps to maintain compatibility with PPO2 code. For now
    # we're ignoring it, but we should check that we're always clipping. I
    # changed the nb_epochs to match with PPO2 in that we divide by nenvs.
    if 'limit_act_range' in network_kwargs:
        network_kwargs.pop('limit_act_range')
    nenvs = env.num_envs

    nbatchsize = nenvs * nb_epoch_cycles * nb_rollout_steps
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // nbatchsize
    else:
        nb_epochs = 500

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    # we assume symmetric actions.
    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high).all()

    # Form XP (1M steps, same as in paper), and ONLY ACTOR net, no critic.
    # Daniel: force dtype here so we can use uint8 type images.
    assert env.observation_space.low.dtype == env.observation_space.high.dtype
    # Also changing to (100,100) unless we do keras/pretraining, to force smaller images.
    if USE_KERAS:
        obs_shape = env.observation_space.shape
    else:
        obs_shape = (100,100,4)

    memory = Memory(limit=int(rb_size),
                    action_shape=env.action_space.shape,
                    observation_shape=obs_shape,
                    dtype=env.observation_space.low.dtype,
                    do_valid=do_valid_tests)
    actor = Actor(nb_actions, network=network, use_keras=USE_KERAS, **network_kwargs)

    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    # The `learn` defaults above have priority over defaults in IMIT class.
    agent = IMIT(actor=actor,
                 memory=memory,
                 observation_shape=obs_shape,
                 action_shape=env.action_space.shape,
                 batch_size=batch_size,
                 actor_l2_reg=actor_l2_reg,
                 actor_lr=actor_lr,
                 use_keras=USE_KERAS)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Prepare everything.
    sess = U.get_session()
    agent.initialize(sess)
    # --------------------------------------------------------------------------
    # Daniel: similar as PPO2 code as `agent` is similar to `model` but has to
    # be initialized explicitly above. Must call after `agent.load` gets
    # created.  Not sure if this works with parameter space noise or with
    # normalization, but I don't plan to resume training (for now). It also has
    # to be *before* the `graph.finalize()` because otherwise we get an error.
    # --------------------------------------------------------------------------
    if load_path is not None:
        logger.info("\nInside IMIT, loading model from: {}".format(load_path))
        agent.load(load_path)
    # --------------------------------------------------------------------------
    sess.graph.finalize()

    # --------------------------------------------------------------------------
    # Daniel: populate replay buffer, followed by behavior cloning stage.
    # But if load_path is not None, then doesn't make sense -- we want to load.
    # We also don't need to do this if timesteps is 0 (e.g., for playing policy).
    # --------------------------------------------------------------------------
    # OK now we're assuming we do DAgger by default. If we don't want to do
    # DAgger (i.e., pure BC) then let's set time_steps=1 for training.
    # --------------------------------------------------------------------------
    if total_timesteps == 0:
        return agent
    assert seed == 1500, 'We normally want seed 1500, yet: {}'.format(seed)

    if (demos_path is not None and load_path is None):
        _ddpg_demos(demos_path, agent, memory, algo='IMIT')
        assert memory.nb_entries == memory.nb_teach_entries, memory.nb_entries
        if do_valid_tests:
            memory.set_valid_idx()
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        statsdir = osp.join(logger.get_dir(), 'pretrain_stats')
        os.makedirs(checkdir, exist_ok=True)
        os.makedirs(statsdir, exist_ok=True)

        # Pretrain, based on their training code for some # of minibatches.
        pt_actor_losses = []
        pt_actor_losses_l2 = []
        batches_per_ep = int(memory.nb_entries / batch_size)
        logger.info('Running BC for {} epochs'.format(bc_epochs))
        logger.info('  data size in memory: {}'.format(memory.nb_entries))
        logger.info('  each batch: {}, epoch mbs: {}'.format(batch_size, batches_per_ep))
        if do_valid_tests:
            logger.info('  memory valid idx: {}'.format(memory.nb_valid_items))
        pt_start = time.time()

        for epoch in range(1,bc_epochs+1):
            losses = [] # includes L2 fyi
            losses_l2 = []
            for _ in range(batches_per_ep):
                al, al_l2 = agent.train(return_l2=True)
                losses.append(al)
                losses_l2.append(al_l2)
            pt_actor_losses.append( np.mean(losses) )
            pt_actor_losses_l2.append( np.mean(losses_l2) )

            # Check and save model occasionally.
            if epoch == 1 or epoch % 10 == 0:
                pt_time = (time.time() - pt_start) / 60.
                logger.info('  epoch done: {}, loss over past epoch: {:.4f} (l2: {:.4f})'.format(
                        str(epoch).zfill(4), pt_actor_losses[-1], pt_actor_losses_l2[-1]))
                logger.info('  elapsed time: {:.1f}m'.format(pt_time))
                savepath = osp.join(checkdir, 'bc_epoch_{}'.format(str(epoch).zfill(4)))
                logger.info('Saving model checkpoint to: ', savepath)
                agent.save(savepath)

            # Do validation here.
            if do_valid_tests:
                num_mbs = int(memory.nb_valid_items / batch_size)
                l2_errors = []
                for mb in range(num_mbs):
                    res = memory.get_valid_obs(mb*batch_size, (mb+1)*batch_size)
                    valid_obs = res['obs0']
                    valid_act = res['actions']
                    assert valid_obs.shape == (batch_size,100,100,3), valid_obs.shape
                    acts, _, _, _ = agent.step(obs=valid_obs, apply_noise=False)
                    l2_err_vector = np.mean((valid_act - acts)**2, axis=1)
                    l2_errors.extend(l2_err_vector)

                # Last minibatch
                res = memory.get_valid_obs((mb+1)*batch_size, memory.nb_valid_items)
                valid_obs = res['obs0']
                valid_act = res['actions']
                acts, _, _, _ = agent.step(obs=valid_obs, apply_noise=False)
                l2_err_vector = np.mean((valid_act - acts)**2, axis=1)
                l2_errors.extend(l2_err_vector)

                l2_err_valid = np.mean(l2_errors)
                logger.log('  VALIDATION L2 error: {:.4f}'.format(l2_err_valid))

        pt_time = (time.time() - pt_start) / 60.
        logger.info('losses a: {}'.format(np.array(pt_actor_losses)))
        logger.info('losses a (l2 norm of weights): {}'.format(np.array(pt_actor_losses_l2)))
        losses_pth = osp.join(statsdir, 'bc_losses.pkl')
        losses_l2_pth = osp.join(statsdir, 'bc_losses_l2_only.pkl')
        with open(losses_pth, 'wb') as fh:
            pickle.dump(pt_actor_losses, fh)      # save the losses, not the file path
        with open(losses_l2_pth, 'wb') as fh:
            pickle.dump(pt_actor_losses_l2, fh)
        logger.info('Finished BC (no DAgger) in {:.1f}m.\n'.format(pt_time))
    # --------------------------------------------------------------------------

    # Back to their code. For cloth, `env.reset()` takes a while so we put it here.
    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    # Daniel: this is how to get the cloth state for the teacher; depends on num_env.
    logger.info('nenvs={}, getting cloth points + teacher policy...'.format(nenvs))
    if nenvs == 1:
        # DummyVecEnv has an `envs` "list". Then an extra `.env` to get ClothEnv.
        cloth_env = (env.envs[0]).env
        pts = (cloth_env).cloth.pts
        logger.info('singleton env type: {}'.format(cloth_env))
        logger.info('len(points): {}'.format(len(pts)))
        teacher = OracleCornerPolicy()
        teacher.set_env_data_dummy(cloth_env)
        logger.info('teacher attributes: {}'.format(teacher.get_info()))
        teacher_list = [ teacher ]
    else:
        # SubprocVecEnv, not sure if we can obtain envs or if it's safe, so I did this.
        env_attr = env.get_cloth_attributes()
        logger.info('env attributes: {}'.format(env_attr))
        teacher_list = []
        assert len(env_attr) == nenvs, len(env_attr)
        for env_a in env_attr:
            teacher = OracleCornerPolicy()
            teacher.set_env_data_subproc(env_a[0], env_a[1], env_a[2])
            teacher_list.append(teacher)

    # Daniel: Debugging/sanity checks.
    _variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    U.display_var_info(_variables)
    logger.info("\nInside IMIT, about to start epochs")
    logger.info("nbatchsize: {}, get this in buffer before IMIT updates".format(nbatchsize))
    logger.info("  i.e.: (nenv {}) * (cycles {}) * (nsteps {})".format(
            nenvs, nb_epoch_cycles, nb_rollout_steps))
    logger.info("nb_epochs: {}, number of cycles to use".format(nb_epochs))
    logger.info("eval_env None? {}".format(eval_env is None))
    logger.info("(end of debugging messages)\n")

    # File paths.
    checkdir    = osp.join(logger.get_dir(), 'checkpoints')
    action_dir  = osp.join(logger.get_dir(), 'actions')
    episode_dir = osp.join(logger.get_dir(), 'ep_all_infos')
    os.makedirs(checkdir, exist_ok=True)
    os.makedirs(action_dir, exist_ok=True)
    os.makedirs(episode_dir, exist_ok=True)

    # Daniel: use these two to store past 100 episode history. Report these stats!
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    all_eval_episode_rewards = []

    # reward/step: cumulative quantities for each episode in vecenv.
    # epoch_{actions,qs} will grow without bound, fyi.
    episode_reward = np.zeros(nenvs, dtype = np.float32) #vector
    episode_step = np.zeros(nenvs, dtype = int) # vector
    episodes = 0 #scalar
    t = 0 # scalar
    epoch = 0
    start_time = time.time()
    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_episodes = 0

    for epoch in range(nb_epochs):
        mb_actions = []
        mb_epinfos = []
        for cycle in range(nb_epoch_cycles):
            # Daniel: pure data collection (NO noise added) to populate replay buffer.
            # No training until after this, and note the parallel stepping (VecEnv).

            for t_rollout in range(nb_rollout_steps):
                # Predict next action: (#_parallel_envs, ac_dim).
                action, _, _, _ = agent.step(obs)
                if rank == 0 and render:
                    env.render()

                # Before environment stepping happens, we need to run the teacher
                # policy here, because it needs the SAME EXACT env state. The policy
                # does not apply on the obs but the INTERNAL env.cloth.pts.
                t_actions = []
                if nenvs > 1:
                    cloth_objs = env.get_cloth_objs()
                    for teacher, cloth in zip(teacher_list, cloth_objs):
                        t_act = teacher.get_action(cloth=cloth)
                        t_actions.append(t_act)
                else:
                    for teacher in teacher_list:
                        t_act = teacher.get_action()
                        t_actions.append(t_act)
                t_actions = np.array(t_actions)
                t_actions = np.clip(t_actions, -1.0, 1.0)
                logger.info('agent actions:\n{}'.format(action))
                logger.info('teacher actions:\n{}'.format(t_actions))
                logger.info('L2 diff: \n{:.4f}'.format(
                        np.mean( np.mean((action-t_actions)**2, axis=1) )
                ))

                # max_action is of dimension A, whereas action is dimension
                # (nenvs, A) - the multiplication gets broadcasted to the batch
                # scale for execution in env (as far as DDPG is concerned,
                # every action is in [-1, 1])
                new_obs, r, done, info = env.step(max_action * action)
                r = r.astype(np.float32)
                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1
                epoch_actions.append(action)

                # Daniel: Same as PPO2/DDPG, just checking for end of episodes.
                mb_actions.append(action)
                for inf in info:
                    maybeepinfo = inf.get('episode')
                    if maybeepinfo:
                        mb_epinfos.append(inf)

                # The batched data will be unrolled in memory.py's append. Daniel:
                # unlike DDPG, we only need obs/act, but act is FROM THE EXPERT.
                # Unfortunately there will be duplicate obs/act pairs if the student
                # actions don't touch the cloth, but I'm not sure how to avoid that.
                agent.store_transition(obs, t_actions)
                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])    # Entire history
                        episode_rewards_history.append(episode_reward[d])  # Last 100 only
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1

            # Train.
            epoch_actor_losses = []
            epoch_actor_losses_l2 = []
            for t_train in range(nb_train_steps):
                al, al_l2 = agent.train(return_l2=True)
                epoch_actor_losses.append(al)
                epoch_actor_losses_l2.append(al_l2)

        if MPI is not None:
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            mpi_size = 1

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        combined_stats = {}
        combined_stats['memory/nb_entries'] = memory.nb_entries
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_actor_l2'] = np.mean(epoch_actor_losses_l2)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s'%x)

        # Total statistics.
        combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()])
        if MPI is not None:
            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)
        combined_stats = {k : v / mpi_size for (k,v) in
                zip(combined_stats.keys(), combined_stats_sums)}
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps_per_env'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(osp.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)

        # Daniel: we can use cycle or epoch for this if condition ... kind of annoying but w/e.
        if cycle % save_interval == 0:
            logger.info('We are now saving stuff!!')
            savepath = osp.join(checkdir, '%.5i'%epoch)
            logger.info('Saving model checkpoint to: ', savepath)
            agent.save(savepath)
            # ------------------------------------------------------------------
            # Daniel: extra stuff for debugging.
            mb_actions = _sf01(np.asarray(mb_actions))
            act_savepath = osp.join(action_dir, 'actions_%.5i.pkl'%epoch)
            epi_savepath = osp.join(episode_dir, 'infos_%.5i.pkl'%epoch)
            with open(act_savepath, 'wb') as fh:
                pickle.dump(mb_actions, fh)
            with open(epi_savepath, 'wb') as fh:
                pickle.dump(mb_epinfos, fh)

        # Daniel: we were not resetting earlier. Actually there are other
        # epoch_stats which we might consider resetting here?
        epoch_episodes = 0

    return agent
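
A hedged usage sketch for the imitation `learn` above. `make_cloth_vec_env` is a hypothetical stand-in for however the project constructs its cloth VecEnv, and the demo path is a placeholder; note that once training actually starts the body asserts `seed == 1500`.

env = make_cloth_vec_env(num_env=1)            # hypothetical constructor
agent = learn(network='cnn', env=env, seed=1500,
              total_timesteps=int(1e5),
              bc_epochs=100,                    # behavior-cloning epochs before DAgger-style rollouts
              demos_path='/path/to/demos.pkl',  # expert demonstrations to pre-fill the replay buffer
              load_path=None)
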
Example #50

    def __init__(self, agent, network, nsteps, rho, max_kl, ent_coef,
                 vf_stepsize, vf_iters, cg_damping, cg_iters, seed, load_path,
                 **network_kwargs):
        super(AgentModel, self).__init__(name='MATRPOModel')
        self.agent = agent
        self.nsteps = nsteps
        self.rho = rho
        self.max_kl = max_kl
        self.ent_coef = ent_coef
        self.cg_damping = cg_damping
        self.cg_iters = cg_iters
        self.vf_stepsize = vf_stepsize
        self.vf_iters = vf_iters

        set_global_seeds(seed)

        np.set_printoptions(precision=3)

        if MPI is not None:
            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
        else:
            self.nworkers = 1
            self.rank = 0

        # Setup losses and stuff
        # ----------------------------------------
        ob_space = agent.observation_space
        ac_space = agent.action_space

        with tf.name_scope(agent.name):
            if isinstance(network, str):
                network = get_network_builder(network)(**network_kwargs)
            with tf.name_scope("pi"):
                pi_policy_network = network(ob_space.shape)
                pi_value_network = network(ob_space.shape)
                self.pi = pi = PolicyWithValue(ac_space, pi_policy_network,
                                               pi_value_network)
            with tf.name_scope("oldpi"):
                old_pi_policy_network = network(ob_space.shape)
                old_pi_value_network = network(ob_space.shape)
                self.oldpi = oldpi = PolicyWithValue(ac_space,
                                                     old_pi_policy_network,
                                                     old_pi_value_network)

        self.comm_matrix = agent.comm_matrix.copy()
        self.estimates = np.zeros([agent.nmates, nsteps], dtype=np.float32)
        self.multipliers = np.ones([self.agent.nmates,
                                    self.nsteps]).astype(np.float32)

        pi_var_list = pi_policy_network.trainable_variables + list(
            pi.pdtype.trainable_variables)
        old_pi_var_list = old_pi_policy_network.trainable_variables + list(
            oldpi.pdtype.trainable_variables)
        vf_var_list = pi_value_network.trainable_variables + pi.value_fc.trainable_variables
        old_vf_var_list = old_pi_value_network.trainable_variables + oldpi.value_fc.trainable_variables

        self.pi_var_list = pi_var_list
        self.old_pi_var_list = old_pi_var_list
        self.vf_var_list = vf_var_list
        self.old_vf_var_list = old_vf_var_list

        if load_path is not None:
            load_path = osp.expanduser(load_path)
            ckpt = tf.train.Checkpoint(model=pi)
            load_path = load_path + '/agent_{}'.format(self.agent.id)
            manager = tf.train.CheckpointManager(ckpt,
                                                 load_path,
                                                 max_to_keep=None)
            ckpt.restore(manager.latest_checkpoint)
            print(
                colorize('Agent{}\'s Model restored!'.format(self.agent.id),
                         color='magenta'))

        self.vfadam = MpiAdam(vf_var_list)

        self.get_flat = U.GetFlat(pi_var_list)
        self.set_from_flat = U.SetFromFlat(pi_var_list)
        self.loss_names = [
            "Lagrange", "surrgain", "sync", "meankl", "entloss", "entropy"
        ]
        self.shapes = [var.get_shape().as_list() for var in pi_var_list]
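
The snippet above is only a constructor; its enclosing class declaration was cut off. A hedged guess at the wrapper (the `name=` argument passed to `super().__init__` suggests a `tf.Module` subclass):

import tensorflow as tf

class AgentModel(tf.Module):  # assumed base class, inferred from the name= kwarg
    def __init__(self, agent, network, nsteps, rho, max_kl, ent_coef,
                 vf_stepsize, vf_iters, cg_damping, cg_iters, seed, load_path,
                 **network_kwargs):
        super(AgentModel, self).__init__(name='MATRPOModel')
        # ... body as in the snippet above ...
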
Example #51
def learn(network,
          env,
          seed,
          total_timesteps=int(40e6),
          gamma=0.99,
          log_interval=1,
          nprocs=32,
          nsteps=20,
          ent_coef=0.01,
          vf_coef=0.5,
          vf_fisher_coef=1.0,
          lr=0.25,
          max_grad_norm=0.5,
          kfac_clip=0.001,
          save_interval=None,
          lrschedule='linear',
          load_path=None,
          **network_kwargs):
    set_global_seeds(seed)

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy,
                               ob_space,
                               ac_space,
                               nenvs,
                               total_timesteps,
                               nprocs=nprocs,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef,
                               lr=lr,
                               max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip,
                               lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    enqueue_threads = model.q_runner.create_threads(model.sess,
                                                    coord=coord,
                                                    start=True)
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    return model
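
A hedged usage sketch for the ACKTR-style `learn` above, assuming the usual Baselines Atari VecEnv helpers are available.

from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack

# 8 parallel Atari envs with the standard 4-frame stack
env = VecFrameStack(make_atari_env('BreakoutNoFrameskip-v4', num_env=8, seed=0), 4)
model = learn(network='cnn', env=env, seed=0, total_timesteps=int(1e6),
              nprocs=8, save_interval=100)
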
Example #52
                    default='MountainCarContinuous-v0')
parser.add_argument('--num_timesteps',
                    dest='num_timesteps',
                    type=int,
                    default=10000)
parser.add_argument('--seed', help='RNG seed', type=int, default=0)
args = parser.parse_args()

sess = U.single_threaded_session()
sess.__enter__()

rank = MPI.COMM_WORLD.Get_rank()
if rank != 0:
    logger.set_level(logger.DISABLED)
workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
set_global_seeds(workerseed)

# create the environment
env = gym.make(str(args.environment))
# initial_observation = env.reset()


def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     hid_size=32,
                     num_hid_layers=2)


# env = bench.Monitor(env, logger.get_dir() and
Example #53
def launch(
    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
    override_params={}, save_policies=True
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
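
A hedged sketch of calling the `launch` above directly (in the original Baselines HER script this is normally driven by a click-based CLI); the environment and hyperparameters are illustrative.

launch(env_name='FetchReach-v1', logdir='/tmp/her_fetchreach', n_epochs=50,
       num_cpu=1, seed=0, replay_strategy='future',
       policy_save_interval=5, clip_return=1)
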
Example #54
def learn(network,
          env,
          seed=None,
          nsteps=5,
          total_timesteps=int(80e6),
          vf_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          lrschedule='linear',
          epsilon=1e-5,
          alpha=0.99,
          gamma=0.99,
          log_interval=100,
          load_path=None,
          **network_kwargs):
    '''
    Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm.

    Parameters:
    -----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies


    env:                RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)


    seed:               seed to make the random number sequence in the algorithm reproducible. By default it is None, which means the seed comes from the system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    vf_coef:            float, coefficient in front of value function loss in the total loss function (default: 0.5)

    ent_coef:           float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

    max_grad_norm:      float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                        returns fraction of the learning rate (specified as lr) as output

    epsilon:            float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

    alpha:              float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy,
                  env=env,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  alpha=alpha,
                  epsilon=epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps // nbatch + 1):

        env.render()

        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        nseconds = time.time() - tstart

        # Calculate the fps (frame per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns
            # (ev close to 1) or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
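
A hedged usage sketch for the modified A2C `learn` above, assuming a Gym env wrapped with bench.Monitor and DummyVecEnv; note this variant calls `env.render()` every update, so the env must support rendering.

import gym
from baselines import bench
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

env = DummyVecEnv([lambda: bench.Monitor(gym.make('CartPole-v1'), None)])
model = learn(network='mlp', env=env, seed=0, total_timesteps=int(1e5))
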
Example #55
def learn(policy,
          env,
          seed,
          nsteps=5,
          nstack=1,
          total_timesteps=int(80e6),
          vf_coef=0.9,
          ent_coef=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          lrschedule='linear',
          epsilon=1e-5,
          alpha=0.99,
          gamma=0.99,
          log_interval=20,
          continuous_actions=True,
          debug=False,
          numAgents=2,
          continueTraining=False,
          particleEnv=False,
          model_name='May15_test_model_',
          communication=False):
    timesteps = 100000
    print('nsteps:', nsteps)
    # time.sleep(1)
    tf.reset_default_graph()
    # if particleEnv == False:
    set_global_seeds(seed)
    avg_rwd_data = []
    episode_rwd_data = []
    timesteps_data = []
    policy_loss_data = [[], []]
    value_loss_data = [[], []]
    exp_var_data = [[], []]
    success_data = []
    nenvs = env.num_envs
    print('Number of Environments: ', nenvs)
    print('Number of Steps', nsteps)
    nbatch = nenvs * nsteps
    print('Batch Size: ', nbatch)
    print('Learning Rate: ', lr)
    print('debug: ', debug)
    print('---------------------------------------------')
    ob_space = env.observation_space
    ac_space = env.action_space
    # print(ac_space)
    # print('action space: ', ac_space)
    num_procs = len(env.remotes)  # HACK
    if numAgents == 1:
        model = Model(policy=policy,
                      ob_space=ob_space,
                      ac_space=ac_space,
                      nenvs=nenvs,
                      nsteps=nsteps,
                      nstack=nstack,
                      num_procs=num_procs,
                      ent_coef=ent_coef,
                      vf_coef=vf_coef,
                      max_grad_norm=max_grad_norm,
                      lr=lr,
                      alpha=alpha,
                      epsilon=epsilon,
                      total_timesteps=total_timesteps,
                      lrschedule=lrschedule,
                      continuous_actions=continuous_actions,
                      debug=debug)
    else:
        model = []
        for i in range(numAgents):
            if particleEnv == True:
                model.append(
                    Model(policy=policy,
                          ob_space=ob_space,
                          ac_space=ac_space,
                          nenvs=nenvs,
                          nsteps=nsteps,
                          nstack=nstack,
                          num_procs=num_procs,
                          ent_coef=ent_coef,
                          vf_coef=vf_coef,
                          max_grad_norm=max_grad_norm,
                          lr=lr,
                          alpha=alpha,
                          epsilon=epsilon,
                          total_timesteps=total_timesteps,
                          lrschedule=lrschedule,
                          continuous_actions=continuous_actions,
                          debug=debug,
                          itr=i,
                          particleEnv=particleEnv,
                          communication=communication))
            else:
                model.append(
                    Model(policy=policy,
                          ob_space=ob_space,
                          ac_space=ac_space,
                          nenvs=nenvs,
                          nsteps=nsteps,
                          nstack=nstack,
                          num_procs=num_procs,
                          ent_coef=ent_coef,
                          vf_coef=vf_coef,
                          max_grad_norm=max_grad_norm,
                          lr=lr,
                          alpha=alpha,
                          epsilon=epsilon,
                          total_timesteps=total_timesteps,
                          lrschedule=lrschedule,
                          continuous_actions=continuous_actions,
                          debug=debug,
                          itr=i,
                          particleEnv=particleEnv))
    # print('learn models')
    # print(model)
    # print(model[0])
    if continueTraining == True:
        for i in range(numAgents):
            m_name = model_name + str(i) + '.pkl'  # +'_600k.pkl'
            model[i].load(m_name)
            print('---------------------------------------------')
            print('Successfully Loaded ', m_name)
            print('---------------------------------------------')

    if numAgents == 1 or particleEnv == True:
        # print('Model: ', model)
        runner = Runner(env,
                        model,
                        nsteps=nsteps,
                        nstack=nstack,
                        gamma=gamma,
                        particleEnv=particleEnv)
    else:
        runner = []
        for i in range(numAgents):
            if i == 0:
                # print('env: ', env)
                runner.append(
                    Runner(env,
                           model[i],
                           nsteps=nsteps,
                           nstack=nstack,
                           gamma=gamma,
                           ind=i,
                           particleEnv=particleEnv))
                # print('runner model values')
                # print(model[i].value)
            else:
                runner.append(
                    Runner(env,
                           model[i],
                           nsteps=nsteps,
                           nstack=nstack,
                           gamma=gamma,
                           ind=i,
                           init_obs=runner[0].init_obs,
                           particleEnv=particleEnv))

    tstart = time.time()
    stored_rewards = [[], []]
    success = []
    percent_exp = []
    percent_exp_data = []
    for update in range(1, total_timesteps // nbatch + 1):
        if numAgents == 1:
            obs, states, rewards, masks, actions, values = runner.run()
        elif particleEnv == True:  # rework to pre-sort data by agent
            obs = [[], []]
            states = [[], []]
            rewards = [[], []]
            masks = [[], []]
            actions = [[], []]
            values = [[], []]
            policy_loss = []
            value_loss = []
            policy_entropy = []
            ev = []
            for i in range(nsteps):
                obs_, states_, rewards_, masks_, actions_, values_, success_ = runner.run()
                # print('Runner Shapes')
                # print(obs_.shape)
                # print(rewards_.shape)
                # print('states: ', states_)
                # print('masks: ', masks_)
                # print('actions_ ', actions_)
                # print('values: ', values_[:, 0])
                # assert 1==0
                # print('masks: ', masks_)
                # print(rewards_)
                if success_:
                    success.append(success_)
                for j in range(numAgents):
                    if runner.env.name == 'simple_reference':
                        obs[j].append(obs_[:, :, j])
                        states[j].append(states[0])
                        rewards[j].append(rewards_[:, :, j])
                        actions[j].append(actions_[:, j])
                        values[j].append(values_[j])
                        masks[j].append(masks_[j])
                    else:
                        # print('values: ', values_[j, :])
                        obs[j].append(obs_[:, :, j])
                        states[j].append(states[0])
                        rewards[j].append(rewards_[:, :, j])
                        actions[j].append(actions_[j, :])
                        values[j].append(values_[j, :])
                        masks[j].append(masks_[j, :])
                    # print('reward: ', rewards_[:, :, j])
                    stored_rewards[j].append(rewards_[:, :, j])
                    # print('rewards: ', rewards_[:, :, j])

        else:
            obs = [[], []]
            states = [[], []]
            rewards = [[], []]
            masks = [[], []]
            actions = [[], []]
            values = [[], []]
            policy_loss = []
            value_loss = []
            policy_entropy = []
            ev = []
            # print('nsteps: ', runner[0].nsteps)
            for j in range(nsteps):
                # Need to rewrite so that agents take turns stepping
                for i in range(numAgents):
                    # obs[i], states[i], rewards[i], masks[i], actions[i], values[i] = runner[i].run()
                    obs_, states_, rewards_, masks_, actions_, values_ = runner[i].run()
                    obs[i].append(obs_)  # (obs_.shape = (6, 3, 3, 84))
                    states[i].append(states_)
                    rewards[i].append(rewards_)
                    masks[i].append(masks_)
                    actions[i].append(actions_)
                    values[i].append(values_)
                    if runner[i].dones[0] == True:
                        for k in range(nenvs):
                            percent_exp.append(
                                runner[i].env.envs[k].env.percent_explored[i])
                    stored_rewards[i].append(rewards_)
            # print(np.asarray(values).shape)
        if numAgents == 1:
            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values)
        elif particleEnv == False:
            for i in range(numAgents):
                # print(masks[i])
                np.asarray(values[i]).reshape(nbatch)
                np.asarray(rewards[i]).reshape(nbatch)
                np.asarray(masks[i]).reshape(nbatch)
                policy_loss_, value_loss_, policy_entropy_ = model[i].train(
                    np.asarray(obs[i]).reshape(nbatch, 84, 84, 3), states[i],
                    np.asarray(rewards[i]).reshape(nbatch),
                    np.asarray(masks[i]).reshape(nbatch),
                    np.asarray(actions[i]).reshape(nbatch),
                    np.asarray(values[i]).reshape(nbatch))
                policy_loss.append(policy_loss_)
                value_loss.append(value_loss_)
                policy_entropy.append(policy_entropy_)
                ev.append(
                    explained_variance(
                        np.asarray(values[i]).reshape(nbatch),
                        np.asarray(rewards[i]).reshape(nbatch)))
        else:
            if runner.env.name == 'simple_reference':
                actions_per_agent = 2
                actions = np.asarray(actions).swapaxes(0, 2)
            for i in range(numAgents):
                if runner.env.name == 'simple_reference':
                    # print(np.asarray(actions).shape)
                    actions_i = np.asarray(actions[i])
                    actions_i = actions_i.swapaxes(0, 1).reshape(
                        actions_per_agent, nbatch)
                    action_n = [actions_i[1], actions_i[0]]
                    policy_loss_, value_loss_, policy_entropy_ = model[
                        i].train(
                            np.asarray(obs[i]).reshape(nbatch, 21), states[i],
                            np.asarray(rewards[i]).reshape(nbatch),
                            np.asarray(masks[i]).reshape(nbatch), action_n,
                            np.asarray(values[i]).reshape(nbatch))
                elif runner.env.name == 'simple_speaker_listener' or runner.env.name == 'simple_push':
                    actions_ = np.asarray(actions)
                    actions_i = actions_[i, :, :]
                    # print('obs shape: ', np.asarray(obs).shape)
                    obs_ = np.asarray(obs).swapaxes(1, 2)
                    obs_i = obs_[i, 0].flatten()
                    obs_n = []
                    for n in range(obs_i.shape[0]):
                        for m in range(obs_i[0].shape[0]):
                            obs_n.append(obs_i[n][m])
                    policy_loss_, value_loss_, policy_entropy_ = model[
                        i].train(
                            np.asarray(obs_n).reshape(
                                nbatch, runner.batch_obs_shape[i][0]),
                            states[i],
                            np.asarray(rewards[i]).reshape(nbatch),
                            np.asarray(masks[i]).reshape(nbatch),
                            np.asarray(actions_i).reshape(nbatch),
                            np.asarray(values[i]).reshape(nbatch))
                else:
                    actions_ = np.asarray(actions)
                    actions_i = actions_[i, :, :]
                    policy_loss_, value_loss_, policy_entropy_ = model[
                        i].train(
                            np.asarray(obs[i]).reshape(
                                nbatch, runner.batch_obs_shape[i][0]),
                            states[i],
                            np.asarray(rewards[i]).reshape(nbatch),
                            np.asarray(masks[i]).reshape(nbatch),
                            np.asarray(actions_i).reshape(nbatch),
                            np.asarray(values[i]).reshape(nbatch))
                policy_loss.append(policy_loss_)
                value_loss.append(value_loss_)
                policy_entropy.append(policy_entropy_)
                ev.append(
                    explained_variance(
                        np.asarray(values[i]).reshape(nbatch),
                        np.asarray(rewards[i]).reshape(nbatch)))

        # model.step_model.summarize_weights()

        nseconds = time.time() - tstart
        fps = float((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # print(success)
            # print(np.sum(np.asarray(success).flatten()))
            # print(np.size(np.asarray(success).flatten()))
            success_rate = np.mean(success)
            success_data.append(success_rate)
            percent_exp_data.append(np.mean(percent_exp))
            sio.savemat('percent_exp.mat',
                        {'percent_exp': np.asarray(percent_exp_data)})
            sio.savemat('success.mat',
                        {'success_rate': np.asarray(success_data)})
            for i in range(numAgents):
                logger.record_tabular("*Model Number*", i)
                logger.record_tabular("nupdates", update)
                logger.record_tabular("total_timesteps", update * nbatch)
                logger.record_tabular("fps", float(fps))
                rewards_i = np.asarray(stored_rewards[i])
                # print(rewards_i)
                avg_rwd = np.mean(rewards_i.flatten())
                # print('rwd shape: ', rewards_i.shape)
                episode_reward = []
                for j in range(nenvs):
                    if particleEnv == False:
                        episode_reward.append(np.mean(rewards_i[:, j]))
                    else:
                        episode_reward.append(np.mean(rewards_i[:, :, j]))
                print('min_eps_rwd: ', np.min(episode_reward))
                print('max_eps_rwd: ', np.max(episode_reward))
                # print('all rewards: ', rewards_i)
                logger.record_tabular("average agent reward", avg_rwd)
                avg_rwd_data.append(avg_rwd)
                episode_rwd_data.append(episode_reward)
                timesteps_data.append(update * nbatch)
                value_loss_data[i].append(float(value_loss[i]))
                exp_var_data[i].append(float(ev[i]))
                # percent_exp_data.append(np.mean(percent_exp))
                sio.savemat('reward_data.mat',
                            {'avg_rwd': np.asarray(avg_rwd_data)})
                sio.savemat('episode_reward_data.mat',
                            {'episode_avg_rwd': np.asarray(episode_rwd_data)})
                sio.savemat('timesteps.mat',
                            {'timesteps': np.asarray(timesteps_data)})
                # logger.record_tabular("average episode reward", np.mean(episode_reward))
                if particleEnv == False:
                    policy_loss_data[i].append(float(policy_loss[i]))
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy[i]))
                    logger.record_tabular("value_loss", float(value_loss[i]))
                    logger.record_tabular("policy_loss", float(policy_loss[i]))
                    logger.record_tabular("explained_variance", ev[i])
                    # logger.record_tabular("success rate", success_rate)
                    logger.record_tabular("percent_explored",
                                          float(np.mean(percent_exp)))
                    logger.dump_tabular()
                elif runner.env.name != 'simple_reference':
                    policy_loss_data[i].append(float(policy_loss[i]))
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy[i]))
                    logger.record_tabular("value_loss", float(value_loss[i]))
                    logger.record_tabular("policy_loss", float(policy_loss[i]))
                    logger.record_tabular("explained_variance", ev[i])
                    logger.record_tabular("success rate", success_rate)
                    # logger.record_tabular("percent_explored", float(np.mean(percent_exp))
                    logger.dump_tabular()
                else:
                    policy_loss_data[i].append(
                        [float(policy_loss[i][0]),
                         float(policy_loss[i][1])])
                    logger.record_tabular("comm_policy_entropy",
                                          float(policy_entropy[i][0]))
                    logger.record_tabular("force_policy_entropy",
                                          float(policy_entropy[i][1]))
                    logger.record_tabular("value_loss", float(value_loss[i]))
                    logger.record_tabular("comm_policy_loss",
                                          float(policy_loss[i][0]))
                    logger.record_tabular("force_policy_loss",
                                          float(policy_loss[i][1]))
                    logger.record_tabular("explained_variance", ev[i])
                    logger.record_tabular("success rate", success_rate)
                    logger.dump_tabular()
                sio.savemat('policy_loss.mat',
                            {'policy_loss': np.asarray(policy_loss_data)})
                sio.savemat('value_loss.mat',
                            {'value_loss': np.asarray(value_loss_data)})
                sio.savemat('exp_var.mat',
                            {'exp_var': np.asarray(exp_var_data)})
                m_name = model_name + str(i) + '.pkl'
                model[i].save(m_name)
                # print('Saving model as ', m_name)
            stored_rewards = [[], []]
            success = []
            percent_exp = []

        if particleEnv == False:
            if update * nbatch > timesteps:
                print('Saving ')
                for i in range(numAgents):
                    m_name = model_name + str(i) + '_' + str(
                        (timesteps + 600000.0) / 1000.0) + 'k.pkl'
                    model[i].save(m_name)
                timesteps += 100000
        else:
            update_interval = 10000
            if update % update_interval == 0:
                print('Saving for update ', str(update))
                for i in range(numAgents):
                    m_name = model_name + str(i) + '_' + str(
                        update / 1000.0) + 'k.pkl'
                    model[i].save(m_name)
    env.close()
Beispiel #56
0
def learn(*,
          network,
          env,
          total_timesteps,
          dtarg=0.01,
          adaptive_kl=0,
          trunc_rho=1.0,
          useadv=0,
          vtrace=0,
          rgae=0,
          eval_env=None,
          seed=None,
          ERlen=1,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=None,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          **network_kwargs):
    '''
    Learn a policy using the PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''
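    # Arguments beyond the standard PPO signature (a hedged reading based on how
    # they are used further down in this function):
    #   dtarg, adaptive_kl : target KL and switch for adaptively rescaling the KL
    #                        penalty coefficient (klconst)
    #   trunc_rho          : truncation level for the importance ratio in the
    #                        v-trace style targets
    #   useadv             : if set, train on the one-step v-trace advantage
    #                        instead of the truncated GAE
    #   vtrace             : 0 = plain GAE, 1 or 4 = the two v-trace style
    #                        estimators defined below
    #   rgae               : forwarded to model.train (presumably toggles a
    #                        ratio-weighted GAE term inside the model)
    #   ERlen              : number of past rollouts kept in the experience
    #                        replay window (seg is truncated to ERlen * nsteps)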

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = 1

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space
    acdim = ac_space.shape[0]

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm,
                               adaptive_kl=adaptive_kl)
    model = make_model()
    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = EvalRunner(env=eval_env,
                                 model=model,
                                 nsteps=10 * nsteps,
                                 gamma=gamma,
                                 lam=lam)
        eval_runner.obfilt = runner.obfilt
        eval_runner.rewfilt = runner.rewfilt

    epinfobuf = deque(maxlen=10)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=10)

    # Start total timer
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch

    def add_vtarg_and_adv(seg, gamma, value, lam):
        """
        Compute target value using TD(lambda) estimator, and advantage with GAE(lambda)
        """
        done = np.append(
            seg["done"], 0
        )  # last element is only used for last vtarg, but we already zeroed it if last new = 1

        T = len(seg["rew"])
        gaelam = np.empty(T, 'float32')
        rew = runner.rewfilt(seg["rew"])
        lastgaelam = 0
        for t in reversed(range(T)):
            nonterminal = 1 - done[t + 1]
            delta = rew[t] + gamma * value[t + 1] * nonterminal - value[t]
            gaelam[
                t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        ret = gaelam + value[:-1]
        return gaelam, ret
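    # The loop above is the standard GAE(lambda) recursion; written out for step t
    # (with V = value, d = done):
    #   delta_t = r_t + gamma * V_{t+1} * (1 - d_{t+1}) - V_t
    #   A_t     = delta_t + gamma * lam * (1 - d_{t+1}) * A_{t+1}
    #   ret_t   = A_t + V_t        (the TD(lambda) return used as the value target)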

    def add_vtarg_and_adv_vtrace(seg,
                                 gamma,
                                 value,
                                 rho,
                                 trunc_rho,
                                 acdim=None):
        """
        Compute target value using TD(lambda) estimator, and advantage with GAE(lambda)
        """
        done = np.append(
            seg["done"], 0
        )  # last element is only used for last vtarg, but we already zeroed it if last new = 1
        rho_ = np.append(rho, 1.0)
        if acdim is not None:
            rho_ = np.exp(np.log(rho_) / acdim)

        r = np.minimum(trunc_rho, rho_)
        c = lam * np.minimum(1.0, rho_)
        T = len(seg["rew"])
        gaelam = np.empty(T, 'float32')
        gaelam2 = np.empty(T, 'float32')
        rew = runner.rewfilt(seg["rew"])
        lastgaelam = 0
        for t in reversed(range(T)):
            nonterminal = 1 - done[t + 1]
            delta = (rew[t] + gamma * value[t + 1] * nonterminal - value[t])
            gaelam[t] = delta + gamma * lam * nonterminal * lastgaelam
            lastgaelam = r[t] * gaelam[t]
        ret = r[:-1] * gaelam + value[:-1]
        adv = rew + gamma * (1.0 - done[1:]) * np.hstack([ret[1:], value[T]
                                                          ]) - value[:-1]
        return adv, ret, r[:-1] * gaelam
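    # Rough reading of the estimator above: r = min(trunc_rho, rho) truncates the
    # importance ratio, in the spirit of the V-trace targets from IMPALA
    # (Espeholt et al., 2018); the recursion reuses the GAE structure but rescales
    # lastgaelam by r, and adv is a one-step advantage computed against the
    # truncated returns ret.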

    def add_vtarg_and_adv_vtrace4(seg,
                                  gamma,
                                  value,
                                  rho,
                                  trunc_rho,
                                  acdim=None):
        """
        Compute target value using TD(lambda) estimator, and advantage with GAE(lambda)
        """
        done = np.append(
            seg["done"], 0
        )  # last element is only used for last vtarg, but we already zeroed it if last new = 1
        rho_ = np.append(rho, 1.0)
        if acdim is not None:
            rho_ = np.exp(np.log(rho_) / acdim)

        T = len(seg["rew"])
        gaelam = np.zeros(T, 'float32')
        rew = runner.rewfilt(seg["rew"])
        delta = (rew + gamma * value[1:] * (1.0 - done[1:]) - value[:-1])
        gamlam = np.zeros(T, 'float32')
        for i in range(T):
            gamlam[i] = (gamma * lam)**i
        idx = T
        c = np.ones(T)
        for t in reversed(range(T)):
            # find the first terminal step at or after t (if any) to bound the window
            for j in range(t, T):
                if done[j + 1]:
                    idx = j + 1
                    break
            gaelam[t] = np.sum(gamlam[:idx - t] *
                               (np.minimum(1.0, c) * delta)[t:idx])
            c[t:] = rho_[t] * c[t:]

        ret = np.minimum(trunc_rho, rho_[:-1]) * gaelam + value[:-1]
        adv = rew + gamma * (1.0 - done[1:]) * np.hstack([ret[1:], value[T]
                                                          ]) - value[:-1]
        return adv, ret, np.minimum(trunc_rho, rho_[:-1]) * gaelam

    seg = None
    cliprangenow = cliprange(1.0)
    klconst = 1.0
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange

        # Get minibatch
        if seg is None:
            prev_seg = seg
            seg = {}
        else:
            prev_seg = {}
            for i in seg:
                prev_seg[i] = np.copy(seg[i])
        seg["ob"], seg["rew"], seg["done"], seg["ac"], seg["neglogp"], seg[
            "mean"], seg[
                "logstd"], final_obs, final_done, epinfos = runner.run()  #pylint: disable=E0632
        # print(np.shape(seg["ob"]))
        if prev_seg is not None:
            for key in seg:
                if len(np.shape(seg[key])) == 1:
                    seg[key] = np.hstack([prev_seg[key], seg[key]])
                else:
                    seg[key] = np.vstack([prev_seg[key], seg[key]])
                if np.shape(seg[key])[0] > ERlen * nsteps:
                    seg[key] = seg[key][-ERlen * nsteps:]

        ob_stack = np.vstack([seg["ob"], final_obs])
        values = model.values(runner.obfilt(ob_stack))
        values[:-1] = (1.0 - final_done) * values[:-1]
        mean_now, logstd_now = model.meanlogstds(runner.obfilt(seg["ob"]))
        # print(np.shape(seg["ac"])[1])
        neglogpnow = 0.5 * np.sum(np.square((seg["ac"] - mean_now) / np.exp(logstd_now)), axis=-1) \
                      + 0.5 * np.log(2.0 * np.pi) * np.shape(seg["ac"])[1] \
                      + np.sum(logstd_now, axis=-1)
        rho = np.exp(-neglogpnow + seg["neglogp"])
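        # neglogpnow is the negative log-likelihood of the stored actions under the
        # *current* diagonal-Gaussian policy, so
        #   rho = exp(neglogp_old - neglogp_now) = pi_new(a|s) / pi_old(a|s)
        # is the per-sample importance ratio used by the v-trace targets below.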
        # print(len(mean_now))
        # print(cliprangenow)
        # print(rho)
        if vtrace == 1:
            adv, ret, gae = add_vtarg_and_adv_vtrace(seg, gamma, values, rho,
                                                     trunc_rho)
            if useadv:
                gae = adv
        elif vtrace == 4:
            adv, ret, gae = add_vtarg_and_adv_vtrace4(seg, gamma, values, rho,
                                                      trunc_rho)
            if useadv:
                gae = adv
        else:
            gae, ret = add_vtarg_and_adv(seg, gamma, values, lam)
        r = np.minimum(1.0, rho)
        r_gae = gae * r
        print("======")
        print(gae)
        print(r_gae)
        print(gae.mean())
        print(r_gae.mean())
        print(gae.std())
        print(r_gae.std())
        print(r.mean())
        print("======")

        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, _, _, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632
        prior_row = np.zeros(len(seg["ob"]))
        temp_prior = []
        for i in range(int(len(prior_row) / nsteps)):
            temp_row = np.mean(
                np.abs(rho[i * nsteps:(i + 1) * nsteps] - 1.0) + 1.0)
            # local_rho[i + (ERlen-int(len(prior_row)/nsteps))].append(temp_row)
            print(temp_row)
            temp_prior.append(temp_row)
            if temp_row > 1 + 0.2:
                prior_row[i * nsteps:(i + 1) * nsteps] = 0
            else:
                prior_row[i * nsteps:(i + 1) * nsteps] = 1
            prior_row[i * nsteps:(i + 1) * nsteps] = 1
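            # NOTE: this unconditional assignment overrides the 1.2 threshold above,
            # so every stored rollout segment is kept for sampling.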
        # print(prior_row)

        # for i in range(len(prior_row)):
        #     if (np.abs(rho[i] - 1.0) + 1.0)>1.05:
        #         prior_row[i]=0
        #     else:
        #         prior_row[i]=1
        # for i in range(len(prior_row)):
        #     if rho[i]>1.1 :
        #         prior_row[i]=0
        #     else:
        #         prior_row[i]=1
        prob = prior_row / np.sum(prior_row)

        print(np.sum(prior_row))

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        # Index of each element of batch_size
        # Create the indices array

        inds1 = np.arange(len(seg["ob"]) - nsteps)
        inds2 = np.arange(nsteps) + len(seg["ob"]) - nsteps
        print(len(seg["ob"]))
        print(cliprangenow)
        nbatch_adapt1 = int((len(seg["ob"]) - nsteps) / nsteps * nbatch_train)
        nbatch_adapt2 = int((nsteps) / nsteps * nbatch_train)
        print(rho)
        idx1 = []
        idx2 = []
        kl_rest = np.ones(len(seg["ob"])) * len(seg["ob"]) / nsteps
        kl_rest[:-nsteps] = 0
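        # kl_rest is zero everywhere except on the freshest nsteps samples, where it
        # equals the number of rollouts in the replay window; it is forwarded to
        # model.train, presumably to restrict the KL penalty to the on-policy part
        # of the batch.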
        # print(kl_rest)
        for _ in range(noptepochs):
            # Randomize the indexes
            # np.random.shuffle(inds)
            # 0 to batch_size with batch_train_size step

            # print(nbatch_adapt)
            losses_epoch = []
            for _ in range(int(nsteps / nbatch_train)):
                if nbatch_adapt1 > 0:
                    idx1 = np.random.choice(inds1, nbatch_adapt1)
                idx2 = np.random.choice(inds2, nbatch_adapt2)
                # print(np.mean(np.abs(rho[mbinds] - 1.0) + 1.0))
                idx = np.hstack([idx1, idx2]).astype(int)

                slices = (arr[idx]
                          for arr in (runner.obfilt(seg["ob"]), ret, gae,
                                      seg["done"], seg["ac"], values[:-1],
                                      neglogpnow, seg["mean"], seg["logstd"],
                                      kl_rest, rho))
                loss_epoch = model.train(lrnow, cliprangenow, klconst, rgae,
                                         trunc_rho, *slices)
                mblossvals.append(loss_epoch)
                losses_epoch.append(loss_epoch)

            # print(np.mean(losses_epoch, axis=0))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        if adaptive_kl:
            print("KL avg :", lossvals[3])
            if lossvals[3] > dtarg * 1.5:
                klconst *= 2
                print("kl const is increased")
            elif lossvals[3] < dtarg / 1.5:
                klconst /= 2
                print("kl const is reduced")
            klconst = np.clip(klconst, 2**(-10), 64)
        # End timer
        tnow = time.time()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns
            # (ev close to 1) or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values[:-1], ret)
            logger.logkv("batch IS weight",
                         [int(1000 * s) / 1000. for s in np.array(temp_prior)])
            logger.logkv("kl const", klconst)
            logger.logkv("clipping factor", cliprangenow)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfos]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfos]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir() and (
                                  MPI is None
                                  or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    return model
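# A minimal usage sketch for the learn() variant above (hedged: it assumes a
# Mujoco-style continuous-control task and the usual baselines helpers; the env
# id, 'make_vec_env', and all hyperparameters here are illustrative, not taken
# from this repository):
#
#   env = make_vec_env('HalfCheetah-v2', 'mujoco', num_env=1, seed=0)
#   model = learn(network='mlp', env=env, total_timesteps=int(1e6),
#                 nsteps=2048, ERlen=4, vtrace=1, trunc_rho=1.0,
#                 adaptive_kl=1, dtarg=0.01, seed=0)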
Beispiel #57
0
def train():
    rank = MPI.COMM_WORLD.Get_rank()
    sess = utils.make_gpu_session(args.num_gpu)
    sess.__enter__()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    assert args.reload_dir is not None, "reload_dir cannot be None!"

    param_fname = os.path.join(args.reload_dir, 'param.json')
    with open(param_fname, 'r') as f:
        param = json.load(f)

    workerseed = param["seed"] + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    if param["use_2D_env"]:
        config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'configs', 'husky_space7_ppo2_2D.yaml')
        raw_env = Husky2DNavigateEnv(gpu_idx=args.gpu_idx,
                                     config=config_file,
                                     pos_interval=param["pos_interval"])
    else:
        config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'configs', 'husky_space7_ppo2.yaml')
        raw_env = Husky1DNavigateEnv(gpu_idx=args.gpu_idx,
                                     config=config_file,
                                     ob_space_range=[0.0, 40.0])

    # configure environment
    raw_env.reset_state_space(use_goal_info=param["use_goal_info"],
                              use_coords_and_orn=param["use_coords_and_orn"],
                              raycast_num=param["raycast_num"],
                              raycast_range=param["raycast_range"])
    raw_env.reset_goal_range(goal_range=param["goal_range"])

    env = Monitor(
        raw_env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    gym.logger.setLevel(logging.WARN)

    policy_fn = FeedbackPolicy

    print('here')
    base_dirname = os.path.join(currentdir, "simulation_and_analysis", "rslts")
    print(base_dirname)
    if not os.path.exists(base_dirname):
        os.mkdir(base_dirname)
    dir_name = "husky_ppo2_"
    if param["use_feedback"]:
        dir_name += "hr"
    elif param["use_rich_reward"]:
        dir_name += "rl_rich"
    else:
        dir_name += "rl_sparse"
    dir_name += "_reload"
    dir_name = addDateTime(dir_name)
    dir_name = os.path.join(base_dirname, dir_name)
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    hyperparams = {
        "seed": args.seed,
        "nsteps": param["nsteps"],
        "total_timesteps": args.total_timesteps,
        "use_2D_env": param["use_2D_env"],
        "use_rich_reward": param["use_rich_reward"],
        "use_multiple_starts": param["use_multiple_starts"],
        "use_goal_info": param["use_goal_info"],
        "use_coords_and_orn": param["use_coords_and_orn"],
        "raycast_num": param["raycast_num"],
        "raycast_range": param["raycast_range"],
        "goal_range": param["goal_range"],
        "use_feedback": args.use_feedback,
        "use_real_feedback": args.use_real_feedback,
        "trans_by_interpolate": args.trans_by_interpolate,
        "only_use_hr_until": args.only_use_hr_until,
        "trans_to_rl_in": args.trans_to_rl_in,
        "good_feedback_acc": param["good_feedback_acc"],
        "bad_feedback_acc": param["bad_feedback_acc"],
        "ppo_lr": args.ppo_lr,
        "ppo_batch_size": args.ppo_batch_size,
        "ppo_minibatch_size": param["ppo_minibatch_size"],
        "init_rl_importance": args.init_rl_importance,
        "ent_coef": args.ent_coef,
        "gamma": args.gamma,
        "lambda": args.lam,
        "cliprange": args.cliprange,
        "max_grad_norm": args.max_grad_norm,
        "ppo_noptepochs": args.ppo_noptepochs,
        "feedback_lr": param["feedback_lr"],
        "feedback_batch_size": param["feedback_batch_size"],
        "feedback_minibatch_size": param["feedback_minibatch_size"],
        "feedback_noptepochs": param["feedback_noptepochs"],
        "min_feedback_buffer_size": param["min_feedback_buffer_size"],
        "feedback_training_prop": param["feedback_training_prop"],
        "feedback_training_new_prop": param["feedback_training_new_prop"],
        "pos_interval": param["pos_interval"],
        "use_embedding": raw_env._use_embedding,
        "use_raycast": raw_env._use_raycast,
        "offline": raw_env.config['offline'],
        "reload_dir": args.reload_dir,
        "prev_total_timesteps": param["total_timesteps"]
    }

    param_fname = os.path.join(dir_name, "param.json")
    with open(param_fname, "w") as f:
        json.dump(hyperparams, f, indent=4, sort_keys=True)

    video_name = os.path.join(dir_name, "video.mp4")
    p_logging = p.startStateLogging(p.STATE_LOGGING_VIDEO_MP4, video_name)

    model_dir = os.path.join(args.reload_dir, 'models')
    max_model_iter = -1
    for fname in os.listdir(model_dir):
        if fname.isdigit():
            model_iter = int(fname)
            if model_iter > max_model_iter:
                max_model_iter = model_iter
                reload_name = os.path.join(model_dir, fname)

    performance = learn(
        policy=policy_fn,
        env=env,
        raw_env=raw_env,
        use_2D_env=param["use_2D_env"],
        use_multiple_starts=param["use_multiple_starts"],
        use_rich_reward=param["use_rich_reward"],
        use_feedback=args.use_feedback,
        use_real_feedback=args.use_real_feedback,
        trans_by_interpolate=args.trans_by_interpolate,
        only_use_hr_until=args.only_use_hr_until,
        trans_to_rl_in=args.trans_to_rl_in,
        nsteps=param["nsteps"],
        total_timesteps=args.total_timesteps,
        ppo_lr=args.ppo_lr,
        cliprange=args.cliprange,
        max_grad_norm=args.max_grad_norm,
        ent_coef=args.ent_coef,
        gamma=args.gamma,
        lam=args.lam,
        ppo_noptepochs=args.ppo_noptepochs,
        ppo_batch_size=args.ppo_batch_size,
        ppo_minibatch_size=param["ppo_minibatch_size"],
        init_rl_importance=args.init_rl_importance,
        feedback_lr=param["feedback_lr"],
        feedback_noptepochs=param["feedback_noptepochs"],
        feedback_batch_size=param["feedback_batch_size"],
        feedback_minibatch_size=param["feedback_minibatch_size"],
        min_feedback_buffer_size=param["min_feedback_buffer_size"],
        feedback_training_prop=param["feedback_training_prop"],
        feedback_training_new_prop=param["feedback_training_new_prop"],
        good_feedback_acc=param["good_feedback_acc"],
        bad_feedback_acc=param["bad_feedback_acc"],
        log_interval=1,
        save_interval=5,
        reload_name=reload_name,
        base_path=dir_name)

    p.stopStateLogging(p_logging)

    performance_fname = os.path.join(dir_name, "performance.p")
    with open(performance_fname, "wb") as f:
        pickle.dump(performance, f)
Beispiel #58
0
parser.add_argument('--n_steps', type=float, default=1e8)
boolean_flag(parser, 'dueling', default=True)
boolean_flag(parser, 'norm', default=True)
boolean_flag(parser, 'double', default=True)
boolean_flag(parser, 'render', default=False)
args = parser.parse_args()
# n_steps = int(1e8)
n_steps = int(args.n_steps)
train_level = 'level1'
test_levels = ['level1', 'level2', 'level3']

# Create the environment.

env = GridWorld(train_level)
coords_shape = env.unwrapped.coords_shape
set_global_seeds(args.seed)
env.seed(args.seed)

print('~~~~~~~~~~~~~~~~~~~~~~')
print(env)
print(env.unwrapped.name)
print('observations:', env.observation_space.shape)
print('coords:     ', coords_shape)
print('actions:    ', env.action_space.n)
print('walls:      ', env.unwrapped.walls.shape)
print('~~~~~~~~~~~~~~~~~~~~~~')

# Generate the observations and ground truth Q-frames.

test_obs = []
test_qmaps = []
Beispiel #59
0
def learn(
    network,
    env,
    seed=None,
    nsteps=5,
    total_timesteps=int(80e6),
    vf_coef=0.5,
    ent_coef=0.01,
    max_grad_norm=0.5,
    lr=7e-4,
    lrschedule='linear',
    epsilon=1e-5,
    alpha=0.99,
    gamma=0.99,
    log_interval=100,
    load_path=None,
    **network_kwargs):

    '''
    Main entrypoint for the A2C algorithm. Trains a policy with the given network architecture on a given environment using A2C.

    Parameters:
    -----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies


    env:                RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)


    seed:               seed to make the random number sequence in the algorithm reproducible. Defaults to None, which means the seed comes from the system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    vf_coef:            float, coefficient in front of value function loss in the total loss function (default: 0.5)

    ent_coef:           float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

    max_grad_norm:      float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                        returns fraction of the learning rate (specified as lr) as output

    epsilon:            float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

    alpha:              float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''



    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs*nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps//nbatch+1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time()-tstart

        # Calculate the fps (frame per second)
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns
            # (ev close to 1) or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
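# Minimal usage sketch for the A2C learn() above (hedged: names and
# hyperparameters are illustrative; for Atari one would normally also frame-stack
# the vectorized env):
#
#   from baselines.common.cmd_util import make_vec_env
#   env = make_vec_env('PongNoFrameskip-v4', 'atari', num_env=16, seed=0)
#   model = learn(network='cnn', env=env, seed=0, total_timesteps=int(10e6))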
Beispiel #60
0
def learn(
        *,
        network,
        env,
        eval_env,
        make_eval_env,
        env_id,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,

        # MBL
        # For train mbl
        mbl_train_freq=10,

        # For eval
        num_eval_episodes=5,
        eval_freq=5,
        vis_eval=False,
        eval_targs=('mbmf', 'mf'),
        #eval_targs=('mf',),
        quant=2,

        # For mbl.step
        #num_samples=(1500,),
        num_samples=(1, ),
        #horizon=(5,),
        horizon=(2, 1),
        #num_elites=(10,),
        num_elites=(1, ),
        mbl_lamb=(1.0, ),
        mbl_gamma=0.99,
        #mbl_sh=1, # Number of step for stochastic sampling
        mbl_sh=max((5, )),
        #vf_lookahead=-1,
        #use_max_vf=False,
        reset_per_step=(0, ),

        # For get_model
        num_fc=2,
        num_fwd_hidden=500,
        use_layer_norm=False,

        # For MBL
        num_warm_start=int(1e4),
        init_epochs=10,
        update_epochs=5,
        batch_size=512,
        update_with_validation=False,
        use_mean_elites=1,
        use_ent_adjust=0,
        adj_std_scale=0.5,

        # For data loading
        validation_set_path=None,

        # For data collect
        collect_val_data=False,

        # For traj collect
        traj_collect='mf',

        # For profile
        measure_time=True,
        eval_val_err=False,
        measure_rew=True,
        **network_kwargs):
    '''
    Learn a policy function with the TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via a baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the adam optimizer used to optimize the value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''
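    # The docstring above covers only the TRPO arguments; a hedged reading of the
    # MBL-specific ones, based on how they are used below:
    #   mbl_train_freq              : update the learned dynamics model every N iterations
    #   num_samples / horizon /
    #   num_elites / mbl_lamb       : search settings swept over when building the
    #                                 MBMF evaluation policies (mbl.step)
    #   mbl_sh                      : step index after which _mbmf_inner_pi switches from
    #                                 the stochastic to the deterministic policy
    #   num_warm_start, init_epochs,
    #   update_epochs, batch_size   : dynamics-model training schedule passed to MBL
    #   traj_collect                : which policy collects training trajectories ('mf' here)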
    if not isinstance(num_samples, tuple): num_samples = (num_samples, )
    if not isinstance(horizon, tuple): horizon = (horizon, )
    if not isinstance(num_elites, tuple): num_elites = (num_elites, )
    if not isinstance(mbl_lamb, tuple): mbl_lamb = (mbl_lamb, )
    if not isinstance(reset_per_step, tuple):
        reset_per_step = (reset_per_step, )
    if validation_set_path is None:
        if collect_val_data:
            validation_set_path = os.path.join(logger.get_dir(), 'val.pkl')
        else:
            validation_set_path = os.path.join('dataset',
                                               '{}-val.pkl'.format(env_id))
    if eval_val_err:
        eval_val_err_path = os.path.join('dataset',
                                         '{}-combine-val.pkl'.format(env_id))
    logger.log(locals())
    logger.log('MBL_SH', mbl_sh)
    logger.log('Traj_collect', traj_collect)

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0
    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    # MBL
    # ---------------------------------------
    viz = Visdom(env=env_id)
    win = None
    eval_targs = list(eval_targs)
    logger.log(eval_targs)

    make_model = get_make_mlp_model(num_fc=num_fc,
                                    num_fwd_hidden=num_fwd_hidden,
                                    layer_norm=use_layer_norm)
    mbl = MBL(env=eval_env,
              env_id=env_id,
              make_model=make_model,
              num_warm_start=num_warm_start,
              init_epochs=init_epochs,
              update_epochs=update_epochs,
              batch_size=batch_size,
              **network_kwargs)

    val_dataset = {'ob': None, 'ac': None, 'ob_next': None}
    if update_with_validation:
        logger.log('Update with validation')
        val_dataset = load_val_data(validation_set_path)
    if eval_val_err:
        logger.log('Log val error')
        eval_val_dataset = load_val_data(eval_val_err_path)
    if collect_val_data:
        logger.log('Collect validation data')
        val_dataset_collect = []

    def _mf_pi(ob, t=None):
        stochastic = True
        ac, vpred, _, _ = pi.step(ob, stochastic=stochastic)
        return ac, vpred

    def _mf_det_pi(ob, t=None):
        #ac, vpred, _, _ = pi.step(ob, stochastic=False)
        ac, vpred = pi._evaluate([pi.pd.mode(), pi.vf], ob)
        return ac, vpred

    def _mf_ent_pi(ob, t=None):
        mean, std, vpred = pi._evaluate([pi.pd.mode(), pi.pd.std, pi.vf], ob)
        ac = np.random.normal(mean, std * adj_std_scale, size=mean.shape)
        return ac, vpred


    # NOTE: when use_ent_adjust is set, _mbmf_inner_pi samples actions from the
    # policy Gaussian with its std scaled by adj_std_scale (see _mf_ent_pi above).

    def _mbmf_inner_pi(ob, t=0):
        if use_ent_adjust:
            return _mf_ent_pi(ob)
        else:
            if t < mbl_sh: return _mf_pi(ob)
            else: return _mf_det_pi(ob)

    # ---------------------------------------

    # Run multiple configuration once
    all_eval_descs = []

    def make_mbmf_pi(n, h, e, l):
        def _mbmf_pi(ob):
            ac, rew = mbl.step(ob=ob,
                               pi=_mbmf_inner_pi,
                               horizon=h,
                               num_samples=n,
                               num_elites=e,
                               gamma=mbl_gamma,
                               lamb=l,
                               use_mean_elites=use_mean_elites)
            return ac[None], rew

        return Policy(step=_mbmf_pi, reset=None)

    for n in num_samples:
        for h in horizon:
            for l in mbl_lamb:
                for e in num_elites:
                    if 'mbmf' in eval_targs:
                        all_eval_descs.append(
                            ('MeanRewMBMF-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.
                             format(n, h, e, l, mbl_sh, use_mean_elites),
                             'MBMF-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(
                                 n, h, e, l, mbl_sh,
                                 use_mean_elites), make_mbmf_pi(n, h, e, l)))
    if 'mf' in eval_targs:
        all_eval_descs.append(
            ('MeanRewMF', 'MF', Policy(step=_mf_pi, reset=None)))

    logger.log('List of evaluation targets')
    for it in all_eval_descs:
        logger.log(it[0])

    pool = Pool(mp.cpu_count())
    warm_start_done = False
    # ----------------------------------------

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)
    # Prepare for rollouts
    # ----------------------------------------
    if traj_collect == 'mf':
        seg_gen = traj_segment_generator(pi,
                                         env,
                                         timesteps_per_batch,
                                         stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
            if traj_collect == 'mf-random' or traj_collect == 'mf-mb':
                seg_mbl = seg_gen_mbl.__next__()
            else:
                seg_mbl = seg
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]

        # Val data collection
        if collect_val_data:
            for ob_, ac_, ob_next_ in zip(ob[:-1, 0, ...], ac[:-1, ...],
                                          ob[1:, 0, ...]):
                val_dataset_collect.append(
                    (copy.copy(ob_), copy.copy(ac_), copy.copy(ob_next_)))
        # -----------------------------
        # MBL update
        else:
            ob_mbl, ac_mbl = seg_mbl["ob"], seg_mbl["ac"]

            mbl.add_data_batch(ob_mbl[:-1, 0, ...], ac_mbl[:-1, ...],
                               ob_mbl[1:, 0, ...])
            mbl.update_forward_dynamic(require_update=iters_so_far %
                                       mbl_train_freq == 0,
                                       ob_val=val_dataset['ob'],
                                       ac_val=val_dataset['ac'],
                                       ob_next_val=val_dataset['ob_next'])
        # -----------------------------

        if traj_collect == 'mf':
            #if traj_collect == 'mf' or traj_collect == 'mf-random' or traj_collect == 'mf-mb':
            vpredbefore = seg[
                "vpred"]  # predicted value function before udpate
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
            if hasattr(pi, "rms"):
                pi.rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            def fisher_vector_product(p):
                return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
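                # TRPO step-size bookkeeping: with s = stepdir and H the Fisher
                # matrix implied by fisher_vector_product,
                #   shs = 0.5 * s^T H s,   lm = sqrt(shs / max_kl)
                # so fullstep = s / lm satisfies 0.5 * fullstep^T H fullstep = max_kl,
                # i.e. the quadratic approximation of the KL constraint is met with
                # equality before the backtracking line search below.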
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)

            with timed("vf"):

                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=64):
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

            logger.record_tabular("ev_tdlam_before",
                                  explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            # MBL evaluation
            if not collect_val_data:
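                # Re-seed with the same seed every iteration so evaluation rollouts
                # stay comparable across iterations.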
                set_global_seeds(seed)
                default_sess = tf.get_default_session()

                def multithread_eval_policy(env_, pi_, num_episodes_,
                                            vis_eval_, seed):
                    with default_sess.as_default():
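                        # Share the training env's observation-normalization stats
                        # with the evaluation env, if both support them.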
                        if hasattr(env, 'ob_rms') and hasattr(env_, 'ob_rms'):
                            env_.ob_rms = env.ob_rms
                        res = eval_policy(env_, pi_, num_episodes_, vis_eval_,
                                          seed, measure_time, measure_rew)

                        try:
                            env_.close()
                        except Exception:  # best-effort cleanup of the eval env
                            pass
                    return res

                if mbl.is_warm_start_done() and iters_so_far % eval_freq == 0:
                    warm_start_done = mbl.is_warm_start_done()
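                    # Roll out every registered evaluation setting (policy, env)
                    # from all_eval_descs and log its performance.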
                    if num_eval_episodes > 0:
                        targs_names = {}
                        with timed('eval'):
                            num_descs = len(all_eval_descs)
                            list_field_names = [e[0] for e in all_eval_descs]
                            list_legend_names = [e[1] for e in all_eval_descs]
                            list_pis = [e[2] for e in all_eval_descs]
                            list_eval_envs = [
                                make_eval_env() for _ in range(num_descs)
                            ]
                            list_seed = [seed for _ in range(num_descs)]
                            list_num_eval_episodes = [
                                num_eval_episodes for _ in range(num_descs)
                            ]
                            print(list_field_names)
                            print(list_legend_names)

                            list_vis_eval = [
                                vis_eval for _ in range(num_descs)
                            ]

                            for i in range(num_descs):
                                field_name = list_field_names[i]
                                legend_name = list_legend_names[i]

                                res = multithread_eval_policy(
                                    list_eval_envs[i], list_pis[i],
                                    list_num_eval_episodes[i],
                                    list_vis_eval[i], seed)
                                #eval_results = pool.starmap(multithread_eval_policy, zip(list_eval_envs, list_pis, list_num_eval_episodes, list_vis_eval,list_seed))

                                #for field_name, legend_name, res in zip(list_field_names, list_legend_names, eval_results):
                                perf, elapsed_time, eval_rew = res
                                logger.record_tabular(field_name, perf)
                                if measure_time:
                                    logger.record_tabular(
                                        'Time-%s' % (field_name), elapsed_time)
                                if measure_rew:
                                    logger.record_tabular(
                                        'SimRew-%s' % (field_name), eval_rew)
                                targs_names[field_name] = legend_name

                    if eval_val_err:
                        fwd_dynamics_err = mbl.eval_forward_dynamic(
                            obs=eval_val_dataset['ob'],
                            acs=eval_val_dataset['ac'],
                            obs_next=eval_val_dataset['ob_next'])
                        logger.record_tabular('FwdValError', fwd_dynamics_err)

                    logger.dump_tabular()
                    #print(logger.get_dir())
                    #print(targs_names)
                    if num_eval_episodes > 0:
                        win = plot(viz,
                                   win,
                                   logger.get_dir(),
                                   targs_names=targs_names,
                                   quant=quant,
                                   opt='best')
            # -----------
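        # learn() is a generator: hand the current policy back to the caller
        # after every iteration.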
        yield pi

    if collect_val_data:
        with open(validation_set_path, 'wb') as f:
            pickle.dump(val_dataset_collect, f)
        logger.log('Saved {} validation samples'.format(len(val_dataset_collect)))