def initialize_env_specs(hparams):
  """Initializes env_specs using T2TGymEnvs."""
  if getattr(hparams, "game", None):
    game_name = gym_env.camel_case_name(hparams.game)
    env = gym_env.T2TGymEnv("{}Deterministic-v4".format(game_name),
                            batch_size=hparams.batch_size)
    env.start_new_epoch(0)
    hparams.add_hparam("env_fn", rl.make_real_env_fn(env))
    eval_env = gym_env.T2TGymEnv("{}Deterministic-v4".format(game_name),
                                 batch_size=hparams.eval_batch_size)
    eval_env.start_new_epoch(0)
    hparams.add_hparam("eval_env_fn", rl.make_real_env_fn(eval_env))
  return hparams
def initialize_env_specs(hparams):
    """Initializes env_specs using T2TGymEnvs."""
    env = rl_utils.setup_env(hparams, hparams.batch_size,
                             hparams.eval_max_num_noops)
    env.start_new_epoch(0)
    hparams.add_hparam("env_fn", rl.make_real_env_fn(env))
    return hparams
Example 4
def _eval_fn_with_learner(env, hparams, policy_hparams, policy_dir,
                          sampling_temp):
    env_fn = rl.make_real_env_fn(env)
    learner = LEARNERS[hparams.base_algo](hparams.frame_stack_size,
                                          base_event_dir=None,
                                          agent_model_dir=policy_dir,
                                          total_num_epochs=1)
    learner.evaluate(env_fn, policy_hparams, sampling_temp)
def initialize_env_specs(hparams):
    """Initializes env_specs using T2TGymEnvs."""
    env = rl_utils.setup_env(hparams, hparams.batch_size,
                             hparams.eval_max_num_noops)
    env.start_new_epoch(0)

    # TODO(afrozm): Decouple env_fn from hparams and return both, is there
    # even a need to return hparams? Just return the env_fn?
    hparams.add_hparam("env_fn", rl.make_real_env_fn(env))
    return hparams
Example 6
def initialize_env_specs(hparams):
    """Initializes env_specs using T2TGymEnvs."""
    env = rl_utils.setup_env(hparams,
                             hparams.batch_size,
                             hparams.eval_max_num_noops,
                             hparams.rl_env_max_episode_steps,
                             env_name=hparams.rl_env_name)

    env.start_new_epoch(0)

    return rl.make_real_env_fn(env)
def initialize_env_specs(hparams, env_problem_name):
  """Initializes env_specs using the appropriate env."""
  if env_problem_name:
    env = registry.env_problem(env_problem_name, batch_size=hparams.batch_size)
  else:
    env = rl_utils.setup_env(hparams, hparams.batch_size,
                             hparams.eval_max_num_noops,
                             hparams.rl_env_max_episode_steps,
                             env_name=hparams.rl_env_name)
    env.start_new_epoch(0)

  return rl.make_real_env_fn(env)
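Unlike the earlier variants, which attach env_fn to hparams, the two variants above return the result of rl.make_real_env_fn directly. A minimal call site might look like the sketch below; passing env_problem_name=None and the assumption that hparams already defines batch_size, eval_max_num_noops, rl_env_max_episode_steps and rl_env_name are illustrative, not taken from the listing.

# Call-site sketch (assumption: env_problem_name=None takes the
# rl_utils.setup_env branch of the variant above).
env_fn = initialize_env_specs(hparams, env_problem_name=None)
# env_fn is the callable the learners in the later examples receive,
# e.g. learner.train(env_fn, train_hparams, simulated=False, ...)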
def train_agent_real_env(env, learner, hparams, epoch):
  """Train the PPO agent in the real environment."""
  base_algo_str = hparams.base_algo

  train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  update_hparams_from_hparams(
      train_hparams, hparams, "real_" + base_algo_str + "_"
  )

  env_fn = rl.make_real_env_fn(env)
  num_env_steps = real_env_step_increment(hparams)
  learner.train(
      env_fn, train_hparams, simulated=False, save_continuously=False,
      epoch=epoch, num_env_steps=num_env_steps
  )
  # Save unfinished rollouts to history.
  env.reset()
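update_hparams_from_hparams above copies every hparam whose name starts with "real_" + base_algo + "_" from the loop hparams into the training hparams, mirroring the explicit loop in Example 13. The standalone sketch below illustrates the same prefix-copy pattern on plain dicts; it is not the tensor2tensor implementation.

def update_from_prefixed(target, source, prefix):
  """Copies source[prefix + name] into target[name] for every matching key."""
  for name in list(target):
    prefixed = prefix + name
    if prefixed in source:
      target[name] = source[prefixed]
  return target

train_cfg = {"learning_rate": 1e-4, "epoch_length": 50}
loop_cfg = {"real_ppo_learning_rate": 3e-4, "unrelated": 1}
update_from_prefixed(train_cfg, loop_cfg, "real_ppo_")
# train_cfg["learning_rate"] is now 3e-4; "epoch_length" is left untouched.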
Example 10
def evaluate_single_config(hparams, stochastic, max_num_noops,
                           agent_model_dir):
    """Evaluate the PPO agent in the real environment."""
    eval_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
    env = setup_env(hparams,
                    batch_size=hparams.eval_batch_size,
                    max_num_noops=max_num_noops)
    env.start_new_epoch(0)
    env_fn = rl.make_real_env_fn(env)
    learner = LEARNERS[hparams.base_algo](hparams.frame_stack_size,
                                          base_event_dir=None,
                                          agent_model_dir=agent_model_dir)
    learner.evaluate(env_fn, eval_hparams, stochastic)
    rollouts = env.current_epoch_rollouts()
    env.close()

    return tuple(
        compute_mean_reward(rollouts, clipped) for clipped in (True, False))
Example 11
def evaluate_single_config(hparams, stochastic, max_num_noops, agent_model_dir):
  """Evaluate the PPO agent in the real environment."""
  eval_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  env = setup_env(
      hparams, batch_size=hparams.eval_batch_size, max_num_noops=max_num_noops
  )
  env.start_new_epoch(0)
  env_fn = rl.make_real_env_fn(env)
  learner = LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, base_event_dir=None,
      agent_model_dir=agent_model_dir
  )
  learner.evaluate(env_fn, eval_hparams, stochastic)
  rollouts = env.current_epoch_rollouts()
  env.close()

  return tuple(
      compute_mean_reward(rollouts, clipped) for clipped in (True, False)
  )
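Both evaluate_single_config variants above return a pair of mean rewards, computed with reward clipping on and off. The helper below is a hypothetical stand-in for compute_mean_reward, written against a simplified rollout format (a list of per-step rewards) purely to make that return pair concrete; the real helper in the surrounding module may differ.

def mean_rollout_reward(rollouts, clipped):
  """Averages per-rollout reward sums, optionally clipping each step to [-1, 1]."""
  totals = []
  for rollout in rollouts:  # here a rollout is just a list of step rewards
    rewards = [max(-1.0, min(1.0, r)) if clipped else r for r in rollout]
    totals.append(sum(rewards))
  return sum(totals) / len(totals)

print(mean_rollout_reward([[0.5, 2.0], [-3.0]], clipped=True))   # 0.25
print(mean_rollout_reward([[0.5, 2.0], [-3.0]], clipped=False))  # -0.25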
Example 12
def evaluate_single_config(hparams, agent_model_dir):
    """Evaluate the PPO agent in the real environment."""
    eval_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
    eval_hparams.num_agents = hparams.num_agents
    eval_hparams.add_hparam("stochastic", hparams.stochastic)
    env = setup_env(hparams, batch_size=hparams.num_agents)
    env.start_new_epoch(0)
    env_fn = rl.make_real_env_fn(env)
    learner = LEARNERS[hparams.base_algo](hparams.frame_stack_size,
                                          event_dir=None,
                                          agent_model_dir=agent_model_dir)
    learner.evaluate(env_fn, eval_hparams, eval_hparams.stochastic)
    rollouts = env.current_epoch_rollouts()[:hparams.num_agents]
    env.close()

    assert len(rollouts) == hparams.num_agents, "{} {}".format(
        len(rollouts), hparams.num_agents)
    return tuple(
        compute_mean_reward(rollouts, clipped) for clipped in (True, False))
Example 13
def train_agent_real_env(env,
                         agent_model_dir,
                         event_dir,
                         data_dir,
                         hparams,
                         completed_ppo_epochs_num,
                         epoch=0,
                         is_final_epoch=False):
    """Train the PPO agent in the real environment."""
    del is_final_epoch, data_dir
    ppo_hparams = trainer_lib.create_hparams(hparams.ppo_params)
    ppo_params_names = [
        "epochs_num", "epoch_length", "learning_rate", "num_agents",
        "eval_every_epochs", "optimization_epochs", "effective_num_agents"
    ]

    # This should be overridden.
    ppo_hparams.add_hparam("effective_num_agents", None)
    for param_name in ppo_params_names:
        ppo_param_name = "real_ppo_" + param_name
        if ppo_param_name in hparams:
            ppo_hparams.set_hparam(param_name, hparams.get(ppo_param_name))

    completed_ppo_epochs_num += real_ppo_epoch_increment(hparams)
    ppo_hparams.epochs_num = completed_ppo_epochs_num
    # We do not save the model during training, as that resets frames that we
    # need at restarts. We still save at the last step, so we set this very high.
    ppo_hparams.save_models_every_epochs = 1000000

    ppo_hparams.add_hparam("env_fn", rl.make_real_env_fn(env))
    ppo_hparams.add_hparam("force_beginning_resets", False)
    ppo_hparams.add_hparam("frame_stack_size", hparams.frame_stack_size)

    rl_trainer_lib.train(ppo_hparams,
                         event_dir + "real",
                         agent_model_dir,
                         name_scope="ppo_real%d" % (epoch + 1))

    # Save unfinished rollouts to history.
    env.reset()

    return completed_ppo_epochs_num
Example 14
def evaluate_single_config(hparams, agent_model_dir):
    """Evaluate the PPO agent in the real environment."""
    eval_hparams = trainer_lib.create_hparams(hparams.ppo_params)
    eval_hparams.num_agents = hparams.num_agents
    env = setup_env(hparams, batch_size=hparams.num_agents)
    env_fn = rl.make_real_env_fn(env)
    eval_hparams.add_hparam("env_fn", env_fn)
    eval_hparams.add_hparam("policy_to_actions_lambda",
                            hparams.policy_to_actions_lambda)
    eval_hparams.add_hparam("frame_stack_size", hparams.frame_stack_size)
    eval_hparams.add_hparam("force_beginning_resets", False)

    env.start_new_epoch(0)
    rl_trainer_lib.evaluate(eval_hparams, agent_model_dir)
    rollouts = env.current_epoch_rollouts()[:hparams.num_agents]
    env.close()

    assert len(rollouts) == hparams.num_agents
    return tuple(
        compute_mean_reward(rollouts, clipped) for clipped in (True, False))
Example 15
def train_agent_real_env(env, learner, hparams, epoch):
    """Train the PPO agent in the real environment."""
    base_algo_str = hparams.base_algo

    train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
    rl_utils.update_hparams_from_hparams(train_hparams, hparams,
                                         "real_" + base_algo_str + "_")
    if hparams.wm_policy_param_sharing:
        train_hparams.optimizer_zero_grads = True

    env_fn = rl.make_real_env_fn(env)
    num_env_steps = real_env_step_increment(hparams)
    learner.train(
        env_fn,
        train_hparams,
        simulated=False,
        save_continuously=False,
        epoch=epoch,
        sampling_temp=hparams.real_sampling_temp,
        num_env_steps=num_env_steps,
    )
    # Save unfinished rollouts to history.
    env.reset()
Example 16
def train_agent_real_env(env,
                         agent_model_dir,
                         event_dir,
                         data_dir,
                         hparams,
                         completed_epochs_num,
                         epoch=0,
                         is_final_epoch=False):
    """Train the PPO agent in the real environment."""
    del is_final_epoch, data_dir

    base_algo_str = hparams.base_algo

    train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
    _update_hparams_from_hparams(train_hparams, hparams,
                                 "real_" + base_algo_str + "_")

    # TODO(konradczechowski): add effective_num_agents to ppo_atari_base etc.
    # this requires refactoring ppo.
    # This should be overridden.
    train_hparams.add_hparam("effective_num_agents",
                             hparams.real_ppo_effective_num_agents)

    completed_epochs_num += real_ppo_epoch_increment(hparams)

    env_fn = rl.make_real_env_fn(env)
    learner = LEARNERS[base_algo_str](hparams.frame_stack_size, event_dir,
                                      agent_model_dir)
    learner.train(env_fn,
                  train_hparams,
                  completed_epochs_num,
                  simulated=False,
                  epoch=epoch)
    # Save unfinished rollouts to history.
    env.reset()

    return completed_epochs_num
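Taken together, the examples sketch a train/evaluate loop over the real environment. The driver below stitches the signatures from the examples above into one loop; the environment and learner construction mirror Example 10, while the loop itself, agent_model_dir, and hparams.epochs are assumptions for illustration, not code from the repository.

# Hypothetical driver loop, assembled from the signatures shown above.
env = setup_env(hparams, batch_size=hparams.batch_size,
                max_num_noops=hparams.eval_max_num_noops)
env.start_new_epoch(0)
learner = LEARNERS[hparams.base_algo](hparams.frame_stack_size,
                                      base_event_dir=None,
                                      agent_model_dir=agent_model_dir)
for epoch in range(hparams.epochs):
  # Train on the real environment, then score the current policy with and
  # without reward clipping.
  train_agent_real_env(env, learner, hparams, epoch)
  clipped_reward, unclipped_reward = evaluate_single_config(
      hparams, stochastic=False, max_num_noops=hparams.eval_max_num_noops,
      agent_model_dir=agent_model_dir)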