Example #1
from typing import Text

from absl import flags
from baselines.common.vec_env import dummy_vec_env
from baselines.ppo2 import ppo2
import bsuite
from bsuite.logging import terminal_logging
from bsuite.utils import gym_wrapper

FLAGS = flags.FLAGS


def run(bsuite_id: Text) -> Text:
    """Runs a PPO agent on a given bsuite environment, logging to CSV."""
    def _load_env():
        raw_env = bsuite.load_and_record(
            bsuite_id=bsuite_id,
            save_path=FLAGS.save_path,
            logging_mode=FLAGS.logging_mode,
            overwrite=FLAGS.overwrite,
        )
        if FLAGS.verbose:
            raw_env = terminal_logging.wrap_environment(raw_env,
                                                        log_every=True)
        return gym_wrapper.GymFromDMEnv(raw_env)

    env = dummy_vec_env.DummyVecEnv([_load_env])

    ppo2.learn(
        env=env,
        network=FLAGS.network,
        lr=FLAGS.learning_rate,
        total_timesteps=FLAGS.total_timesteps,  # make sure to run enough steps
        nsteps=FLAGS.nsteps,
        gamma=FLAGS.agent_discount,
    )

    return bsuite_id
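The snippet above reads its configuration from absl flags that are not shown. A minimal sketch of a launcher, assuming hypothetical defaults (only the flag names come from the snippet; the defaults and the sweep loop are illustrative):

from absl import app
from absl import flags
from bsuite import sweep

flags.DEFINE_string('save_path', '/tmp/bsuite', 'Directory for bsuite results.')
flags.DEFINE_string('logging_mode', 'csv', 'One of: csv, sqlite, terminal.')
flags.DEFINE_boolean('overwrite', False, 'Whether to overwrite existing results.')
flags.DEFINE_boolean('verbose', False, 'Whether to log each episode to the terminal.')
flags.DEFINE_string('network', 'mlp', 'Network type passed to ppo2.learn.')
flags.DEFINE_float('learning_rate', 3e-4, 'PPO learning rate.')
flags.DEFINE_integer('total_timesteps', 1000000, 'Total environment steps per environment.')
flags.DEFINE_integer('nsteps', 128, 'Steps per PPO rollout.')
flags.DEFINE_float('agent_discount', 0.99, 'Discount factor passed as gamma.')


def main(_):
    # Run the PPO agent on every bsuite_id in the full bsuite sweep.
    for bsuite_id in sweep.SWEEP:
        run(bsuite_id)


if __name__ == '__main__':
    app.run(main)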
Example #2
import os

from absl import flags
from baselines import bench
from baselines.common.vec_env import dummy_vec_env
from baselines.ppo2 import ppo2
import realworldrl_suite.environments as rwrl

FLAGS = flags.FLAGS

# GymEnv is assumed to be a dm_env -> Gym adapter defined elsewhere in the
# original module; a hypothetical sketch is given after this example.


def run():
  """Runs a PPO agent on the RWRL environment selected by flags."""

  def _load_env():
    """Loads environment."""
    raw_env = rwrl.load(
        domain_name=FLAGS.domain_name,
        task_name=FLAGS.task_name,
        safety_spec=dict(enable=True),
        delay_spec=dict(enable=True, actions=20),
        log_output=os.path.join(FLAGS.save_path, 'log.npz'),
        environment_kwargs=dict(
            log_safety_vars=True, log_every=20, flat_observation=True))
    env = GymEnv(raw_env)
    env = bench.Monitor(env, FLAGS.save_path)
    return env

  env = dummy_vec_env.DummyVecEnv([_load_env])

  ppo2.learn(
      env=env,
      network=FLAGS.network,
      lr=FLAGS.learning_rate,
      total_timesteps=FLAGS.total_timesteps,  # make sure to run enough steps
      nsteps=FLAGS.nsteps,
      gamma=FLAGS.agent_discount,
  )
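GymEnv itself is not defined in the snippet. A hypothetical stand-in, assuming flat_observation=True so the dm_env observation arrives as a single flat array under the 'observations' key (only the class name comes from the snippet; everything else is an illustrative sketch):

import gym
import numpy as np
from gym import spaces


class GymEnv(gym.Env):
  """Minimal dm_env -> Gym adapter (hypothetical sketch)."""

  def __init__(self, dm_environment):
    self._env = dm_environment
    obs_spec = self._env.observation_spec()['observations']
    act_spec = self._env.action_spec()
    self.observation_space = spaces.Box(
        low=-np.inf, high=np.inf, shape=obs_spec.shape, dtype=np.float32)
    low = np.broadcast_to(act_spec.minimum, act_spec.shape).astype(np.float32)
    high = np.broadcast_to(act_spec.maximum, act_spec.shape).astype(np.float32)
    self.action_space = spaces.Box(low=low, high=high, dtype=np.float32)

  def reset(self):
    timestep = self._env.reset()
    return np.asarray(timestep.observation['observations'], dtype=np.float32)

  def step(self, action):
    timestep = self._env.step(action)
    observation = np.asarray(
        timestep.observation['observations'], dtype=np.float32)
    reward = timestep.reward if timestep.reward is not None else 0.0
    return observation, reward, timestep.last(), {}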
Example #3
import multiprocessing

import tensorflow as tf
from baselines import bench
from baselines import logger
from baselines.common import misc_util
from baselines.common.vec_env import dummy_vec_env
from baselines.common.vec_env import vec_normalize
from baselines.ppo2 import policies
from baselines.ppo2 import ppo2

# Assumed module-level constant: number of threads used by the TF session.
_NUM_CPUS = multiprocessing.cpu_count()


def train(training_env, num_timesteps, seed):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=_NUM_CPUS,
                            inter_op_parallelism_threads=_NUM_CPUS)
    tf.Session(config=config).__enter__()

    def make_env():
        return bench.Monitor(training_env,
                             logger.get_dir(),
                             allow_early_resets=True)

    env = dummy_vec_env.DummyVecEnv([make_env])
    env = vec_normalize.VecNormalize(env)

    misc_util.set_global_seeds(seed)
    policy = policies.MlpPolicy
    model = ppo2.learn(policy=policy,
                       env=env,
                       nsteps=4096,
                       nminibatches=32,
                       lam=0.95,
                       gamma=0.99,
                       noptepochs=32,
                       log_interval=1,
                       ent_coef=0.0,
                       lr=3e-4,
                       cliprange=0.2,
                       total_timesteps=num_timesteps,
                       save_interval=10)

    return model, env
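A hypothetical call site for train(); the Gym environment id, step count, and seed are illustrative assumptions, not part of the original code:

import gym

if __name__ == '__main__':
    raw_env = gym.make('Pendulum-v0')
    model, vec_env = train(raw_env, num_timesteps=1000000, seed=0)
    # vec_env is the VecNormalize wrapper created inside train(); its running
    # observation/reward statistics are needed again if the trained policy is
    # later evaluated with the same normalization.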
Example #4
# Assumes the same module-level imports and _NUM_CPUS constant as Example #3,
# plus joblib; build_environment() is assumed to construct the Gym environment.
def replay(load_path, num_time_steps, render=False):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=_NUM_CPUS,
                            inter_op_parallelism_threads=_NUM_CPUS)
    tf.Session(config=config).__enter__()

    sess = tf.get_default_session()

    training_env = build_environment(render=render)
    env = dummy_vec_env.DummyVecEnv([lambda: training_env])
    env = vec_normalize.VecNormalize(env)

    policy = policies.MlpPolicy
    ob_space = env.observation_space
    ac_space = env.action_space
    num_batch_ac = env.num_envs
    model = policy(sess, ob_space, ac_space, num_batch_ac, 1, reuse=False)

    with tf.variable_scope('model'):
        trained_vars = tf.trainable_variables()
        loaded_vars = joblib.load(load_path)
        restore_ops = []
        for trained, loaded in zip(trained_vars, loaded_vars):
            restore_ops.append(trained.assign(loaded))
        sess.run(restore_ops)

    observations = env.reset()
    episode_reward = 0.0
    states = model.initial_state
    dones = [False for _ in range(env.num_envs)]
    for _ in range(num_time_steps):
        env.render()
        actions, values, states, neglogpacs = model.step(
            observations, states, dones)
        # Feed the fresh done flags back to the policy on the next iteration.
        observations, rewards, dones, infos = env.step(actions)
        episode_reward += rewards
        print("Episode reward: {}".format(episode_reward))
        # DummyVecEnv auto-resets finished episodes, so reset the running
        # reward total whenever an episode ends.
        episode_reward = episode_reward * (1.0 - dones)