def run(bsuite_id: Text) -> Text:
  """Runs a PPO agent on a given bsuite environment, logging to CSV."""

  def _load_env():
    raw_env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )
    if FLAGS.verbose:
      raw_env = terminal_logging.wrap_environment(raw_env, log_every=True)
    return gym_wrapper.GymFromDMEnv(raw_env)

  env = dummy_vec_env.DummyVecEnv([_load_env])

  ppo2.learn(
      env=env,
      network=FLAGS.network,
      lr=FLAGS.learning_rate,
      total_timesteps=FLAGS.total_timesteps,  # make sure to run enough steps
      nsteps=FLAGS.nsteps,
      gamma=FLAGS.agent_discount,
  )

  return bsuite_id
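# A minimal sketch of the absl flag definitions and entry point that run()
# above assumes. The flag names are taken from the FLAGS.* references in
# run(); the default values and the bsuite_id flag wiring are illustrative
# assumptions, not the original script's settings.
from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string('bsuite_id', 'catch/0', 'bsuite identifier, e.g. catch/0.')
flags.DEFINE_string('save_path', '/tmp/bsuite', 'where to record bsuite results.')
flags.DEFINE_enum('logging_mode', 'csv', ['csv', 'sqlite', 'terminal'],
                  'how to record bsuite results.')
flags.DEFINE_boolean('overwrite', False, 'overwrite existing logs if found.')
flags.DEFINE_boolean('verbose', False, 'also log each episode to the terminal.')
flags.DEFINE_string('network', 'mlp', 'policy network passed to ppo2.learn.')
flags.DEFINE_float('learning_rate', 3e-4, 'optimizer learning rate.')
flags.DEFINE_integer('total_timesteps', 1_000_000, 'total environment steps.')
flags.DEFINE_integer('nsteps', 128, 'rollout length per PPO update.')
flags.DEFINE_float('agent_discount', 0.99, 'discount factor gamma.')


def main(_):
  run(FLAGS.bsuite_id)


if __name__ == '__main__':
  app.run(main)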
def run():
  """Runs a PPO agent on a given environment."""

  def _load_env():
    """Loads environment."""
    raw_env = rwrl.load(
        domain_name=FLAGS.domain_name,
        task_name=FLAGS.task_name,
        safety_spec=dict(enable=True),
        delay_spec=dict(enable=True, actions=20),
        log_output=os.path.join(FLAGS.save_path, 'log.npz'),
        environment_kwargs=dict(
            log_safety_vars=True, log_every=20, flat_observation=True))
    env = GymEnv(raw_env)
    env = bench.Monitor(env, FLAGS.save_path)
    return env

  env = dummy_vec_env.DummyVecEnv([_load_env])

  ppo2.learn(
      env=env,
      network=FLAGS.network,
      lr=FLAGS.learning_rate,
      total_timesteps=FLAGS.total_timesteps,  # make sure to run enough steps
      nsteps=FLAGS.nsteps,
      gamma=FLAGS.agent_discount,
  )
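# A minimal sketch of the GymEnv adapter that run() above assumes (rwrl is
# taken to be realworldrl_suite.environments). Because the environment is
# loaded with flat_observation=True, observations arrive under a single
# 'observations' key. Everything below is an illustrative implementation of
# that adapter, not the original example's code.
import gym
import numpy as np
from gym import spaces


class GymEnv(gym.Env):
  """Exposes a dm_env-style environment through the gym.Env interface."""

  def __init__(self, dm_environment):
    self._env = dm_environment
    obs_spec = self._env.observation_spec()['observations']
    act_spec = self._env.action_spec()
    self.observation_space = spaces.Box(
        low=-np.inf, high=np.inf, shape=obs_spec.shape, dtype=np.float32)
    low = np.broadcast_to(act_spec.minimum, act_spec.shape).astype(np.float32)
    high = np.broadcast_to(act_spec.maximum, act_spec.shape).astype(np.float32)
    self.action_space = spaces.Box(low=low, high=high, dtype=np.float32)

  def reset(self):
    timestep = self._env.reset()
    return np.asarray(timestep.observation['observations'], dtype=np.float32)

  def step(self, action):
    timestep = self._env.step(action)
    obs = np.asarray(timestep.observation['observations'], dtype=np.float32)
    reward = timestep.reward if timestep.reward is not None else 0.0
    done = timestep.last()
    return obs, reward, done, {}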
def train(training_env, num_timesteps, seed):
  config = tf.ConfigProto(
      allow_soft_placement=True,
      intra_op_parallelism_threads=_NUM_CPUS,
      inter_op_parallelism_threads=_NUM_CPUS)
  tf.Session(config=config).__enter__()

  def make_env():
    # Monitor records per-episode statistics to the logger directory.
    return bench.Monitor(
        training_env, logger.get_dir(), allow_early_resets=True)

  env = dummy_vec_env.DummyVecEnv([make_env])
  # Normalise observations and rewards with running statistics.
  env = vec_normalize.VecNormalize(env)

  misc_util.set_global_seeds(seed)
  policy = policies.MlpPolicy
  model = ppo2.learn(
      policy=policy,
      env=env,
      nsteps=4096,
      nminibatches=32,
      lam=0.95,
      gamma=0.99,
      noptepochs=32,
      log_interval=1,
      ent_coef=0.0,
      lr=3e-4,
      cliprange=0.2,
      total_timesteps=num_timesteps,
      save_interval=10)
  return model, env
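# A hypothetical driver for train(): build_environment() is the same helper
# that replay() below relies on, and the save path is illustrative. model.save
# here refers to the old baselines ppo2 Model.save, which joblib-dumps the
# trainable parameters in the format that replay() reloads.
def example_train_and_save(save_path='/tmp/ppo_final_params'):
  training_env = build_environment(render=False)
  model, env = train(training_env, num_timesteps=int(1e6), seed=0)
  model.save(save_path)
  env.close()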
def replay(load_path, num_time_steps, render=False):
  config = tf.ConfigProto(
      allow_soft_placement=True,
      intra_op_parallelism_threads=_NUM_CPUS,
      inter_op_parallelism_threads=_NUM_CPUS)
  tf.Session(config=config).__enter__()
  sess = tf.get_default_session()

  training_env = build_environment(render=render)
  env = dummy_vec_env.DummyVecEnv([lambda: training_env])
  # Note: VecNormalize statistics are re-estimated here rather than restored
  # from training.
  env = vec_normalize.VecNormalize(env)

  # Rebuild the policy network and restore the trained parameters from the
  # joblib checkpoint written during training.
  policy = policies.MlpPolicy
  ob_space = env.observation_space
  ac_space = env.action_space
  num_batch_ac = env.num_envs
  model = policy(sess, ob_space, ac_space, num_batch_ac, 1, reuse=False)
  with tf.variable_scope('model'):
    trained_vars = tf.trainable_variables()
  loaded_vars = joblib.load(load_path)
  restore_ops = []
  for trained, loaded in zip(trained_vars, loaded_vars):
    restore_ops.append(trained.assign(loaded))
  sess.run(restore_ops)

  # Roll out the restored policy and accumulate reward.
  observations = env.reset()
  episode_reward = 0.0
  states = model.initial_state
  dones = [False for _ in range(env.num_envs)]
  for _ in range(num_time_steps):
    env.render()
    actions, values, states, neglogpacs = model.step(
        observations, states, dones)
    observations, rewards, dones, infos = env.step(actions)
    episode_reward += rewards
  print("Episode reward: {}".format(episode_reward))
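# A hypothetical driver for replay(): the checkpoint path is illustrative. In
# the baselines version this snippet targets, save_interval=10 in train()
# makes ppo2.learn write numbered joblib checkpoints under the logger
# directory's checkpoints/ subfolder, any of which can be passed as load_path.
def example_replay(load_path='/tmp/ppo_final_params'):
  replay(load_path=load_path, num_time_steps=1000, render=True)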