def test_ddpg_pendulum(self):
    """Test DDPG with the InvertedDoublePendulum-v2 environment."""
    logger._tensorboard = TensorBoardOutput()
    env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
    action_noise = OUStrategy(env.spec, sigma=0.2)
    policy = ContinuousMLPPolicy(env_spec=env.spec,
                                 hidden_sizes=[64, 64],
                                 hidden_nonlinearity=tf.nn.relu,
                                 output_nonlinearity=tf.nn.tanh)
    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=tf.nn.relu)
    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)
    algo = DDPG(
        env,
        policy=policy,
        policy_lr=1e-4,
        qf_lr=1e-3,
        qf=qf,
        replay_buffer=replay_buffer,
        plot=False,
        target_update_tau=1e-2,
        n_epochs=10,
        n_epoch_cycles=20,
        max_path_length=100,
        n_train_steps=50,
        discount=0.9,
        min_buffer_size=int(1e4),
        exploration_strategy=action_noise,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 30
def run_garage(env, seed, log_dir):
    """
    Create garage model and training.

    Replace the DDPG with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path of the tabular log file.
    """
    ext.set_seed(seed)

    with tf.Graph().as_default():
        # Set up params for DDPG.
        action_noise = OUStrategy(env, sigma=params["sigma"])

        actor_net = ContinuousMLPPolicy(
            env_spec=env,
            name="Actor",
            hidden_sizes=params["actor_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        critic_net = ContinuousMLPQFunction(
            env_spec=env,
            name="Critic",
            hidden_sizes=params["critic_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu)

        ddpg = DDPG(env,
                    actor=actor_net,
                    critic=critic_net,
                    actor_lr=params["actor_lr"],
                    critic_lr=params["critic_lr"],
                    plot=False,
                    target_update_tau=params["tau"],
                    n_epochs=params["n_epochs"],
                    n_epoch_cycles=params["n_epoch_cycles"],
                    n_rollout_steps=params["n_rollout_steps"],
                    n_train_steps=params["n_train_steps"],
                    discount=params["discount"],
                    replay_buffer_size=params["replay_buffer_size"],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    actor_optimizer=tf.train.AdamOptimizer,
                    critic_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment.
        tabular_log_file = osp.join(log_dir, "progress.csv")
        tensorboard_log_dir = osp.join(log_dir, "progress")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(tensorboard_log_dir)

        ddpg.train()

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file
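# The benchmark function above reads its hyperparameters from a module-level
# `params` dict that is not shown in this excerpt. Below is a minimal sketch of
# what that dict could look like: the keys are taken from the function body
# above, but the concrete values are illustrative assumptions, not from the
# source.
params = {
    "actor_hidden_sizes": [64, 64],
    "critic_hidden_sizes": [64, 64],
    "actor_lr": 1e-4,
    "critic_lr": 1e-3,
    "sigma": 0.2,
    "tau": 1e-2,
    "n_epochs": 500,
    "n_epoch_cycles": 20,
    "n_rollout_steps": 100,
    "n_train_steps": 50,
    "discount": 0.9,
    "replay_buffer_size": int(1e6),
}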
def run_task(*_):
    """
    Wrap DDPG training task in the run_task function.

    :param _:
    :return:
    """
    env = TfEnv(gym.make('FetchReach-v1'))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = ContinuousMLPPolicy(
        env_spec=env.spec,
        name="Policy",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
        input_include_goal=True,
    )

    qf = ContinuousMLPQFunction(
        env_spec=env.spec,
        name="QFunction",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        input_include_goal=True,
    )

    replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                    size_in_transitions=int(1e6),
                                    time_horizon=100,
                                    replay_k=0.4,
                                    reward_fun=env.compute_reward)

    ddpg = DDPG(
        env,
        policy=policy,
        policy_lr=1e-3,
        qf_lr=1e-3,
        qf=qf,
        replay_buffer=replay_buffer,
        plot=False,
        target_update_tau=0.05,
        n_epochs=50,
        n_epoch_cycles=20,
        max_path_length=100,
        n_train_steps=40,
        discount=0.9,
        exploration_strategy=action_noise,
        policy_optimizer=tf.train.AdamOptimizer,
        qf_optimizer=tf.train.AdamOptimizer,
        buffer_batch_size=256,
        input_include_goal=True,
    )

    ddpg.train()
def run_task(*_):
    """
    Wrap DDPG training task in the run_task function.

    :param _:
    :return:
    """
    env = gym.make('FetchReach-v1')

    action_noise = OUStrategy(env, sigma=0.2)

    actor_net = ContinuousMLPPolicy(
        env_spec=env,
        name="Actor",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
        input_include_goal=True,
    )

    critic_net = ContinuousMLPQFunction(
        env_spec=env,
        name="Critic",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        input_include_goal=True,
    )

    ddpg = DDPG(
        env,
        actor=actor_net,
        actor_lr=1e-3,
        critic_lr=1e-3,
        critic=critic_net,
        plot=False,
        target_update_tau=0.05,
        n_epochs=50,
        n_epoch_cycles=20,
        n_rollout_steps=100,
        n_train_steps=40,
        discount=0.9,
        replay_buffer_size=int(1e6),
        min_buffer_size=int(1e4),
        exploration_strategy=action_noise,
        actor_optimizer=tf.train.AdamOptimizer,
        critic_optimizer=tf.train.AdamOptimizer,
        use_her=True,
        batch_size=256,
        clip_obs=200.,
    )

    ddpg.train()
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()

    inner_env = SimpleReacherEnv(
        goal_position=(0.5, 0, 0.15),
        control_method="position_control",
        completion_bonus=2.,
        action_scale=0.04,
    )

    # Load a pre-trained latent-conditioned policy from a snapshot and wrap
    # the reacher environment so DDPG acts through the embedded policy.
    latent_policy = joblib.load(latent_policy_pkl)["policy"]
    env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

    action_noise = OUStrategy(env, sigma=0.2)

    actor_net = ContinuousMLPPolicy(
        env_spec=env.spec,
        name="Actor",
        hidden_sizes=[64, 32],
        hidden_nonlinearity=tf.nn.relu)

    critic_net = ContinuousMLPQFunction(
        env_spec=env,
        name="Critic",
        hidden_sizes=[64, 32],
        hidden_nonlinearity=tf.nn.relu)

    ddpg = DDPG(
        env,
        actor=actor_net,
        actor_lr=1e-4,
        critic_lr=1e-3,
        critic=critic_net,
        plot=True,
        target_update_tau=1e-2,
        n_epochs=500,
        n_epoch_cycles=10,
        n_rollout_steps=100,
        n_train_steps=50,
        discount=0.9,
        replay_buffer_size=int(1e6),
        min_buffer_size=int(1e3),
        exploration_strategy=action_noise,
        actor_optimizer=tf.train.AdamOptimizer,
        critic_optimizer=tf.train.AdamOptimizer)

    ddpg.train(sess=sess)
def run_task(*_):
    """
    Wrap DDPG training task in the run_task function.

    :param _:
    :return:
    """
    env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = ContinuousMLPPolicy(env_spec=env.spec,
                                 hidden_sizes=[64, 64],
                                 hidden_nonlinearity=tf.nn.relu,
                                 output_nonlinearity=tf.nn.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=tf.nn.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    ddpg = DDPG(env,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                plot=False,
                target_update_tau=1e-2,
                n_epochs=500,
                n_epoch_cycles=20,
                max_path_length=100,
                n_train_steps=50,
                discount=0.9,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                policy_optimizer=tf.train.AdamOptimizer,
                qf_optimizer=tf.train.AdamOptimizer)

    ddpg.train()
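# The launcher above is meant to be handed to garage's experiment runner
# rather than called directly. Below is a minimal sketch of that invocation,
# assuming the `run_experiment` helper used by garage examples of this era;
# the exact import path and keyword arguments depend on the garage version
# and are an assumption here, not taken from this excerpt.
from garage.misc.instrument import run_experiment

run_experiment(
    run_task,
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    plot=False,
)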
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()

    # Load a pre-trained latent-conditioned policy from a snapshot and wrap
    # the point environment so DDPG acts through the embedded policy.
    latent_policy = joblib.load(latent_policy_pkl)["policy"]
    inner_env = SequencePointEnv(completion_bonus=100)
    env = TfEnv(AlmostContinuousEmbeddedPolicyEnv(inner_env, latent_policy))

    action_noise = OUStrategy(env, sigma=0.8)

    actor_net = ContinuousMLPPolicy(
        env_spec=env,
        name="Actor",
        hidden_sizes=[64, 64],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh)

    critic_net = ContinuousMLPQFunction(
        env_spec=env,
        name="Critic",
        hidden_sizes=[64, 64],
        hidden_nonlinearity=tf.nn.relu)

    ddpg = DDPG(
        env,
        actor=actor_net,
        actor_lr=1e-4,
        critic_lr=1e-3,
        critic=critic_net,
        plot=False,
        target_update_tau=1e-2,
        n_epochs=500,
        n_epoch_cycles=100,
        n_rollout_steps=50,
        n_train_steps=50,
        discount=0.9,
        replay_buffer_size=int(1e6),
        min_buffer_size=int(1e4),
        exploration_strategy=action_noise,
        actor_optimizer=tf.train.AdamOptimizer,
        critic_optimizer=tf.train.AdamOptimizer)

    ddpg.train(sess=sess)
def run_task(*_):
    """
    Wrap DDPG training task in the run_task function.

    :param _:
    :return:
    """
    env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

    action_noise = OUStrategy(env, sigma=0.2)

    actor_net = ContinuousMLPPolicy(env_spec=env.spec,
                                    name="Actor",
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu,
                                    output_nonlinearity=tf.nn.tanh)

    critic_net = ContinuousMLPQFunction(env_spec=env.spec,
                                        name="Critic",
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)

    ddpg = DDPG(env,
                actor=actor_net,
                actor_lr=1e-4,
                critic_lr=1e-3,
                critic=critic_net,
                plot=False,
                target_update_tau=1e-2,
                n_epochs=500,
                n_epoch_cycles=20,
                n_rollout_steps=100,
                n_train_steps=50,
                discount=0.9,
                replay_buffer_size=int(1e6),
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                actor_optimizer=tf.train.AdamOptimizer,
                critic_optimizer=tf.train.AdamOptimizer)

    ddpg.train()
def run_garage(env, seed, log_dir):
    """
    Create garage model and training.

    Replace the DDPG with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path of the tabular log file.
    """
    ext.set_seed(seed)

    with tf.Graph().as_default():
        env = TfEnv(env)

        # Set up params for DDPG.
        action_noise = OUStrategy(env.spec, sigma=params["sigma"])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name="Policy",
            hidden_sizes=params["policy_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name="QFunction",
            hidden_sizes=params["qf_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params["replay_buffer_size"],
            time_horizon=params["n_rollout_steps"])

        ddpg = DDPG(
            env,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params["policy_lr"],
            qf_lr=params["qf_lr"],
            plot=False,
            target_update_tau=params["tau"],
            n_epochs=params["n_epochs"],
            n_epoch_cycles=params["n_epoch_cycles"],
            max_path_length=params["n_rollout_steps"],
            n_train_steps=params["n_train_steps"],
            discount=params["discount"],
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment.
        tabular_log_file = osp.join(log_dir, "progress.csv")
        tensorboard_log_dir = osp.join(log_dir, "progress")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(tensorboard_log_dir)

        ddpg.train()

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file
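# This variant of run_garage also expects a module-level `params` dict,
# analogous to the sketch after the first benchmark but keyed with
# policy_hidden_sizes/qf_hidden_sizes/policy_lr/qf_lr instead of the
# actor_/critic_ names. A benchmark harness would typically call it once per
# environment and seed; below is a minimal single-trial sketch, where the
# environment id, seed, and log directory are illustrative assumptions.
import gym

env = gym.make("HalfCheetah-v2")
csv_path = run_garage(env, seed=1, log_dir="/tmp/ddpg_benchmark/trial_1")
print("progress written to", csv_path)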