def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        plot=True,
    )
    algo.train()

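# In rllab/garage-style examples, a run_task function such as the one above is
# not called directly; it is passed to the run_experiment launcher, which
# spawns the sampler workers and configures logging. The sketch below assumes
# the 2018-era garage import path; older rllab code uses
# rllab.misc.instrument.run_experiment_lite instead, and the keyword arguments
# shown (n_parallel, snapshot_mode, seed, plot) are the usual example defaults,
# not values taken from this file.
from garage.misc.instrument import run_experiment

run_experiment(
    run_task,
    # Number of parallel sampling workers.
    n_parallel=1,
    # Keep only the snapshot from the last iteration.
    snapshot_mode="last",
    # Random seed for the experiment.
    seed=1,
    # Enable the plotter (works together with plot=True inside run_task).
    plot=True,
)
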
def run_task(*_):
    env = normalize(
        OneHotMultiTaskEnv(
            task_env_cls=PR2ArmClockEnv,
            task_args=TASK_ARGS,
            task_kwargs=TASK_KWARGS))

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=4000,
        min_pool_size=10000,
        n_epochs=1000000000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        plot=True,
    )
    algo.train()

def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name="Policy",
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name="QFunction",
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=int(1e6),
            time_horizon=100,
            replay_k=0.4,
            reward_fun=env.compute_reward)

        ddpg = DDPG(
            env,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            plot=False,
            target_update_tau=0.05,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, n_epoch_cycles=20)

def run_task(*_):
    sess = tf.Session()
    sess.__enter__()

    with LocalRunner(sess=sess) as runner:
        inner_env = SimpleReacherEnv(
            goal_position=(0.5, 0, 0.15),
            control_method="position_control",
            completion_bonus=2.,
            action_scale=0.04,
        )

        latent_policy = joblib.load(latent_policy_pkl)["policy"]

        env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

        action_noise = OUStrategy(env, sigma=0.2)

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name="Actor",
            hidden_sizes=[64, 32],
            hidden_nonlinearity=tf.nn.relu,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name="Critic",
            hidden_sizes=[64, 32],
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=int(1e6),
            time_horizon=100)

        algo = DDPG(
            env,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            plot=True,
            target_update_tau=1e-2,
            n_epochs=500,
            n_train_steps=50,
            discount=0.9,
            replay_buffer=replay_buffer,
            min_buffer_size=int(1e3),
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer)

        runner.setup(algo, env)
        runner.train(n_epochs=500, plot=False, n_epoch_cycles=10)

def run_task(*_):
    with LocalRunner() as runner:
        env = SimpleReacherEnv(
            goal_position=(0.5, 0, 0.15),
            control_method="position_control",
            completion_bonus=2.,
            # action_scale=0.04,
        )

        env = TfEnv(env)

        action_noise = OUStrategy(env, sigma=0.05)

        actor_net = ContinuousMLPPolicy(
            env_spec=env.spec,
            name="Actor",
            hidden_sizes=[200, 100],
            hidden_nonlinearity=tf.nn.relu,
        )

        critic_net = ContinuousMLPQFunction(
            env_spec=env.spec,
            name="Critic",
            hidden_sizes=[200, 100],
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=int(1e6),
            time_horizon=100)

        ddpg = DDPG(
            env,
            policy=actor_net,
            policy_lr=1e-4,
            qf=critic_net,
            qf_lr=1e-3,
            replay_buffer=replay_buffer,
            target_update_tau=1e-2,
            max_path_length=200,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer)

        runner.setup(ddpg, env)
        runner.train(n_epochs=500, n_epoch_cycles=10, plot=False)

def test_ddpg(self):
    env = TheanoEnv(CartpoleEnv())
    policy = DeterministicMLPPolicy(env.spec)
    qf = ContinuousMLPQFunction(env.spec)
    es = OUStrategy(env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        qf=qf,
        es=es,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        replay_pool_size=1000,
        eval_samples=100,
    )
    algo.train()

def run_task(*_): """ Wrap DDPG training task in the run_task function. :param _: :return: """ env = TfEnv(gym.make('InvertedDoublePendulum-v2')) action_noise = OUStrategy(env.spec, sigma=0.2) policy = ContinuousMLPPolicy(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh) qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=tf.nn.relu) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=100) ddpg = DDPG(env, policy=policy, policy_lr=1e-4, qf_lr=1e-3, qf=qf, replay_buffer=replay_buffer, plot=False, target_update_tau=1e-2, n_epochs=500, n_epoch_cycles=20, max_path_length=100, n_train_steps=50, discount=0.9, min_buffer_size=int(1e4), exploration_strategy=action_noise, policy_optimizer=tf.train.AdamOptimizer, qf_optimizer=tf.train.AdamOptimizer) ddpg.train()
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()

    latent_policy = joblib.load(latent_policy_pkl)["policy"]

    inner_env = SequencePointEnv(completion_bonus=100)
    env = TfEnv(AlmostContinuousEmbeddedPolicyEnv(inner_env, latent_policy))

    action_noise = OUStrategy(env, sigma=0.8)

    actor_net = ContinuousMLPPolicy(
        env_spec=env,
        name="Actor",
        hidden_sizes=[64, 64],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh)

    critic_net = ContinuousMLPQFunction(
        env_spec=env,
        name="Critic",
        hidden_sizes=[64, 64],
        hidden_nonlinearity=tf.nn.relu)

    ddpg = DDPG(
        env,
        actor=actor_net,
        actor_lr=1e-4,
        critic_lr=1e-3,
        critic=critic_net,
        plot=False,
        target_update_tau=1e-2,
        n_epochs=500,
        n_epoch_cycles=100,
        n_rollout_steps=50,
        n_train_steps=50,
        discount=0.9,
        replay_buffer_size=int(1e6),
        min_buffer_size=int(1e4),
        exploration_strategy=action_noise,
        actor_optimizer=tf.train.AdamOptimizer,
        critic_optimizer=tf.train.AdamOptimizer,
    )

    ddpg.train(sess=sess)

def test_ddpg(self):
    env = TheanoEnv(CartpoleEnv())
    policy = DeterministicMLPPolicy(env.spec)
    qf = ContinuousMLPQFunction(env.spec)
    es = OUStrategy(env.spec)
    replay_buffer = SimpleReplayBuffer(
        env_spec=env.spec,
        size_in_transitions=int(1000),
        time_horizon=100)
    algo = DDPG(
        env=env,
        policy=policy,
        qf=qf,
        es=es,
        pool=replay_buffer,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        eval_samples=100,
    )
    algo.train()

def test_ddpg_pendulum(self):
    """Test DDPG with the InvertedDoublePendulum environment."""
    logger.reset()
    with LocalRunner(self.sess) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=[64, 64],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=[64, 64],
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=int(1e6),
            time_horizon=100)

        algo = DDPG(
            env,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
        )

        runner.setup(algo, env)

        last_avg_ret = runner.train(
            n_epochs=10, n_epoch_cycles=20, batch_size=100)
        assert last_avg_ret > 60

        env.close()

def run_garage(env, seed, log_dir):
    """
    Create garage model and training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    """
    deterministic.set_seed(seed)

    with LocalRunner() as runner:
        env = TfEnv(env)

        # Set up params for ddpg
        action_noise = OUStrategy(env.spec, sigma=params["sigma"])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=params["policy_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params["qf_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params["replay_buffer_size"],
            time_horizon=params["n_rollout_steps"])

        ddpg = DDPG(
            env,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params["policy_lr"],
            qf_lr=params["qf_lr"],
            target_update_tau=params["tau"],
            n_train_steps=params["n_train_steps"],
            discount=params["discount"],
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        tensorboard_log_dir = osp.join(log_dir)
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(tensorboard_log_dir)

        runner.setup(ddpg, env)
        runner.train(
            n_epochs=params['n_epochs'],
            n_epoch_cycles=params['n_epoch_cycles'],
            batch_size=params["n_rollout_steps"])

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file

def run_garage(env, seed, log_dir):
    """
    Create garage model and training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    """
    ext.set_seed(seed)

    with tf.Graph().as_default():
        env = TfEnv(env)

        action_noise = OUStrategy(env.spec, sigma=params["sigma"])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name="Policy",
            hidden_sizes=params["policy_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name="QFunction",
            hidden_sizes=params["qf_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params["replay_buffer_size"],
            time_horizon=params["n_rollout_steps"],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params["policy_lr"],
            qf_lr=params["qf_lr"],
            plot=False,
            target_update_tau=params["tau"],
            n_epochs=params["n_epochs"],
            n_epoch_cycles=params["n_epoch_cycles"],
            max_path_length=params["n_rollout_steps"],
            n_train_steps=params["n_train_steps"],
            discount=params["discount"],
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(log_dir)

        algo.train()

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file

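# The run_garage benchmark helpers above return the path of the CSV progress
# file they write; a benchmark script typically calls them once per trial with
# a fresh seed, environment instance, and log directory. A hypothetical driver
# loop is sketched below. The task id, number of trials, and the
# ./benchmark_ddpg directory are illustrative assumptions, not values from the
# original scripts; note that the HER variant (the second run_garage) requires
# a goal-conditioned environment such as FetchReach-v1.
import os.path as osp
import random

import gym

task = "FetchReach-v1"  # assumed benchmark environment id
n_trials = 3            # assumed number of trials per task

for trial in range(n_trials):
    seed = random.randint(0, 999)
    env = gym.make(task)
    log_dir = osp.join("./benchmark_ddpg", task,
                       "trial_%d_seed_%d" % (trial + 1, seed))
    garage_csv = run_garage(env, seed, log_dir)
    env.close()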