def test_ddpg_double_pendulum(self):
    """Test DDPG with InvertedDoublePendulum-v2 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
        action_noise = OUStrategy(env.spec, sigma=0.2)
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e5),
                                           time_horizon=100)
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            steps_per_epoch=20,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(5e3),
            exploration_strategy=action_noise,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 60
        env.close()

def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path of the tabular log file.
    '''
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))
        # Set up params for ddpg
        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=params['qf_hidden_sizes'],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=params['steps_per_epoch'],
                    policy_lr=params['policy_lr'],
                    qf_lr=params['qf_lr'],
                    target_update_tau=params['tau'],
                    n_train_steps=params['n_train_steps'],
                    discount=params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        tensorboard_log_dir = osp.join(log_dir)
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(tensorboard_log_dir))

        runner.setup(ddpg, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file

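# run_garage above reads its hyperparameters from a module-level `params`
# dict that is not shown in this snippet. A minimal sketch of that dict,
# covering exactly the keys the function accesses; the values below are
# illustrative assumptions, not the benchmark's actual settings.
params = {
    'policy_lr': 1e-4,
    'qf_lr': 1e-3,
    'policy_hidden_sizes': (64, 64),
    'qf_hidden_sizes': (64, 64),
    'n_epochs': 300,
    'steps_per_epoch': 20,
    'n_rollout_steps': 100,
    'n_train_steps': 50,
    'discount': 0.9,
    'tau': 1e-2,
    'sigma': 0.2,
    'replay_buffer_size': int(1e6),
}
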
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100,
                                        replay_k=0.4,
                                        reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            steps_per_epoch=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, batch_size=100)

def her_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow HER model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make(env_id)))

        action_noise = OUStrategy(env.spec, sigma=hyper_parameters['sigma'])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=hyper_parameters['replay_buffer_size'],
            time_horizon=hyper_parameters['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            steps_per_epoch=hyper_parameters['steps_per_epoch'],
            policy_lr=hyper_parameters['policy_lr'],
            qf_lr=hyper_parameters['qf_lr'],
            target_update_tau=hyper_parameters['tau'],
            n_train_steps=hyper_parameters['n_train_steps'],
            discount=hyper_parameters['discount'],
            exploration_strategy=action_noise,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            buffer_batch_size=256,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['n_rollout_steps'])

def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env_name = "HalfCheetah-v3"
        # env_name = "Swimmer-v2"
        # env_name = "InvertedDoublePendulum-v2"
        env = TfEnv(gym.make(env_name))

        action_noise = OUStrategy(env.spec, sigma=0.2)
        # action_noise = GaussianStrategy(env.spec)

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    hidden_nonlinearity=tf.nn.relu)

        reward_model = ContinuousMLPRewardFunction(
            env_spec=env.spec,
            hidden_sizes=[400, 300],
            hidden_nonlinearity=tf.nn.relu,
            action_merge_layer=0)

        obs_model = ContinuousMLPObsFunction(env_spec=env.spec,
                                             hidden_sizes=[400, 300],
                                             hidden_nonlinearity=tf.nn.relu,
                                             action_merge_layer=0)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)

        jole_ddpg = JoLeDDPG(env_spec=env.spec,
                             env_name=env_name,
                             policy=policy,
                             policy_lr=1e-4,
                             qf_lr=1e-3,
                             qf=qf,
                             reward_model=reward_model,
                             obs_model=obs_model,
                             replay_buffer=replay_buffer,
                             target_update_tau=1e-2,
                             n_train_steps=50,
                             discount=0.99,
                             min_buffer_size=int(1e4),
                             exploration_strategy=action_noise,
                             policy_optimizer=tf.train.AdamOptimizer,
                             qf_optimizer=tf.train.AdamOptimizer,
                             lambda_ratio=0.01,
                             jole_obs_action_type="random_sample")

        runner.setup(algo=jole_ddpg, env=env)
        runner.train(n_epochs=500, n_epoch_cycles=20, batch_size=100)

def her_ddpg_fetchreach(ctxt=None, seed=1):
    """Train DDPG + HER on the goal-conditioned FetchReach env.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
        )

        replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100,
                                        replay_k=0.4,
                                        reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            steps_per_epoch=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            buffer_batch_size=256,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, batch_size=100)

def test_no_reset(self):
    with LocalRunner(sess=self.sess) as runner:
        # This tests that the off-policy sampler respects batch_size
        # when no_reset is set to True.
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        action_noise = OUStrategy(env.spec, sigma=0.2)
        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=[64, 64],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
        )

        sampler = OffPolicyVectorizedSampler(algo, env, 1, no_reset=True)
        sampler.start_worker()

        runner.initialize_tf_vars()

        paths1 = sampler.obtain_samples(0, 5)
        paths2 = sampler.obtain_samples(0, 5)

        len1 = sum([len(path['rewards']) for path in paths1])
        len2 = sum([len(path['rewards']) for path in paths2])

        assert len1 == 5 and len2 == 5, 'Sampler should respect batch_size'
        # yapf: disable
        assert (len(paths1[0]['rewards']) + len(paths2[0]['rewards']) ==
                paths2[0]['running_length']), (
            'Running length should be the length of full path')
        # yapf: enable
        assert np.isclose(
            paths1[0]['rewards'].sum() + paths2[0]['rewards'].sum(),
            paths2[0]['undiscounted_return']), (
            'Undiscounted_return should be the sum of rewards of full path')

def continuous_mlp_policy(ctxt, env_id, seed):
    """Create Continuous MLP Policy on TF-DDPG.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt, max_cpus=12) as runner:
        env = TfEnv(normalize(gym.make(env_id)))

        action_noise = OUStrategy(env.spec, sigma=hyper_params['sigma'])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='ContinuousMLPPolicy',
            hidden_sizes=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            name='ContinuousMLPQFunction')

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=hyper_params['replay_buffer_size'],
            time_horizon=hyper_params['n_rollout_steps'])

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=hyper_params['steps_per_epoch'],
                    policy_lr=hyper_params['policy_lr'],
                    qf_lr=hyper_params['qf_lr'],
                    target_update_tau=hyper_params['tau'],
                    n_train_steps=hyper_params['n_train_steps'],
                    discount=hyper_params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                    qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        runner.setup(ddpg, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['n_rollout_steps'])

def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('HalfCheetah-v3'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    hidden_nonlinearity=tf.nn.relu)

        reward_model = ContinuousMLPRewardFunction(
            env_spec=env.spec,
            hidden_sizes=[400, 300],
            hidden_nonlinearity=tf.nn.relu)

        obs_model = ContinuousMLPObsFunction(env_spec=env.spec,
                                             hidden_sizes=[400, 300],
                                             hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)

        ima_replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e6),
                                               time_horizon=100)

        dyna_ddpg = DynaDDPG(env_spec=env.spec,
                             policy=policy,
                             policy_lr=1e-4,
                             qf_lr=1e-3,
                             qf=qf,
                             reward_model=reward_model,
                             obs_model=obs_model,
                             replay_buffer=replay_buffer,
                             ima_replay_buffer=ima_replay_buffer,
                             target_update_tau=1e-2,
                             n_train_steps=50,
                             discount=0.99,
                             min_buffer_size=int(1e4),
                             exploration_strategy=action_noise,
                             policy_optimizer=tf.train.AdamOptimizer,
                             qf_optimizer=tf.train.AdamOptimizer)

        runner.setup(algo=dyna_ddpg, env=env)
        runner.train(n_epochs=500, n_epoch_cycles=20, batch_size=100)

def run_task(*_):
    """Set up and train DDPG with HER on the FetchReach-v1 environment."""
    with LocalRunner() as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=int(1e6),
            time_horizon=100,
            replay_k=0.4,
            reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            n_epoch_cycles=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, n_epoch_cycles=20)

def ddpg_pendulum(ctxt=None, seed=1, lr=1e-4):
    """Train DDPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        lr (float): Learning rate for policy optimization.

    """
    set_seed(seed)
    runner = LocalRunner(ctxt)
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    policy_optimizer = (torch.optim.Adagrad, {'lr': lr, 'lr_decay': 0.99})

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                discount=0.9,
                policy_optimizer=policy_optimizer,
                qf_optimizer=torch.optim.Adam)

    runner.setup(algo=ddpg, env=env)

    runner.train(n_epochs=500, batch_size=100)

def ddpg_pendulum(ctxt=None, seed=1):
    """Train DDPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    policy_lr=1e-4,
                    qf_lr=1e-3,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=20,
                    target_update_tau=1e-2,
                    n_train_steps=50,
                    discount=0.9,
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                    qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=500, batch_size=100)

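# Experiment functions with the (ctxt=None, seed=1) signature, such as
# ddpg_pendulum above, are normally made runnable through garage's
# wrap_experiment decorator, which supplies the experiment context. A minimal
# launcher sketch, assuming this garage version exports wrap_experiment from
# the top-level package (in the upstream examples it is written as a
# decorator directly above the def):
from garage import wrap_experiment

if __name__ == '__main__':
    # Equivalent to decorating the definition with @wrap_experiment.
    ddpg_pendulum = wrap_experiment(ddpg_pendulum)
    ddpg_pendulum(seed=1)
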
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    runner = LocalRunner(snapshot_config)
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    policy_optimizer = (torch.optim.Adagrad, {'lr': 1e-4, 'lr_decay': 0.99})

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                discount=0.9,
                policy_optimizer=policy_optimizer,
                qf_optimizer=torch.optim.Adam)

    runner.setup(algo=ddpg, env=env)

    runner.train(n_epochs=500, batch_size=100)

def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    policy_lr=1e-4,
                    qf_lr=1e-3,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=20,
                    target_update_tau=1e-2,
                    n_train_steps=50,
                    discount=0.9,
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=500, batch_size=100)

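# run_task functions that take (snapshot_config, *_), like the one above, are
# typically launched through garage's run_experiment helper, which builds the
# snapshot configuration and then invokes the task function. A minimal
# launcher sketch, assuming run_experiment is importable from
# garage.experiment as in the older garage examples:
from garage.experiment import run_experiment

run_experiment(
    run_task,
    snapshot_mode='last',
    seed=1,
)
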
def test_ddpg_pendulum(self):
    """Test DDPG with InvertedPendulum-v2 environment.

    This environment has a [-3, 3] action_space bound.
    """
    runner = LocalRunner()
    env = GarageEnv(normalize(gym.make('InvertedPendulum-v2')))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    algo = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                policy_lr=1e-4,
                qf_lr=1e-3,
                discount=0.9)

    runner.setup(algo, env)
    last_avg_ret = runner.train(n_epochs=10,
                                n_epoch_cycles=20,
                                batch_size=100)
    assert last_avg_ret > 10

    env.close()

def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicyWithModel(env_spec=env.spec,
                                              hidden_sizes=[64, 64],
                                              hidden_nonlinearity=tf.nn.relu,
                                              output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    policy_lr=1e-4,
                    qf_lr=1e-3,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    target_update_tau=1e-2,
                    n_train_steps=50,
                    discount=0.9,
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=500, n_epoch_cycles=20, batch_size=100)

def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task."""
    runner = LocalRunner(snapshot_config)
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                policy_lr=1e-4,
                qf_lr=1e-3,
                discount=0.9,
                optimizer=torch.optim.Adam)

    runner.setup(algo=ddpg, env=env)

    runner.train(n_epochs=500, n_epoch_cycles=20, batch_size=100)

def test_ddpg_double_pendulum(self):
    """Test DDPG with InvertedDoublePendulum-v2 environment."""
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    algo = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                discount=0.9)

    runner.setup(algo, env)
    last_avg_ret = runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 45

    env.close()

def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path of the tabular log file.
    '''
    deterministic.set_seed(seed)
    env.reset()

    with LocalRunner() as runner:
        env = TfEnv(normalize(env))

        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params['policy_lr'],
            qf_lr=params['qf_lr'],
            target_update_tau=params['tau'],
            n_train_steps=params['n_train_steps'],
            discount=params['discount'],
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        logger.add_output(dowel.StdOutput())
        logger.add_output(dowel.CsvOutput(tabular_log_file))
        logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params['n_rollout_steps'])

        logger.remove_all()

        return tabular_log_file

def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('HalfCheetah-v3'))
        env = ReverseAction(env)

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    hidden_nonlinearity=tf.nn.relu)

        reward_model_generator = CVAERewardGenerator(
            env_spec=env.spec,
            hidden_sizes=[400, 300],
            hidden_nonlinearity=tf.nn.relu,
            z_dim=1)

        reward_model_recognition = CVAERewardRecognition(
            env_spec=env.spec,
            hidden_sizes=[400, 300],
            hidden_nonlinearity=tf.nn.relu,
            z_dim=1)

        obs_model_generator = CVAEObsGenerator(env_spec=env.spec,
                                               hidden_sizes=[400, 300],
                                               hidden_nonlinearity=tf.nn.relu,
                                               z_dim=1)

        obs_model_recognition = CVAEObsRecognition(
            env_spec=env.spec,
            hidden_sizes=[400, 300],
            hidden_nonlinearity=tf.nn.relu,
            z_dim=1)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)

        jole_ddpg = JoLeDDPGStochastic(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            reward_model_generator=reward_model_generator,
            reward_model_recognition=reward_model_recognition,
            obs_model_generator=obs_model_generator,
            obs_model_recognition=obs_model_recognition,
            replay_buffer=replay_buffer,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.99,
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            z_dim=1)

        runner.setup(algo=jole_ddpg, env=env)
        runner.train(n_epochs=500, n_epoch_cycles=20, batch_size=100)

def run_garage(env, seed, log_dir):
    """Create garage model and training.

    Replace the ddpg with the algorithm you want to run.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Log file path.

    """
    deterministic.set_seed(seed)

    config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                      intra_op_parallelism_threads=12,
                                      inter_op_parallelism_threads=12)
    sess = tf.compat.v1.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))
        # Set up params for ddpg
        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='ContinuousMLPPolicy',
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=params['qf_hidden_sizes'],
                                    hidden_nonlinearity=tf.nn.relu,
                                    name='ContinuousMLPQFunction')

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=params['steps_per_epoch'],
                    policy_lr=params['policy_lr'],
                    qf_lr=params['qf_lr'],
                    target_update_tau=params['tau'],
                    n_train_steps=params['n_train_steps'],
                    discount=params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                    qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(ddpg, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file