def test_target_params_copied(self):
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
    )
    target_qf = algo.target_qf
    target_policy = algo.target_policy
    qf = algo.qf
    qf_copy = algo.qf_with_action_input
    policy = algo.policy

    # Make sure they're different to start
    random_values = [
        np.random.rand(*values.shape)
        for values in qf.get_param_values()
    ]
    qf.set_param_values(random_values)
    random_values = [
        np.random.rand(*values.shape)
        for values in policy.get_param_values()
    ]
    policy.set_param_values(random_values)

    self.assertParamsNotEqual(target_qf, qf)
    self.assertParamsNotEqual(target_policy, policy)
    self.assertParamsEqual(qf_copy, qf)

    algo.train()

    self.assertParamsEqual(target_qf, qf)
    self.assertParamsEqual(target_policy, policy)
    self.assertParamsEqual(qf_copy, qf)
def test_qf_targets(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    rewards = np.array([3., 4.])
    terminals = np.array([0., 0.])
    obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])
    actions = np.array([[-0.5], [-0.5]])
    next_obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])

    # target = reward + discount * target_qf(next_obs,
    #                                        target_policy(next_obs))
    # target1 = 3 + 0.5 * Q([1,1,1,1], u([1,1,1,1]))
    #         = 3 + 0.5 * Q([1,1,1,1], 4)
    #         = 3 + 0.5 * 8
    #         = 7
    # target2 = 8
    feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                   next_obs)
    self.assertNpEqual(
        np.array([[7.], [8.]]),
        algo.sess.run(algo.ys, feed_dict=feed_dict),
    )
def test_qf_targets2(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    rewards = np.array([3.5])
    terminals = np.array([0.])
    obs = np.array([[1., 1., 1., 1.]])
    actions = np.array([[2.]])
    next_obs = np.array([[2., 2., 2., 2.]])

    # target = reward + discount * target_qf(next_obs,
    #                                        target_policy(next_obs))
    # target = 3.5 + 0.5 * Q([2,2,2,2], u([2,2,2,2]))
    #        = 3.5 + 0.5 * Q([2,2,2,2], 8)
    #        = 3.5 + 0.5 * 16
    #        = 11.5
    feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                   next_obs)
    self.assertNpEqual(
        np.array([[11.5]]),
        algo.sess.run(algo.ys, feed_dict=feed_dict),
    )
def test_policy_gradient(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])

    # grad = -1/N sum_i dQ/da * da/dtheta
    #      = -1/2 sum_i 1 * [1,1,1,1]
    #      = -[1,1,1,1]
    feed_dict = algo._policy_feed_dict(obs)
    loss_grad_ops = tf.gradients(
        algo.policy_surrogate_loss,
        algo.policy.get_params_internal(),
    )
    actual_loss_grads = algo.sess.run(loss_grad_ops, feed_dict=feed_dict)
    actual_loss_grads_flat = np.vstack(actual_loss_grads).flatten()
    expected = [
        -1 * np.ones_like(v)
        for v in algo.policy.get_param_values()
    ]
    self.assertTrue(
        are_np_array_iterables_equal(actual_loss_grads_flat, expected))
def test_policy_gradient2(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    obs = np.array([[1., -10., 1., 2.], [1., 100., 1., 2.]])

    # grad = -1/N sum_i dQ/da * da/dtheta
    #      = -1/2 * 1 * [1,-10,1,2]
    #        + -1/2 * 1 * [1,100,1,2]
    #      = -[1., 45., 1., 2.]
    feed_dict = algo._policy_feed_dict(obs)
    loss_grad_ops = tf.gradients(
        algo.policy_surrogate_loss,
        algo.policy.get_params_internal(),
    )
    actual_loss_grads = algo.sess.run(loss_grad_ops, feed_dict=feed_dict)
    expected = [np.array([[-1.], [-45.], [-1.], [-2.]])]
    self.assertTrue(
        are_np_array_iterables_equal(actual_loss_grads, expected))
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=128,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=100,
    )
    exp_prefix = 'ddpg-cartpole-speed-{0}'.format(timestamp())
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **default_ddpg_params,
    )
    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix=exp_prefix,
        seed=1,
    )
def test_policy_surrogate_loss2(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    obs = np.array([[0., 1., 1., -11.], [5., 10., 10., -10.]])

    # loss = -1/N sum_i Q(s_i, u(s_i))
    #      = -1/2 * (Q([0,1,1,-11], u([0,1,1,-11]))
    #                + Q([5,10,10,-10], u([5,10,10,-10])))
    #      = -1/2 * (Q([0,1,1,-11], -9) + Q([5,10,10,-10], 15))
    #      = -1/2 * (-18 + 30)
    #      = -6
    feed_dict = algo._policy_feed_dict(obs)
    actual = algo.sess.run(algo.policy_surrogate_loss, feed_dict=feed_dict)
    self.assertEqual(actual, -6.)
    self.assertEqual(np.float32, type(actual))
def lstm_launcher(variant):
    """
    Run DDPG with a feedforward policy and critic on an environment.

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        **variant.get('qf_params', {})
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        **variant.get('policy_params', {})
    )
    algorithm = MyDDPG(
        env,
        es,
        policy,
        qf,
        **variant['algo_params']
    )
    algorithm.train()
def test_only_qf_values_change(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    old_qf_values = algo.qf.get_param_values()
    old_qf_copy_values = algo.qf_with_action_input.get_param_values()
    old_policy_values = algo.policy.get_param_values()
    old_target_qf_values = algo.target_qf.get_param_values()
    old_target_policy_values = algo.target_policy.get_param_values()

    rewards = np.array([3.])
    terminals = np.array([0.])
    obs = np.array([[1., 1., 1., 1.]])
    actions = np.array([[-0.5]])
    next_obs = np.array([[1., 1., 1., 1.]])
    feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                   next_obs)
    algo.sess.run(algo.train_qf_op, feed_dict=feed_dict)

    new_qf_values = algo.qf.get_param_values()
    new_qf_copy_values = algo.qf_with_action_input.get_param_values()
    new_policy_values = algo.policy.get_param_values()
    new_target_qf_values = algo.target_qf.get_param_values()
    new_target_policy_values = algo.target_policy.get_param_values()

    self.assertTrue(
        are_np_array_iterables_equal(old_policy_values, new_policy_values))
    self.assertFalse(
        are_np_array_iterables_equal(old_qf_values, new_qf_values))
    self.assertFalse(
        are_np_array_iterables_equal(old_qf_copy_values,
                                     new_qf_copy_values))
    self.assertTrue(
        are_np_array_iterables_equal(old_target_policy_values,
                                     new_target_policy_values))
    self.assertTrue(
        are_np_array_iterables_equal(old_target_qf_values,
                                     new_target_qf_values))
    self.assertParamsEqual(algo.qf_with_action_input, algo.qf)
def test_target_params_hard_update(self):
    tau = 1
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        soft_target_tau=tau,
    )
    target_qf = algo.target_qf
    target_policy = algo.target_policy
    qf = algo.qf
    policy = algo.policy

    random_values = [
        np.random.rand(*values.shape)
        for values in qf.get_param_values()
    ]
    qf.set_param_values(random_values)
    random_values = [
        np.random.rand(*values.shape)
        for values in policy.get_param_values()
    ]
    policy.set_param_values(random_values)

    self.assertParamsNotEqual(target_qf, qf)
    self.assertParamsNotEqual(target_policy, policy)

    algo.sess.run(algo.update_target_policy_op)
    algo.sess.run(algo.update_target_qf_op)

    self.assertParamsEqual(target_qf, qf)
    self.assertParamsEqual(target_policy, policy)
def run_task(_):
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.gym_env import GymEnv

    def gym_env(name):
        return GymEnv(
            name,
            record_video=False,
            log_dir='/tmp/gym-test',  # Ignore gym log.
            record_log=False,
        )

    env = TfEnv(gym_env('AxeTwoDPoint-v0'))
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **ddpg_params
    )
    algorithm.train()
def my_ddpg_launcher(variant):
    """
    Run DDPG.

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    from railrl.core.tf_util import BatchNormConfig

    if ('batch_norm_params' in variant
            and variant['batch_norm_params'] is not None):
        bn_config = BatchNormConfig(**variant['batch_norm_params'])
    else:
        bn_config = None
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant.get('qf_params', {})
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant.get('policy_params', {})
    )
    algorithm = MyDDPG(
        env,
        es,
        policy,
        qf,
        variant['tensorboard'],
        batch_norm_config=bn_config,
        **variant['algo_params'],
    )
    algorithm.train()
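# Illustrative only: a minimal variant dict with the shape my_ddpg_launcher
# (and lstm_launcher above) expects. The hyperparameter values are made up,
# and the contents of env_params depend on get_env_settings, whose accepted
# keys are not shown here -- 'env_id' below is a hypothetical placeholder.
example_variant = dict(
    env_params=dict(env_id='cartpole'),  # hypothetical get_env_settings kwargs
    algo_params=dict(
        batch_size=128,
        n_epochs=50,
        soft_target_tau=0.01,
    ),
    qf_params=dict(),            # fall back to FeedForwardCritic defaults
    policy_params=dict(),        # fall back to FeedForwardPolicy defaults
    tensorboard='/tmp/tfboard',  # passed positionally to MyDDPG
    batch_norm_params=None,      # leave batch norm disabled
)
# my_ddpg_launcher(example_variant)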
def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    for seed in range(3):
        ddpg_params = dict(
            batch_size=128,
            n_epochs=100,
            epoch_length=10000,
            eval_samples=10000,
            discount=0.99,
            policy_learning_rate=1e-4,
            qf_learning_rate=1e-3,
            soft_target_tau=0.01,
            replay_pool_size=1000000,
            min_pool_size=256,
            scale_reward=1.0,
            max_path_length=1000,
            qf_weight_decay=0.0,
        )
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        vitchyr_policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        vitchyr_ddpg = DDPG(
            env,
            vitchyr_es,
            vitchyr_policy,
            vitchyr_qf,
            **ddpg_params
        )

        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(
            name="qf",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )
        shane_ddpg = ShaneDDPG(
            env,
            shane_policy,
            shane_qf,
            shane_es,
            **ddpg_params
        )

        names_and_algos = [
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        ]
        for name, algorithm in names_and_algos:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
def test_qf_gradient(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    rewards = np.array([3.5])
    terminals = np.array([0.])
    obs = np.array([[1., 1., 1., 1.]])
    actions = np.array([[1.]])
    next_obs = np.array([[2., 2., 2., 2.]])

    # target = reward + discount * target_qf(next_obs,
    #                                        target_policy(next_obs))
    # target = 3.5 + 0.5 * Q([2,2,2,2], u([2,2,2,2]))
    #        = 3.5 + 0.5 * Q([2,2,2,2], 8)
    #        = 3.5 + 0.5 * 16
    #        = 11.5
    #
    # dloss/dtheta = -2 * (y - qf(obs, action)) * d/dtheta (qf(obs, action))
    #              = -2 * (y - qf([1,1,1,1], 1)) * d/dtheta (qf(obs, action))
    #              = -2 * (11.5 - 5) * d/dtheta (qf(obs, action))
    #              = -13 * d/dtheta (qf(obs, action))
    feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                   next_obs)
    grads = tf.gradients(algo.qf_loss, algo.qf.get_params_internal())
    # qf_grads = algo.sess.run(
    #     tf.gradients(algo.qf.output, algo.qf.get_vars()))
    expected = [-13. * np.ones_like(v) for v in algo.qf.get_param_values()]
    actual = algo.sess.run(grads, feed_dict=feed_dict)
    actual_flat = np.vstack(actual).flatten()
    self.assertTrue(are_np_array_iterables_equal(expected, actual_flat),
                    "Numpy arrays not equal")
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=32,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=1000,
    )
    sweeper = DeterministicHyperparameterSweeper(
        {'scale_reward': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]},
    )
    exp_prefix = 'ddpg-cart-reward-scale-sweep-{0}'.format(timestamp())
    for ddpg_params in sweeper.iterate_hyperparameters():
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            scale_reward=ddpg_params['scale_reward'],
            **default_ddpg_params,
        )
        for seed in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix=exp_prefix,
                seed=seed,
                # mode="local",
                # use_cloudpickle=True,
            )
def example(*_):
    env = HalfCheetahEnv()
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        n_epochs=25,
        batch_size=1024,
        replay_pool_size=10000,
    )
    algorithm.train()
def main():
    stub(globals())
    env = TfEnv(CartpoleEnv())
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    for seed in range(3):
        env.reset()
        run_experiment_lite(
            algorithm.train(),
            n_parallel=1,
            snapshot_mode="last",
            exp_prefix="test-qddpg-cartpole",
            seed=seed,
        )
def run_task(_):
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = OUStrategy(env_spec=env.spec)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        ddpg_params = dict(
            batch_size=16,
            n_epochs=100,
            epoch_length=100,
            eval_samples=100,
            max_path_length=10,
            min_pool_size=2,
        )
        algorithm = DDPG(env, es, policy, qf, **ddpg_params)
        algorithm.train()
def test_target_params_update(self):
    tau = 0.2
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        soft_target_tau=tau,
    )
    target_qf = algo.target_qf
    target_policy = algo.target_policy
    qf = algo.qf
    policy = algo.policy

    algo.train()

    orig_tc_vals = target_qf.get_param_values()
    orig_ta_vals = target_policy.get_param_values()
    orig_c_vals = qf.get_param_values()
    orig_a_vals = policy.get_param_values()
    algo.sess.run(algo.update_target_policy_op)
    algo.sess.run(algo.update_target_qf_op)
    new_tc_vals = target_qf.get_param_values()
    new_ta_vals = target_policy.get_param_values()

    for orig_tc_val, orig_c_val, new_tc_val in zip(orig_tc_vals,
                                                   orig_c_vals,
                                                   new_tc_vals):
        self.assertTrue(
            (new_tc_val == tau * orig_c_val
             + (1 - tau) * orig_tc_val).all())
    for orig_ta_val, orig_a_val, new_ta_val in zip(orig_ta_vals,
                                                   orig_a_vals,
                                                   new_ta_vals):
        self.assertTrue(
            (new_ta_val == tau * orig_a_val
             + (1 - tau) * orig_ta_val).all())
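# Illustrative sketch (not part of the test suite): the soft target update
# rule that test_target_params_update verifies, written out with plain numpy
# for a single parameter array. The numbers are made up for the example.
import numpy as np

tau = 0.2
source_param = np.array([1.0, 2.0, 3.0])   # e.g. current qf weights
target_param = np.array([0.0, 0.0, 0.0])   # e.g. current target-qf weights
# One update op: target <- tau * source + (1 - tau) * target
updated_target = tau * source_param + (1 - tau) * target_param
assert np.allclose(updated_target, [0.2, 0.4, 0.6])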
def test_qf_loss2(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    rewards = np.array([3.5])
    terminals = np.array([0.])
    obs = np.array([[1., 1., 1., 1.]])
    actions = np.array([[2.]])
    next_obs = np.array([[2., 2., 2., 2.]])

    # target = reward + discount * target_qf(next_obs,
    #                                        target_policy(next_obs))
    # target = 3.5 + 0.5 * Q([2,2,2,2], u([2,2,2,2]))
    #        = 3.5 + 0.5 * Q([2,2,2,2], 8)
    #        = 3.5 + 0.5 * 16
    #        = 11.5
    #
    # loss = (target - qf(obs, action))^2
    #      = (target - qf([1,1,1,1], 2))^2
    #      = (target - 6)^2
    #      = (11.5 - 6)^2
    #      = (5.5)^2
    #      = 30.25
    feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                   next_obs)
    actual = algo.sess.run(algo.qf_loss, feed_dict=feed_dict)
    self.assertEqual(30.25, actual)
    self.assertEqual(np.float32, type(actual))
def test_sum_policy(self):
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
    )
    obs = np.array([[1., 1., 1., 1.]])
    for policy in [algo.policy, algo.target_policy]:
        feed_dict = {
            policy.observation_input: obs,
        }
        self.assertEqual(
            np.sum(obs),
            algo.sess.run(policy.output, feed_dict=feed_dict))
def test_sum_qf(self):
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
    )
    obs = np.array([[1., 1., 1., 1.]])
    actions = np.array([[-0.5]])
    for qf in [algo.qf, algo.target_qf]:
        feed_dict = {
            qf.action_input: actions,
            qf.observation_input: obs,
        }
        self.assertEqual(
            np.sum(obs) + actions,
            algo.sess.run(qf.output, feed_dict=feed_dict))
def test_target_params_no_update(self):
    tau = 0
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        soft_target_tau=tau,
    )
    target_qf = algo.target_qf
    target_policy = algo.target_policy
    qf = algo.qf
    policy = algo.policy

    random_values = [
        np.random.rand(*values.shape)
        for values in qf.get_param_values()
    ]
    qf.set_param_values(random_values)
    random_values = [
        np.random.rand(*values.shape)
        for values in policy.get_param_values()
    ]
    policy.set_param_values(random_values)

    old_target_qf_values = target_qf.get_param_values()
    old_target_policy_values = target_policy.get_param_values()
    self.assertParamsNotEqual(target_qf, qf)
    self.assertParamsNotEqual(target_policy, policy)

    algo.sess.run(algo.update_target_policy_op)
    algo.sess.run(algo.update_target_qf_op)

    self.assertTrue(
        are_np_array_iterables_equal(old_target_qf_values,
                                     target_qf.get_param_values()))
    self.assertTrue(
        are_np_array_iterables_equal(old_target_policy_values,
                                     target_policy.get_param_values()))
    self.assertParamsNotEqual(target_qf, qf)
    self.assertParamsNotEqual(target_policy, policy)
def main():
    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy_initlr', type=float, default=1e-4)
    parser.add_argument('--qf_initlr', type=float, default=1e-3)
    parser.add_argument('--qf_decay', type=float, default=0.0)
    parser.add_argument('--qf_soft_tau', type=float, default=1e-3)
    # Exploration hyperparameters
    parser.add_argument('--ou_theta', type=float, default=0.15)
    parser.add_argument('--ou_sigma', type=float, default=0.3)
    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=1.0)
    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv(
        'Box3dReach-v11',
        record_video=False,
        log_dir='/tmp/gym_test',
        record_log=False,
    )))
    name = (
        'ddpg-state-v11-plr{0}-qlr{1}-tau{2}-qfdecay{3}'
        '-ou_theta{4}-ou_sigma{5}'.format(
            args.policy_initlr,
            args.qf_initlr,
            args.qf_soft_tau,
            args.qf_decay,
            args.ou_theta,
            args.ou_sigma,
        )
    )
    es = OUStrategy(env_spec=env.spec, theta=args.ou_theta,
                    sigma=args.ou_sigma)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        observation_hidden_sizes=(400, 300),
        env_spec=env.spec,
    )
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        embedded_hidden_sizes=(100, ),
        observation_hidden_sizes=(100, ),
    )
    algo = DDPG(
        env=env,
        exploration_strategy=es,
        policy=policy,
        qf=qf,
        tensorboard_path=os.path.join(
            args.tfboard_path, name, '_%d' % args.seed),
        qf_learning_rate=args.qf_initlr,
        policy_learning_rate=args.policy_initlr,
        soft_target_tau=args.qf_soft_tau,
        gpu_ratio=args.gpu_ratio,
    )
    run_experiment_lite(
        algo.train(),
        exp_prefix=name,
        n_parallel=1,
        snapshot_mode="last",
        seed=args.seed,
        mode="local",
    )
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        hidden_nonlinearity=tf.nn.tanh,
    )
    algo = DDPG(
        env,
        es,
        policy,
        qf,
        "/data0/dianchen/box3d/ddpg_box3d_state_v4_tf_policy_{0}_qf_{1}_gamma_{2}_tau_{3}".format(
            policy_lr,
            qf_lr,
            gamma,
            tau,
        ),
        qf_learning_rate=qf_lr,
        policy_learning_rate=policy_lr,
        discount=gamma,
        soft_target_tau=tau,
        gpu_ratio=0.25,
    )
    run_experiment_lite(
        algo.train(),
        exp_prefix="ddpg_box3d_state_v4_tf_policy_{0}_qf_{1}_gamma_{2}_tau_{3}".format(
            policy_lr,
            qf_lr,
            gamma,
def main():
    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy_initlr', type=float, default=1e-4)
    parser.add_argument('--qf_initlr', type=float, default=1e-3)
    parser.add_argument('--qf_decay', type=float, default=0.01)
    parser.add_argument('--qf_soft_tau', type=float, default=1e-3)
    # Exploration hyperparameters
    parser.add_argument('--ou_theta', type=float, default=0.15)
    parser.add_argument('--ou_sigma', type=float, default=0.3)
    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=0.95)
    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv(
        'Box3dReachPixel-v15',
        record_video=False,
        log_dir='/tmp/gym_test',
        record_log=False,
    )))
    name = 'ddpg-pixel-v15-plr{0}-qlr{1}-tau{2}-qfdecay{3}'.format(
        args.policy_initlr,
        args.qf_initlr,
        args.qf_soft_tau,
        args.qf_decay,
    )
    es = OUStrategy(env_spec=env.spec, theta=args.ou_theta,
                    sigma=args.ou_sigma)
    # import pdb; pdb.set_trace()
    qf = ConvNNCritic(
        name_or_scope="critic",
        input_shape=env.observation_space.shape,
        env_spec=env.spec,
        conv_filters=(32, 32, 32, 32, 32),
        conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3), (3, 3)),
        conv_strides=(2, 2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
        observation_hidden_sizes=(256, ),
        embedded_hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
    )
    policy = ConvNNPolicy(
        name_or_scope="actor",
        input_shape=env.observation_space.shape,
        env_spec=env.spec,
        conv_filters=(32, 32, 32, 32, 32),
        conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3), (3, 3)),
        conv_strides=(2, 2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
        observation_hidden_sizes=(256, 128),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
    )
    algo = DDPG(
        env=env,
        exploration_strategy=es,
        policy=policy,
        qf=qf,
        tensorboard_path=os.path.join(
            args.tfboard_path, name + '_%d' % args.seed),
        replay_pool_size=100000,
        obs_dtype='uint8',
        qf_learning_rate=args.qf_initlr,
        policy_learning_rate=args.policy_initlr,
        soft_target_tau=args.qf_soft_tau,
        gpu_ratio=args.gpu_ratio,
    )
    run_experiment_lite(
        algo.train(),
        exp_prefix=name,
        n_parallel=1,
        snapshot_mode="last",
        seed=args.seed,
        mode="local",
    )