def test_target_params_copied(self):
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
    )
    target_qf = algo.target_qf
    target_policy = algo.target_policy
    qf = algo.qf
    qf_copy = algo.qf_with_action_input
    policy = algo.policy

    # Make sure they're different to start
    random_values = [
        np.random.rand(*values.shape)
        for values in qf.get_param_values()
    ]
    qf.set_param_values(random_values)
    random_values = [
        np.random.rand(*values.shape)
        for values in policy.get_param_values()
    ]
    policy.set_param_values(random_values)

    self.assertParamsNotEqual(target_qf, qf)
    self.assertParamsNotEqual(target_policy, policy)
    self.assertParamsEqual(qf_copy, qf)

    algo.train()

    self.assertParamsEqual(target_qf, qf)
    self.assertParamsEqual(target_policy, policy)
    self.assertParamsEqual(qf_copy, qf)
def lstm_launcher(variant):
    """
    Run DDPG with a feedforward policy and critic on an environment.
    (Despite the name, this launcher builds feedforward networks.)

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        **variant.get('qf_params', {})
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        **variant.get('policy_params', {})
    )
    algorithm = MyDDPG(env, es, policy, qf, **variant['algo_params'])
    algorithm.train()
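# A minimal sketch of invoking lstm_launcher. Only the top-level variant keys
# come from the launcher above; the 'env_id' key and the specific algo_params
# values are illustrative assumptions, since get_env_settings' accepted
# arguments are not shown here.
example_variant = dict(
    env_params=dict(env_id='cheetah'),  # hypothetical get_env_settings kwarg
    algo_params=dict(n_epochs=50, batch_size=128),
    qf_params=dict(),
    policy_params=dict(),
)
# lstm_launcher(example_variant)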
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=128,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=100,
    )
    exp_prefix = 'ddpg-cartpole-speed-{0}'.format(timestamp())
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **default_ddpg_params,
    )
    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix=exp_prefix,
        seed=1,
    )
def my_ddpg_launcher(variant):
    """
    Run DDPG.

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
        - tensorboard
        - batch_norm_params (optional)
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    from railrl.core.tf_util import BatchNormConfig

    if ('batch_norm_params' in variant
            and variant['batch_norm_params'] is not None):
        bn_config = BatchNormConfig(**variant['batch_norm_params'])
    else:
        bn_config = None
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant.get('qf_params', {})
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant.get('policy_params', {})
    )
    algorithm = MyDDPG(
        env,
        es,
        policy,
        qf,
        variant['tensorboard'],
        batch_norm_config=bn_config,
        **variant['algo_params'],
    )
    algorithm.train()
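# A minimal sketch of invoking my_ddpg_launcher. The top-level keys mirror the
# launcher above; the 'env_id' key is an illustrative assumption (the kwargs
# get_env_settings accepts are not shown), and batch_norm_params, when not
# None, would be a dict of BatchNormConfig keyword arguments.
example_variant = dict(
    env_params=dict(env_id='cheetah'),  # hypothetical get_env_settings kwarg
    algo_params=dict(n_epochs=100, batch_size=128),
    qf_params=dict(),
    policy_params=dict(),
    tensorboard='/tmp/tfboard/my_ddpg',  # passed positionally to MyDDPG
    batch_norm_params=None,
)
# my_ddpg_launcher(example_variant)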
def run_task(_):
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.gym_env import GymEnv

    def gym_env(name):
        return GymEnv(
            name,
            record_video=False,
            log_dir='/tmp/gym-test',  # Ignore gym log.
            record_log=False,
        )

    env = TfEnv(gym_env('AxeTwoDPoint-v0'))
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **ddpg_params
    )
    algorithm.train()
def example(*_):
    env = HalfCheetahEnv()
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        n_epochs=25,
        batch_size=1024,
        replay_pool_size=10000,
    )
    algorithm.train()
def run_task(_):
    # Note: the loop variable `seed` is not used inside the loop body; this
    # simply trains three runs back to back.
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = OUStrategy(env_spec=env.spec)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        ddpg_params = dict(
            batch_size=16,
            n_epochs=100,
            epoch_length=100,
            eval_samples=100,
            max_path_length=10,
            min_pool_size=2,
        )
        algorithm = DDPG(env, es, policy, qf, **ddpg_params)
        algorithm.train()
def test_target_params_update(self):
    tau = 0.2
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        soft_target_tau=tau,
    )
    target_qf = algo.target_qf
    target_policy = algo.target_policy
    qf = algo.qf
    policy = algo.policy

    algo.train()

    orig_tc_vals = target_qf.get_param_values()
    orig_ta_vals = target_policy.get_param_values()
    orig_c_vals = qf.get_param_values()
    orig_a_vals = policy.get_param_values()
    algo.sess.run(algo.update_target_policy_op)
    algo.sess.run(algo.update_target_qf_op)
    new_tc_vals = target_qf.get_param_values()
    new_ta_vals = target_policy.get_param_values()

    for orig_tc_val, orig_c_val, new_tc_val in zip(
            orig_tc_vals, orig_c_vals, new_tc_vals):
        self.assertTrue(
            (new_tc_val == tau * orig_c_val + (1 - tau) * orig_tc_val).all()
        )
    for orig_ta_val, orig_a_val, new_ta_val in zip(
            orig_ta_vals, orig_a_vals, new_ta_vals):
        self.assertTrue(
            (new_ta_val == tau * orig_a_val + (1 - tau) * orig_ta_val).all()
        )
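# The test above verifies DDPG's soft ("Polyak") target update:
# target <- tau * source + (1 - tau) * target. A minimal NumPy sketch of the
# rule being checked; the function name and parameter lists are illustrative,
# not the algorithm's internal variables.
import numpy as np

def soft_update(target_params, source_params, tau):
    """Return the Polyak average of each (target, source) parameter pair."""
    return [
        tau * src + (1 - tau) * tgt
        for tgt, src in zip(target_params, source_params)
    ]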
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=32,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=1000,
    )
    sweeper = DeterministicHyperparameterSweeper(
        {'scale_reward': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]},
    )
    exp_prefix = 'ddpg-cart-reward-scale-sweep-{0}'.format(timestamp())
    for ddpg_params in sweeper.iterate_hyperparameters():
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            scale_reward=ddpg_params['scale_reward'],
            **default_ddpg_params,
        )
        for seed in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix=exp_prefix,
                seed=seed,
                # mode="local",
                # use_cloudpickle=True,
            )
def main():
    stub(globals())
    env = TfEnv(CartpoleEnv())
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    for seed in range(3):
        env.reset()
        run_experiment_lite(
            algorithm.train(),
            n_parallel=1,
            snapshot_mode="last",
            exp_prefix="test-qddpg-cartpole",
            seed=seed,
        )
# Fragment: assumes env, es, policy, qf, and the swept hyperparameters
# (policy_lr, qf_lr, gamma, tau, seed) are defined by enclosing code. The
# opening `algo = DDPG(` is inferred from the call to algo.train() below.
algo = DDPG(
    env,
    es,
    policy,
    qf,
    "/data0/dianchen/box3d/ddpg_box3d_state_v4_tf_policy_{0}_qf_{1}_gamma_{2}_tau_{3}".format(
        policy_lr,
        qf_lr,
        gamma,
        tau,
    ),
    qf_learning_rate=qf_lr,
    policy_learning_rate=policy_lr,
    discount=gamma,
    soft_target_tau=tau,
    gpu_ratio=0.25,
)
run_experiment_lite(
    algo.train(),
    exp_prefix="ddpg_box3d_state_v4_tf_policy_{0}_qf_{1}_gamma_{2}_tau_{3}".format(
        policy_lr,
        qf_lr,
        gamma,
        tau,
    ),
    n_parallel=1,
    snapshot_mode="last",
    seed=seed,
    mode="local",
)
def main():
    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy_initlr', type=float, default=1e-4)
    parser.add_argument('--qf_initlr', type=float, default=1e-3)
    parser.add_argument('--qf_decay', type=float, default=0.01)
    parser.add_argument('--qf_soft_tau', type=float, default=1e-3)
    # Exploration hyperparameters
    parser.add_argument('--ou_theta', type=float, default=0.15)
    parser.add_argument('--ou_sigma', type=float, default=0.3)
    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=0.95)
    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv(
        'Box3dReachPixel-v15',
        record_video=False,
        log_dir='/tmp/gym_test',
        record_log=False,
    )))
    name = 'ddpg-pixel-v15-plr{0}-qlr{1}-tau{2}-qfdecay{3}'.format(
        args.policy_initlr,
        args.qf_initlr,
        args.qf_soft_tau,
        args.qf_decay,
    )
    es = OUStrategy(env_spec=env.spec, theta=args.ou_theta, sigma=args.ou_sigma)
    qf = ConvNNCritic(
        name_or_scope="critic",
        input_shape=env.observation_space.shape,
        env_spec=env.spec,
        conv_filters=(32, 32, 32, 32, 32),
        conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3), (3, 3)),
        conv_strides=(2, 2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
        observation_hidden_sizes=(256,),
        embedded_hidden_sizes=(256,),
        hidden_nonlinearity=tf.nn.relu,
    )
    policy = ConvNNPolicy(
        name_or_scope="actor",
        input_shape=env.observation_space.shape,
        env_spec=env.spec,
        conv_filters=(32, 32, 32, 32, 32),
        conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3), (3, 3)),
        conv_strides=(2, 2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
        observation_hidden_sizes=(256, 128),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
    )
    algo = DDPG(
        env=env,
        exploration_strategy=es,
        policy=policy,
        qf=qf,
        tensorboard_path=os.path.join(
            args.tfboard_path, name + '_%d' % args.seed),
        replay_pool_size=100000,
        obs_dtype='uint8',
        qf_learning_rate=args.qf_initlr,
        policy_learning_rate=args.policy_initlr,
        soft_target_tau=args.qf_soft_tau,
        gpu_ratio=args.gpu_ratio,
    )
    run_experiment_lite(
        algo.train(),
        exp_prefix=name,
        n_parallel=1,
        snapshot_mode="last",
        seed=args.seed,
        mode="local",
    )
def main():
    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy_initlr', type=float, default=1e-4)
    parser.add_argument('--qf_initlr', type=float, default=1e-3)
    parser.add_argument('--qf_decay', type=float, default=0.0)
    parser.add_argument('--qf_soft_tau', type=float, default=1e-3)
    # Exploration hyperparameters
    parser.add_argument('--ou_theta', type=float, default=0.15)
    parser.add_argument('--ou_sigma', type=float, default=0.3)
    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=1.0)
    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv(
        'Box3dReach-v11',
        record_video=False,
        log_dir='/tmp/gym_test',
        record_log=False,
    )))
    name = ('ddpg-state-v11-plr{0}-qlr{1}-tau{2}-qfdecay{3}'
            '-ou_theta{4}-ou_sigma{5}').format(
        args.policy_initlr,
        args.qf_initlr,
        args.qf_soft_tau,
        args.qf_decay,
        args.ou_theta,
        args.ou_sigma,
    )
    es = OUStrategy(env_spec=env.spec, theta=args.ou_theta, sigma=args.ou_sigma)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        observation_hidden_sizes=(400, 300),
        env_spec=env.spec,
    )
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        embedded_hidden_sizes=(100,),
        observation_hidden_sizes=(100,),
    )
    algo = DDPG(
        env=env,
        exploration_strategy=es,
        policy=policy,
        qf=qf,
        # Suffix the run name rather than nesting a '_<seed>' directory,
        # matching the pixel-variant script above.
        tensorboard_path=os.path.join(
            args.tfboard_path, name + '_%d' % args.seed),
        qf_learning_rate=args.qf_initlr,
        policy_learning_rate=args.policy_initlr,
        soft_target_tau=args.qf_soft_tau,
        gpu_ratio=args.gpu_ratio,
    )
    run_experiment_lite(
        algo.train(),
        exp_prefix=name,
        n_parallel=1,
        snapshot_mode="last",
        seed=args.seed,
        mode="local",
    )