def test_copy(self):
    """After copying parameters, two critics must produce identical outputs."""
    action_dim = 5
    obs_dim = 7
    # Two independently-initialized critics over the same spaces.
    critics = [
        FeedForwardCritic(
            name_or_scope=scope,
            observation_dim=obs_dim,
            action_dim=action_dim,
        )
        for scope in ("qf_a", "qf_b")
    ]
    for critic in critics:
        critic.sess = self.sess
    action = np.random.rand(1, action_dim)
    observation = np.random.rand(1, obs_dim)
    feeds = [
        {c.action_input: action, c.observation_input: observation}
        for c in critics
    ]
    self.sess.run(tf.global_variables_initializer())

    def run_both():
        return [self.sess.run(c.output, f) for c, f in zip(critics, feeds)]

    first, second = run_both()
    # Fresh random initializations should disagree somewhere.
    self.assertFalse((first == second).all())
    critics[1].set_param_values(critics[0].get_param_values())
    first, second = run_both()
    # Once parameters are copied, the outputs must match exactly.
    self.assertTrue((first == second).all())
def main():
    """Launch a short DDPG speed benchmark on cartpole via run_experiment_lite."""
    env = TfEnv(CartpoleEnv())
    exploration = OUStrategy(env_spec=env.spec)
    critic = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    actor = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    # Baseline hyperparameters for a quick run.
    ddpg_kwargs = {
        'batch_size': 128,
        'n_epochs': 10,
        'epoch_length': 1000,
        'eval_samples': 1000,
        'max_path_length': 100,
        'min_pool_size': 100,
    }
    exp_prefix = 'ddpg-cartpole-speed-{0}'.format(timestamp())
    algorithm = DDPG(env, exploration, actor, critic, **ddpg_kwargs)
    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix=exp_prefix,
        seed=1,
    )
def lstm_launcher(variant):
    """
    Run DDPG with a feed-forward critic and policy on an environment.

    NOTE(review): despite the name, this launcher builds FeedForwardCritic /
    FeedForwardPolicy, not an LSTM; the old docstring ("Run a simple LSTM")
    was wrong. The function name is kept for caller compatibility.

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params (optional)
        - policy_params (optional)
    :return: None
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        **variant.get('qf_params', {})
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        **variant.get('policy_params', {})
    )
    algorithm = MyDDPG(env, es, policy, qf, **variant['algo_params'])
    algorithm.train()
def test_serialize_feedforward_critic(self):
    """A FeedForwardCritic should survive pickling without raising."""
    critic = FeedForwardCritic(
        name_or_scope="a",
        action_dim=self.action_dim,
        observation_dim=self.observation_dim,
    )
    self.sess.run(tf.global_variables_initializer())
    # Serialization itself is the assertion: any failure raises.
    pickle.dumps(critic)
def main():
    """Compare two DDPG implementations on HalfCheetah across three seeds."""
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    for seed in range(3):
        # Hyperparameters shared by both implementations.
        shared_params = dict(
            batch_size=128,
            n_epochs=100,
            epoch_length=10000,
            eval_samples=10000,
            discount=0.99,
            policy_learning_rate=1e-4,
            qf_learning_rate=1e-3,
            soft_target_tau=0.01,
            replay_pool_size=1000000,
            min_pool_size=256,
            scale_reward=1.0,
            max_path_length=1000,
            qf_weight_decay=0.0,
        )
        # Vitchyr's implementation.
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        vitchyr_policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        vitchyr_ddpg = DDPG(
            env, vitchyr_es, vitchyr_policy, vitchyr_qf, **shared_params
        )
        # Shane's implementation.
        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(
            name="qf",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )
        shane_ddpg = ShaneDDPG(
            env, shane_policy, shane_qf, shane_es, **shared_params
        )
        # Launch both under the same seed and experiment prefix.
        for name, algorithm in [
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        ]:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
def run_linear_ocm_exp(variant):
    """Train DDPG on a memory-augmented, flattened version of the environment."""
    from railrl.tf.ddpg import DDPG
    from railrl.envs.flattened_product_box import FlattenedProductBox
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.tf.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from railrl.launchers.launcher_util import set_seed

    # Unpack the experiment variant up front so missing keys fail fast.
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_class = variant['env_class']
    env_params = variant['env_params']
    memory_dim = variant['memory_dim']
    ou_params = variant['ou_params']
    set_seed(seed)

    # Build the environment: wrap with external memory states, then flatten
    # the resulting product space into a single box.
    env = env_class(**env_params)
    env = ContinuousMemoryAugmented(env, num_memory_states=memory_dim)
    env = FlattenedProductBox(env)

    critic = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="policy",
        env_spec=env.spec,
    )
    exploration = OUStrategy(env_spec=env.spec, **ou_params)
    algorithm = DDPG(env, exploration, policy, critic, **algo_params)
    algorithm.train()
def test_output_len(self):
    """A single (observation, action) pair yields a single Q-value."""
    action_dim = 5
    obs_dim = 7
    critic = FeedForwardCritic(
        name_or_scope="1",
        observation_dim=obs_dim,
        action_dim=action_dim,
    )
    critic.sess = self.sess
    feed = {
        critic.action_input: np.random.rand(1, action_dim),
        critic.observation_input: np.random.rand(1, obs_dim),
    }
    self.sess.run(tf.global_variables_initializer())
    value = self.sess.run(critic.output, feed)
    self.assertEqual(1, value.size)
def run_task(variant):
    """
    Train either Quadratic-DDPG (NAF Q-function) or vanilla DDPG on cartpole.

    :param variant: dict with key 'Algorithm' in {'Quadratic-DDPG', 'DDPG'}.
    :raises Exception: if variant['Algorithm'] is not recognized.
    """
    import tensorflow as tf
    # Fixed: 'railrl.railrl.algos.ddpg' repeated the package name; every
    # other launcher in this codebase imports from 'railrl.algos.ddpg'.
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.box2d.cartpole_env import CartpoleEnv

    env = TfEnv(CartpoleEnv())
    algo_name = variant['Algorithm']
    if algo_name == 'Quadratic-DDPG':
        qf = QuadraticNAF(
            name_or_scope="quadratic_qf",
            env_spec=env.spec,
        )
    elif algo_name == 'DDPG':
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            embedded_hidden_sizes=(100, ),
            observation_hidden_sizes=(100, ),
            hidden_nonlinearity=tf.nn.relu,
        )
    else:
        raise Exception('Algo name not recognized: {0}'.format(algo_name))

    es = OUStrategy(env_spec=env.spec)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    ddpg_params = dict(
        batch_size=128,
        n_epochs=100,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    algorithm.train()
def my_ddpg_launcher(variant):
    """
    Run DDPG.

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return: None
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    from railrl.core.tf_util import BatchNormConfig

    # Batch norm is opt-in: only build a config when params are provided.
    bn_params = variant.get('batch_norm_params')
    bn_config = BatchNormConfig(**bn_params) if bn_params is not None else None

    env = get_env_settings(**variant['env_params'])['env']
    exploration = OUStrategy(env_spec=env.spec)
    critic = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant.get('qf_params', {})
    )
    actor = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
        batch_norm_config=bn_config,
        **variant.get('policy_params', {})
    )
    algorithm = MyDDPG(
        env,
        exploration,
        actor,
        critic,
        variant['tensorboard'],
        batch_norm_config=bn_config,
        **variant['algo_params'],
    )
    algorithm.train()
def example(variant):
    """Run a short DDPG example on HalfCheetah, resuming from a snapshot when one exists."""
    snapshot_path = variant.get('load_policy_file', None)
    if snapshot_path is not None and exists(snapshot_path):
        # Resume: restore policy, Q-function, and replay pool from disk.
        with tf.Session():
            data = joblib.load(snapshot_path)
            print(data)
            policy = data['policy']
            qf = data['qf']
            replay_buffer = data['pool']
            env = HalfCheetahEnv()
            es = OUStrategy(action_space=env.action_space)
            algorithm = DDPG(
                env,
                es,
                policy,
                qf,
                n_epochs=2,
                batch_size=1024,
                replay_pool=replay_buffer,
                use_new_version=variant['use_new_version'],
            )
            algorithm.train()
    else:
        # Fresh run: build networks from scratch.
        env = HalfCheetahEnv()
        es = OUStrategy(action_space=env.action_space)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            n_epochs=2,
            batch_size=1024,
            use_new_version=variant['use_new_version'],
        )
        algorithm.train()
def main():
    """Sweep DDPG reward scaling on cartpole, three seeds per scale."""
    env = TfEnv(CartpoleEnv())
    exploration = OUStrategy(env_spec=env.spec)
    critic = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    actor = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    base_params = dict(
        batch_size=32,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=1000,
    )
    sweeper = DeterministicHyperparameterSweeper(
        {'scale_reward': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]},
    )
    exp_prefix = 'ddpg-cart-reward-scale-sweep-{0}'.format(timestamp())
    for swept in sweeper.iterate_hyperparameters():
        algorithm = DDPG(
            env,
            exploration,
            actor,
            critic,
            scale_reward=swept['scale_reward'],
            **base_params,
        )
        # Three seeds per reward scale.
        for seed in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix=exp_prefix,
                seed=seed,
                # mode="local",
                # use_cloudpickle=True,
            )
def example(*_):
    """Train DDPG on the double pendulum for a short run."""
    env = DoublePendulumEnv()
    exploration = OUStrategy(env_spec=env.spec)
    critic = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    actor = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    DDPG(
        env,
        exploration,
        actor,
        critic,
        n_epochs=30,
        batch_size=1024,
    ).train()
def example(*_):
    """Train DDPG on HalfCheetah with a small replay pool."""
    env = HalfCheetahEnv()
    exploration = OUStrategy(env_spec=env.spec)
    critic = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    actor = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    DDPG(
        env,
        exploration,
        actor,
        critic,
        n_epochs=25,
        batch_size=1024,
        replay_pool_size=10000,
    ).train()
def example(variant):
    """Build a DDPG agent for the configured environment and train it."""
    env = get_env_settings(**variant['env_params'])['env']
    exploration = OUStrategy(env_spec=env.spec)
    critic = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    actor = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        exploration,
        actor,
        critic,
        **variant['ddpg_params']
    )
    algorithm.train()
def run_task(_):
    """
    Train DDPG on HalfCheetah three times with tiny, smoke-test-sized
    hyperparameters.

    NOTE(review): the loop variable ``seed`` is never passed to the algorithm
    or to any RNG, so the three runs differ only by default TF initialization
    — confirm whether per-run seeding was intended.
    """
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = OUStrategy(env_spec=env.spec)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        ddpg_params = dict(
            batch_size=16,
            n_epochs=100,
            epoch_length=100,
            eval_samples=100,
            max_path_length=10,
            min_pool_size=2,
        )
        algorithm = DDPG(env, es, policy, qf, **ddpg_params)
        # Fixed: removed a stray trailing comma that wrapped this call's
        # result in a throwaway one-element tuple.
        algorithm.train()
def run_linear_ocm_exp(variant):
    """Set up and train DDPG for the linear OCM experiment."""
    from railrl.tf.ddpg import DDPG
    from railrl.launchers.launcher_util import set_seed
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.tf.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic

    # Unpack the experiment variant up front so missing keys fail fast.
    # Note: 'H' is read (so its absence still raises) but otherwise unused.
    H = variant['H']
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_class = variant['env_class']
    env_params = variant['env_params']
    ou_params = variant['ou_params']
    set_seed(seed)

    # Build the environment and networks, then train.
    env = env_class(**env_params)
    critic = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="policy",
        env_spec=env.spec,
    )
    exploration = OUStrategy(env_spec=env.spec, **ou_params)
    algorithm = DDPG(env, exploration, policy, critic, **algo_params)
    algorithm.train()
# Param ranges seed = 3 policy_lrs = [1e-5, 1e-4, 1e-3] qf_lrs = [1e-5, 1e-4, 1e-3] gammas = [0.9, 0.99, 0.995] taus = [1e-3, 1e-2] for policy_lr, qf_lr, gamma, tau in itertools.product(policy_lrs, qf_lrs, gammas, taus): env = TfEnv(normalize(env=GymEnv('Box3dReach-v4',record_video=False, \ log_dir='/tmp/gym_test',record_log=False))) es = OUStrategy(env_spec=env.spec) qf = FeedForwardCritic( name_or_scope="critic", env_spec=env.spec, hidden_nonlinearity=tf.nn.tanh, ) policy = FeedForwardPolicy( name_or_scope="actor", env_spec=env.spec, hidden_nonlinearity=tf.nn.tanh, ) algo = DDPG( env, es, policy, qf, "/data0/dianchen/box3d/ddpg_box3d_state_v4_tf_policy_{0}_qf_{1}_gamma_{2}_tau_{3}".format( policy_lr,
def main():
    """Parse hyperparameters from the CLI and launch DDPG on Box3dReach-v11."""
    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy_initlr', type=float, default=1e-4)
    parser.add_argument('--qf_initlr', type=float, default=1e-3)
    parser.add_argument('--qf_decay', type=float, default=.0)
    parser.add_argument('--qf_soft_tau', type=float, default=1e-3)
    # Exploration hyperparameters
    parser.add_argument('--ou_theta', type=float, default=0.15)
    parser.add_argument('--ou_sigma', type=float, default=0.3)
    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=1.0)
    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv('Box3dReach-v11',record_video=False, \
        log_dir='/tmp/gym_test',record_log=False)))
    # Experiment name encodes the chosen hyperparameters.
    name = 'ddpg-state-v11-plr{0}-qlr{1}-tau{2}-qfdecay{3}-ou_theta{4}-ou_sigma{5}'.format(
        args.policy_initlr,
        args.qf_initlr,
        args.qf_soft_tau,
        args.qf_decay,
        args.ou_theta,
        args.ou_sigma,
    )
    exploration = OUStrategy(
        env_spec=env.spec,
        theta=args.ou_theta,
        sigma=args.ou_sigma,
    )
    actor = FeedForwardPolicy(
        name_or_scope="actor",
        observation_hidden_sizes=(400, 300),
        env_spec=env.spec,
    )
    critic = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        embedded_hidden_sizes=(100, ),
        observation_hidden_sizes=(100, ),
    )
    algo = DDPG(
        env=env,
        exploration_strategy=exploration,
        policy=actor,
        qf=critic,
        tensorboard_path=os.path.join(args.tfboard_path, name, '_%d' % args.seed),
        qf_learning_rate=args.qf_initlr,
        policy_learning_rate=args.policy_initlr,
        soft_target_tau=args.qf_soft_tau,
        gpu_ratio=args.gpu_ratio,
    )
    run_experiment_lite(
        algo.train(),
        exp_prefix=name,
        n_parallel=1,
        snapshot_mode="last",
        seed=args.seed,
        mode="local",
    )
def icm_launcher(variant):
    """
    Launch ICM (intrinsic curiosity) wrapped around a base RL algorithm.

    Supported values of variant['Algorithm']:
        - "DDPG":       ICM around railrl's DDPG
        - "Idle":       ICM around an idle (no-op) base algorithm
        - "rllab-TRPO": ICM around rllab's TRPO
        - "tf-TRPO":    ICM around the sandbox TF TRPO

    :param variant: dict of experiment settings; each branch below documents
        the keys it consumes.
    :raises NotImplementedError: for an unrecognized algorithm name, or an
        unsupported observation space under tf-TRPO.
    """
    if variant["Algorithm"] == "DDPG":
        from railrl.algos.ddpg import DDPG as MyDDPG
        from railrl.policies.nn_policy import FeedForwardPolicy
        from railrl.qfunctions.nn_qfunction import FeedForwardCritic
        from rllab.exploration_strategies.ou_strategy import OUStrategy
        from railrl.exploration_strategies.simple_gaussian_strategy import SimpleGaussianStrategy
        from railrl.launchers.launcher_util import get_env_settings
        from railrl.core.tf_util import BatchNormConfig
        from railrl.algos.icm import ICM
        # Batch norm is opt-in.
        if ('batch_norm_params' in variant
                and variant['batch_norm_params'] is not None):
            bn_config = BatchNormConfig(**variant['batch_norm_params'])
        else:
            bn_config = None
        env_settings = get_env_settings(**variant['env_params'])
        env = env_settings['env']
        es = OUStrategy(env_spec=env.spec)
        # es = SimpleGaussianStrategy(env_spec=env.spec, sigma=0.5)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            batch_norm_config=bn_config,
            **variant.get('qf_params', {})
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
            batch_norm_config=bn_config,
            **variant.get('policy_params', {})
        )
        algo = MyDDPG(
            env,
            es,
            policy,
            qf,
            variant['tensorboard'],
            batch_norm_config=bn_config,
            **variant['algo_params'],
        )
        algorithm = ICM(
            env,
            algo,
            no_encoder=False,
            feature_dim=env.spec.observation_space.flat_dim,
            forward_weight=0.9,
            external_reward_weight=0.95,
            inverse_tanh=True,
            init_learning_rate=1e-3
        )
        algorithm.train()
    elif variant["Algorithm"] == "Idle":
        from railrl.algos.idle import IdleAlgo
        from railrl.launchers.launcher_util import get_env_settings
        from railrl.algos.icm import ICM
        env_settings = get_env_settings(**variant['env_params'])
        env = env_settings['env']
        # Idle base algorithm: all learning signal comes from ICM itself
        # (external_reward_weight=0.0).
        algo = IdleAlgo(env, variant['tensorboard'])
        algorithm = ICM(
            env,
            algo,
            no_encoder=False,
            feature_dim=env.spec.observation_space.flat_dim,
            forward_weight=0.9,
            external_reward_weight=0.0,
            inverse_tanh=True,
            init_learning_rate=1e-3,
        )
        algorithm.train()
    elif variant["Algorithm"] == "rllab-TRPO":
        from rllab.algos.trpo import TRPO
        from railrl.launchers.launcher_util import get_env_settings
        # Fixed: ICM was imported twice from the same module.
        from railrl.algos.icm_trpo import ICM
        from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
        from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
        import lasagne.nonlinearities as NL
        env_settings = get_env_settings(**variant['env_params'])
        env = env_settings['env']
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 32),
            output_nonlinearity=NL.tanh,
        )
        baseline = LinearFeatureBaseline(
            env.spec,
        )
        batch_size = 5000
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            whole_paths=True,
            max_path_length=1000,
            n_itr=1000,
            step_size=0.01,
            subsample_factor=1.0,
        )
        algorithm = ICM(
            env,
            algo,
            variant['tensorboard'],
            no_encoder=False,
            feature_dim=env.spec.observation_space.flat_dim,
            forward_weight=0.2,
            external_reward_weight=0.99,
            inverse_tanh=True,
            init_learning_rate=1e-4,
        )
        algorithm.train()
    elif variant["Algorithm"] == 'tf-TRPO':
        from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
        from sandbox.rocky.tf.baselines.gaussian_conv_baseline import GaussianConvBaseline
        from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
        from sandbox.rocky.tf.policies.gaussian_conv_policy import GaussianConvPolicy
        from sandbox.rocky.tf.algos.trpo import TRPO
        from sandbox.rocky.tf.envs.base import TfEnv
        from railrl.launchers.launcher_util import get_env_settings
        # from railrl.algos.icm_trpo_tf import ICM
        from railrl.algos.icm_trpo_tf_box3d import ICM
        import tensorflow as tf
        env_settings = get_env_settings(**variant['env_params'])
        env = TfEnv(env_settings['env'])
        if len(env.observation_space.shape) == 1:
            policy = GaussianMLPPolicy(
                "mlp_policy",
                env_spec=env.spec,
                hidden_sizes=(64, 32),
                output_nonlinearity=tf.nn.tanh,
            )
            baseline = LinearFeatureBaseline(
                env.spec,
            )
        elif len(env.observation_space.shape) == 2:
            # NOTE(review): ConvNNPolicy is not imported anywhere in this
            # function — this branch will raise NameError as written;
            # confirm the intended import (GaussianConvPolicy is imported
            # but unused).
            # Fixed: `mdp` was an undefined name here; the environment
            # local is `env`.
            policy = ConvNNPolicy(
                "conv_policy",
                env_spec=env.spec,
                conv_filters=(32, 32, 32, 32),
                conv_filter_sizes=((3,3),(3,3),(3,3),(3,3)),
                conv_strides=(2, 2, 2, 2),
                conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
                hidden_sizes=(256,),
            )
            baseline = GaussianConvBaseline(
                env.spec,
                regressor_args={
                    'conv_filters':(32, 32, 32, 32),
                    'conv_filter_sizes':((3,3),(3,3),(3,3),(3,3)),
                    'conv_strides':(2, 2, 2, 2),
                    'conv_pads':('SAME', 'SAME', 'SAME', 'SAME'),
                    'hidden_sizes':(256,),
                }
            )
        else:
            # Fixed typo: "observatin" -> "observation".
            raise NotImplementedError(
                "Sorry, no support for observation space: {}".format(
                    env.observation_space.shape))
        batch_size = 5000
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            whole_paths=True,
            max_path_length=500,
            n_itr=1000,
            step_size=0.01,
            subsample_factor=1.0,
        )
        algorithm = ICM(
            env,
            algo,
            variant['tensorboard'],
            no_encoder=False,
            feature_dim=env.spec.observation_space.flat_dim,
            forward_weight=0.2,
            external_reward_weight=0.99,
            inverse_tanh=True,
            init_learning_rate=1e-4
        )
        algorithm.train()
    else:
        # Fixed: the old message claimed only DDPG was supported, which
        # contradicted the branches above.
        raise NotImplementedError(
            "Unrecognized algorithm: {0}".format(variant["Algorithm"]))