def categorical_cnn_policy(ctxt, env_id, seed):
    """Create Categorical CNN Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt, max_cpus=12) as runner:
        env = TfEnv(normalize(gym.make(env_id)))

        policy = CategoricalCNNPolicy(
            env_spec=env.spec,
            conv_filters=hyper_params['conv_filters'],
            conv_filter_sizes=hyper_params['conv_filter_sizes'],
            conv_strides=hyper_params['conv_strides'],
            conv_pad=hyper_params['conv_pad'],
            hidden_sizes=hyper_params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                num_filters=hyper_params['conv_filters'],
                filter_dims=hyper_params['conv_filter_sizes'],
                strides=hyper_params['conv_strides'],
                padding=hyper_params['conv_pad'],
                hidden_sizes=hyper_params['hidden_sizes'],
                use_trust_region=hyper_params['use_trust_region']))

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
            flatten_input=False,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['batch_size'])
def ppo_memorize_digits(ctxt=None, seed=1, batch_size=4000):
    """Train PPO on MemorizeDigits-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make('MemorizeDigits-v0')),
                        is_image=True)
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=(
                                          (32, (5, 5)),
                                          (64, (3, 3)),
                                          (64, (2, 2)),
                                      ),
                                      strides=(4, 2, 1),
                                      padding='VALID',
                                      hidden_sizes=(256, ))  # yapf: disable

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            filters=(
                (32, (5, 5)),
                (64, (3, 3)),
                (64, (2, 2)),
            ),
            strides=(4, 2, 1),
            padding='VALID',
            hidden_sizes=(256, ),
            use_trust_region=True)  # yapf: disable

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_episode_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       # Optimization epochs per policy update; the optimizer
                       # does not take an episode length.
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ))

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=batch_size)
def categorical_cnn_policy(ctxt, env_id, seed):
    """Create Categorical CNN Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = CategoricalCNNPolicy(
            env_spec=env.spec,
            filters=hyper_params['conv_filters'],
            strides=hyper_params['conv_strides'],
            padding=hyper_params['conv_pad'],
            hidden_sizes=hyper_params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            filters=hyper_params['conv_filters'],
            strides=hyper_params['conv_strides'],
            padding=hyper_params['conv_pad'],
            hidden_sizes=hyper_params['hidden_sizes'],
            use_trust_region=hyper_params['use_trust_region'])

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ))

        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_params['n_epochs'],
                      batch_size=hyper_params['batch_size'])
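# A minimal sketch of the `hyper_params` dict read by the launcher above. In
# the benchmark code it is defined elsewhere, so the values here are
# assumptions that mirror the CubeCrash settings used in other snippets in
# this section. The older-API variant of this launcher additionally expects
# 'conv_filters' as plain channel counts plus a separate 'conv_filter_sizes'
# entry instead of the combined tuples shown here.
hyper_params = {
    'conv_filters': ((32, (8, 8)), (64, (4, 4))),  # (channels, (kernel_h, kernel_w))
    'conv_strides': (4, 2),
    'conv_pad': 'VALID',
    'hidden_sizes': (32, 32),
    'use_trust_region': True,
    'n_epochs': 100,
    'batch_size': 2048,
}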
def test_is_pickleable(self):
    env = TfEnv(DummyDiscretePixelEnv())
    policy = CategoricalCNNPolicy(env_spec=env.spec,
                                  filter_dims=(32, ),
                                  num_filters=(3, ),
                                  strides=(1, ),
                                  padding='SAME',
                                  hidden_sizes=(4, ))
    obs_var = tf.compat.v1.placeholder(tf.float32,
                                       shape=(None, None) +
                                       env.observation_space.shape,
                                       name='obs')
    policy.build(obs_var)
    env.reset()
    obs, _, _, _ = env.step(1)

    with tf.compat.v1.variable_scope(
            'CategoricalCNNPolicy/CategoricalCNNModel', reuse=True):
        cnn_bias = tf.compat.v1.get_variable('CNNModel/cnn/h0/bias')
        bias = tf.compat.v1.get_variable('MLPModel/mlp/hidden_0/bias')

    cnn_bias.load(tf.ones_like(cnn_bias).eval())
    bias.load(tf.ones_like(bias).eval())

    output1 = self.sess.run(policy.distribution.probs,
                            feed_dict={policy.model.input: [[obs]]})

    p = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        obs_var = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None) +
                                           env.observation_space.shape,
                                           name='obs')
        policy_pickled.build(obs_var)
        output2 = sess.run(policy_pickled.distribution.probs,
                           feed_dict={policy_pickled.model.input: [[obs]]})
        assert np.array_equal(output1, output2)
def test_dist_info(self, obs_dim, action_dim, filter_dims, filter_sizes,
                   strides, padding, hidden_sizes):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'categorical_cnn_policy.MLPModel'),
                    new=SimpleMLPModel):
        with mock.patch(('garage.tf.policies.'
                         'categorical_cnn_policy.CNNModel'),
                        new=SimpleCNNModel):
            policy = CategoricalCNNPolicy(env_spec=env.spec,
                                          conv_filters=filter_dims,
                                          conv_filter_sizes=filter_sizes,
                                          conv_strides=strides,
                                          conv_pad=padding,
                                          hidden_sizes=hidden_sizes)
    env.reset()
    obs, _, _, _ = env.step(1)

    expected_prob = np.full(action_dim, 0.5)

    policy_probs = policy.dist_info([obs])
    assert np.array_equal(policy_probs['prob'][0], expected_prob)
def gaussian_cnn_baseline(ctxt, env_id, seed):
    """Create Gaussian CNN Baseline on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = normalize(GymEnv(env_id))

        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=params['conv_filters'],
                                      strides=params['conv_strides'],
                                      padding=params['conv_pad'],
                                      hidden_sizes=params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            filters=params['conv_filters'],
            strides=params['conv_strides'],
            padding=params['conv_pad'],
            hidden_sizes=params['hidden_sizes'],
            use_trust_region=params['use_trust_region'])

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_episode_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                # Optimization epochs per policy update; the optimizer does
                # not take an episode length.
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['batch_size'])
def trpo_cubecrash(ctxt=None, seed=1, max_episode_length=5, batch_size=4000):
    """Train TRPO with CubeCrash-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_episode_length (int): Maximum length of a single episode.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(
            GymEnv('CubeCrash-v0', max_episode_length=max_episode_length))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=((32, (8, 8)), (64, (4, 4))),
                                      strides=(4, 2),
                                      padding='VALID',
                                      hidden_sizes=(32, 32))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       filters=((32, (8, 8)), (64, (4, 4))),
                                       strides=(4, 2),
                                       padding='VALID',
                                       hidden_sizes=(32, 32),
                                       use_trust_region=True)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=batch_size)
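# Usage sketch (an assumption, not part of the snippet above): in garage's
# example scripts a launcher like trpo_cubecrash is wrapped with
# garage.wrap_experiment, which builds the ExperimentContext passed in as
# `ctxt` and sets up snapshotting; the wrapped launcher is then called with
# its remaining keyword arguments. The real scripts also add click options,
# which are omitted here.
from garage import wrap_experiment

run_trpo_cubecrash = wrap_experiment(trpo_cubecrash)
run_trpo_cubecrash(seed=1, batch_size=4000)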
def trpo_cubecrash(ctxt=None, seed=1, batch_size=4000):
    """Train TRPO with CubeCrash-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make('CubeCrash-v0')))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      num_filters=(32, 64),
                                      filter_dims=(8, 4),
                                      strides=(4, 2),
                                      padding='VALID',
                                      hidden_sizes=(32, 32))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64),
                                           filter_dims=(8, 4),
                                           strides=(4, 2),
                                           padding='VALID',
                                           hidden_sizes=(32, 32),
                                           use_trust_region=True))

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0,
                    flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
class TestCategoricalCNNPolicyImageObs(TfGraphTestCase):

    def setup_method(self):
        super().setup_method()
        self.env = GymEnv(DummyDiscretePixelEnv(), is_image=True)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        self.env.reset()

    @pytest.mark.parametrize('filters, strides, padding, hidden_sizes', [
        (((3, (32, 32)), ), (1, ), 'VALID', (4, )),
    ])
    def test_obs_unflattened(self, filters, strides, padding, hidden_sizes):
        self.policy = CategoricalCNNPolicy(env_spec=self.env.spec,
                                           filters=filters,
                                           strides=strides,
                                           padding=padding,
                                           hidden_sizes=hidden_sizes)
        obs = self.env.observation_space.sample()
        action, _ = self.policy.get_action(
            self.env.observation_space.flatten(obs))
        self.env.step(action)
def run_task(snapshot_config, variant_data, *_):
    """Run task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        variant_data (dict): Custom arguments for the task.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('MemorizeDigits-v0')), is_image=True)
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=(32, 64, 64),
                                      conv_filter_sizes=(5, 3, 2),
                                      conv_strides=(4, 2, 1),
                                      conv_pad='VALID',
                                      hidden_sizes=(256, ))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64, 64),
                                           filter_dims=(5, 3, 2),
                                           strides=(4, 2, 1),
                                           padding='VALID',
                                           hidden_sizes=(256, ),
                                           use_trust_region=True))

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   max_kl_step=0.01,
                   flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=variant_data['batch_size'])
def ppo_memorize_digits(ctxt=None, seed=1, batch_size=4000):
    """Train PPO on MemorizeDigits-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make('MemorizeDigits-v0')), is_image=True)
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=(32, 64, 64),
                                      conv_filter_sizes=(5, 3, 2),
                                      conv_strides=(4, 2, 1),
                                      conv_pad='VALID',
                                      hidden_sizes=(256, ))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64, 64),
                                           filter_dims=(5, 3, 2),
                                           strides=(4, 2, 1),
                                           padding='VALID',
                                           hidden_sizes=(256, ),
                                           use_trust_region=True))

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   max_kl_step=0.01,
                   flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=batch_size)
def test_trpo_cnn_cubecrash(self):
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = normalize(GymEnv('CubeCrash-v0', max_episode_length=100))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=((32, (8, 8)), (64, (4, 4))),
                                      strides=(4, 2),
                                      padding='VALID',
                                      hidden_sizes=(32, 32))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       filters=((32, (8, 8)), (64, (4, 4))),
                                       strides=(4, 2),
                                       padding='VALID',
                                       hidden_sizes=(32, 32),
                                       use_trust_region=True)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    gae_lambda=0.98,
                    max_kl_step=0.01,
                    policy_ent_coeff=0.0)

        trainer.setup(algo, env)

        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > -1.5

        env.close()
def test_get_action(self, filters, strides, padding, hidden_sizes):
    env = GarageEnv(DummyDiscretePixelEnv())
    policy = CategoricalCNNPolicy(env_spec=env.spec,
                                  filters=filters,
                                  strides=strides,
                                  padding=padding,
                                  hidden_sizes=hidden_sizes)
    obs_var = tf.compat.v1.placeholder(tf.float32,
                                       shape=(None, None) +
                                       env.observation_space.shape,
                                       name='obs')
    policy.build(obs_var)

    env.reset()
    obs, _, _, _ = env.step(1)

    action, _ = policy.get_action(obs)
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions([obs, obs, obs])
    for action in actions:
        assert env.action_space.contains(action)
def run_garage(env, seed, log_dir):
    '''Create garage model and training.

    Replace the PPO with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path of the tabular log file.
    '''
    deterministic.set_seed(seed)

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    sess = tf.Session(config=config)

    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))

        policy = CategoricalCNNPolicy(
            env_spec=env.spec,
            conv_filters=params['conv_filters'],
            conv_filter_sizes=params['conv_filter_sizes'],
            conv_strides=params['conv_strides'],
            conv_pad=params['conv_pad'],
            hidden_sizes=params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(num_filters=params['conv_filters'],
                                filter_dims=params['conv_filter_sizes'],
                                strides=params['conv_strides'],
                                padding=params['conv_pad'],
                                hidden_sizes=params['hidden_sizes'],
                                use_trust_region=params['use_trust_region']))

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            flatten_input=False,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['batch_size'])

        dowel_logger.remove_all()

        return tabular_log_file
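# A minimal sketch of the `params` dict read by run_garage above (and by the
# gaussian_cnn_baseline launcher earlier). It is defined elsewhere in the
# benchmark code, so the values here are assumptions that mirror the
# MemorizeDigits settings used in this section. For the newer-API launcher,
# 'conv_filters' would instead hold combined (channels, (kernel_h, kernel_w))
# tuples and 'conv_filter_sizes' would be unused.
params = {
    'conv_filters': (32, 64, 64),
    'conv_filter_sizes': (5, 3, 2),
    'conv_strides': (4, 2, 1),
    'conv_pad': 'VALID',
    'hidden_sizes': (256, ),
    'use_trust_region': True,
    'n_epochs': 1000,
    'batch_size': 2048,
}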