def test_auxiliary(self):
    obs_dim, action_dim, task_num, latent_dim = (2, ), (2, ), 2, 2
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    embedding_spec = InOutSpec(
        input_space=akro.Box(low=np.zeros(task_num), high=np.ones(task_num)),
        output_space=akro.Box(low=np.zeros(latent_dim),
                              high=np.ones(latent_dim)))
    encoder = GaussianMLPEncoder(embedding_spec)
    policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                            encoder=encoder)

    obs_input = tf.compat.v1.placeholder(tf.float32, shape=(None, None, 2))
    task_input = tf.compat.v1.placeholder(tf.float32, shape=(None, None, 2))
    policy.build(obs_input, task_input)

    assert (policy.distribution.loc.get_shape().as_list()[-1] ==
            env.action_space.flat_dim)
    assert policy.encoder == encoder
    assert policy.latent_space.flat_dim == latent_dim
    assert policy.task_space.flat_dim == task_num
    assert (policy.augmented_observation_space.flat_dim ==
            env.observation_space.flat_dim + task_num)
    assert (policy.encoder_distribution.loc.get_shape().as_list()[-1] ==
            latent_dim)
def test_get_action(self, obs_dim, task_num, latent_dim, action_dim):
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    embedding_spec = InOutSpec(
        input_space=akro.Box(low=np.zeros(task_num), high=np.ones(task_num)),
        output_space=akro.Box(low=np.zeros(latent_dim),
                              high=np.ones(latent_dim)))
    encoder = GaussianMLPEncoder(embedding_spec)
    policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                            encoder=encoder)

    env.reset()
    obs, _, _, _ = env.step(1)
    latent = np.random.random((latent_dim, ))
    task = np.zeros(task_num)
    task[0] = 1

    action1, _ = policy.get_action_given_latent(obs, latent)
    action2, _ = policy.get_action_given_task(obs, task)
    action3, _ = policy.get_action(np.concatenate([obs.flatten(), task]))

    assert env.action_space.contains(action1)
    assert env.action_space.contains(action2)
    assert env.action_space.contains(action3)

    obses, latents, tasks = [obs] * 3, [latent] * 3, [task] * 3
    aug_obses = [np.concatenate([obs.flatten(), task])] * 3
    action1n, _ = policy.get_actions_given_latents(obses, latents)
    action2n, _ = policy.get_actions_given_tasks(obses, tasks)
    action3n, _ = policy.get_actions(aug_obses)

    for action in chain(action1n, action2n, action3n):
        assert env.action_space.contains(action)
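# Hedged illustration (not part of the test above): how the three action
# interfaces exercised in test_get_action relate to each other. `policy`,
# `env`, `obs`, `task_num` and the one-hot task layout are assumed to be set
# up exactly as in that test.
task_onehot = np.zeros(task_num)
task_onehot[0] = 1

# 1) Let the policy's encoder map the one-hot task to a latent internally.
action_a, _ = policy.get_action_given_task(obs, task_onehot)

# 2) Sample the latent explicitly from the encoder, then condition on it.
latent, _ = policy.get_latent(task_onehot)
action_b, _ = policy.get_action_given_latent(obs, latent)

# 3) Feed the augmented observation: the flattened observation with the
#    one-hot task appended, matching policy.augmented_observation_space.
action_c, _ = policy.get_action(np.concatenate([obs.flatten(), task_onehot]))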
def test_get_latent(self):
    obs_dim, action_dim, task_num, latent_dim = (2, ), (2, ), 5, 2
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    embedding_spec = InOutSpec(
        input_space=akro.Box(low=np.zeros(task_num), high=np.ones(task_num)),
        output_space=akro.Box(low=np.zeros(latent_dim),
                              high=np.ones(latent_dim)))
    encoder = GaussianMLPEncoder(embedding_spec)
    policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                            encoder=encoder)

    task_id = 3
    task_onehot = np.zeros(task_num)
    task_onehot[task_id] = 1
    latent, latent_info = policy.get_latent(task_onehot)
    assert latent.shape == (latent_dim, )
    assert latent_info['mean'].shape == (latent_dim, )
    assert latent_info['log_std'].shape == (latent_dim, )
def test_get_vars(self):
    obs_dim, action_dim, task_num, latent_dim = (2, ), (2, ), 5, 2
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    embedding_spec = InOutSpec(
        input_space=akro.Box(low=np.zeros(task_num), high=np.ones(task_num)),
        output_space=akro.Box(low=np.zeros(latent_dim),
                              high=np.ones(latent_dim)))
    encoder = GaussianMLPEncoder(embedding_spec, hidden_sizes=[32, 32, 32])
    policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                            encoder=encoder,
                                            hidden_sizes=[32, 32, 32])

    vars1 = sorted(policy.get_trainable_vars(), key=lambda v: v.name)
    vars2 = sorted(policy.get_global_vars(), key=lambda v: v.name)
    assert vars1 == vars2

    # Two networks (policy MLP and encoder MLP), each with
    # 4 layers * (1 weight + 1 bias) + 1 log_std parameter = 9 variables,
    # so 18 variables in total.
    assert len(vars1) == 2 * (4 * 2 + 1)

    obs = np.random.random(obs_dim)
    latent = np.random.random((latent_dim, ))

    for var in vars1:
        var.assign(np.ones(var.shape))
    assert np.any(policy.get_action_given_latent(obs, latent) != 0)

    for var in vars1:
        var.assign(np.zeros(var.shape))
    assert not np.all(policy.get_action_given_latent(obs, latent) == 0)
def test_encoder_dist_info(self):
    obs_dim, action_dim, task_num, latent_dim = (2, ), (2, ), 5, 2
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(
            'garage.tf.embeddings.'
            'gaussian_mlp_encoder.GaussianMLPModel',
            new=SimpleGaussianMLPModel):
        # Patch SimpleGaussianMLPModel._build so its log_std output is cast
        # to float32, keeping the encoder graph's dtype consistent.
        old_build = SimpleGaussianMLPModel._build

        def float32_build(this, obs_input, name):
            mean, log_std, std, dist = old_build(this, obs_input, name)
            return mean, tf.cast(log_std, tf.float32), std, dist

        SimpleGaussianMLPModel._build = float32_build

        embedding_spec = InOutSpec(
            input_space=akro.Box(low=np.zeros(task_num),
                                 high=np.ones(task_num)),
            output_space=akro.Box(low=np.zeros(latent_dim),
                                  high=np.ones(latent_dim)))
        encoder = GaussianMLPEncoder(embedding_spec)
        policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                                encoder=encoder)

        assert policy.encoder_distribution.dim == latent_dim

        inp_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 5))
        dist_sym = policy.encoder_dist_info_sym(inp_ph)
        dist = self.sess.run(dist_sym,
                             feed_dict={inp_ph: [np.random.random(5)]})

        expected_mean = np.full(latent_dim, 0.5)
        expected_log_std = np.full(latent_dim, np.log(0.5))
        assert np.allclose(dist['mean'], expected_mean)
        assert np.allclose(dist['log_std'], expected_log_std)

        # Undo the monkey patch so later tests see the original model.
        SimpleGaussianMLPModel._build = old_build
        SimpleGaussianMLPModel._dtype = np.float32
def test_get_latent(self):
    obs_dim, action_dim, task_num, latent_dim = (2, ), (2, ), 5, 2
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(
            'garage.tf.policies.'
            'gaussian_mlp_task_embedding_policy.GaussianMLPModel',
            new=SimpleGaussianMLPModel):
        embedding_spec = InOutSpec(
            input_space=akro.Box(low=np.zeros(task_num),
                                 high=np.ones(task_num)),
            output_space=akro.Box(low=np.zeros(latent_dim),
                                  high=np.ones(latent_dim)))
        encoder = GaussianMLPEncoder(embedding_spec)
        policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                                encoder=encoder)

        task_id = 3
        task_onehot = np.zeros(task_num)
        task_onehot[task_id] = 1
        latent, latent_info = policy.get_latent(task_onehot)
        assert latent.shape == (latent_dim, )
        assert latent_info['mean'].shape == (latent_dim, )
        assert latent_info['log_std'].shape == (latent_dim, )
def test_pickling(self):
    obs_dim, action_dim, task_num, latent_dim = (2, ), (2, ), 5, 2
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    embedding_spec = InOutSpec(
        input_space=akro.Box(low=np.zeros(task_num), high=np.ones(task_num)),
        output_space=akro.Box(low=np.zeros(latent_dim),
                              high=np.ones(latent_dim)))
    encoder = GaussianMLPEncoder(embedding_spec)
    policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                            encoder=encoder)

    pickled = pickle.dumps(policy)
    with tf.compat.v1.variable_scope('resumed'):
        unpickled = pickle.loads(pickled)
        assert hasattr(unpickled, '_f_dist_obs_latent')
        assert hasattr(unpickled, '_f_dist_obs_task')
def test_auxiliary(self):
    obs_dim, action_dim, task_num, latent_dim = (2, ), (2, ), 2, 2
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(
            'garage.tf.policies.'
            'gaussian_mlp_task_embedding_policy.GaussianMLPModel',
            new=SimpleGaussianMLPModel):
        embedding_spec = InOutSpec(
            input_space=akro.Box(low=np.zeros(task_num),
                                 high=np.ones(task_num)),
            output_space=akro.Box(low=np.zeros(latent_dim),
                                  high=np.ones(latent_dim)))
        encoder = GaussianMLPEncoder(embedding_spec)
        policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                                encoder=encoder)

        assert policy.distribution.dim == env.action_space.flat_dim
        assert policy.encoder == encoder
        assert policy.latent_space.flat_dim == latent_dim
        assert policy.task_space.flat_dim == task_num
        assert (policy.augmented_observation_space.flat_dim ==
                env.observation_space.flat_dim + task_num)
        assert policy.encoder_distribution.dim == latent_dim
def te_ppo_mt1_push(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO on the MetaWorld MT1 push task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    task_sampler = MetaWorldTaskSampler(mt1,
                                        'train',
                                        lambda env, _: normalize(env),
                                        add_env_onehot=False)
    envs = [env_up() for env_up in task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task * n_tasks
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with TFTrainer(snapshot_config=ctxt) as trainer:
        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec,
            features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        trainer.setup(algo,
                      env,
                      sampler_cls=LocalSampler,
                      sampler_args=None,
                      worker_class=TaskEmbeddingWorker)
        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
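# Hedged sketch (illustration only): the encoder spec produced by
# TEPPO.get_encoder_spec in the launcher above corresponds, roughly, to the
# manual construction used in the test fixture further below -- an InOutSpec
# mapping the one-hot task space to a unit-box latent space. The names
# `env` and `latent_length` are assumed to match the launcher; this is not
# the library's exact implementation.
latent_space = akro.Box(np.zeros(latent_length), np.ones(latent_length))
manual_task_embed_spec = InOutSpec(env.task_space, latent_space)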
def setup_method(self):
    super().setup_method()

    def circle(r, n):
        """Generate n points on a circle of radius r.

        Args:
            r (float): Radius of the circle.
            n (int): Number of points to generate.

        Yields:
            tuple(float, float): Coordinate of a point.

        """
        for t in np.arange(0, 2 * np.pi, 2 * np.pi / n):
            yield r * np.sin(t), r * np.cos(t)

    N = 4
    goals = circle(3.0, N)
    tasks = {
        str(i + 1): {
            'args': [],
            'kwargs': {
                'goal': g,
                'never_done': False,
                'done_bonus': 0.0,
            }
        }
        for i, g in enumerate(goals)
    }

    latent_length = 1
    inference_window = 2
    self.batch_size = 100 * len(tasks)
    self.policy_ent_coeff = 2e-2
    self.encoder_ent_coeff = 2.2e-3
    self.inference_ce_coeff = 5e-2
    self.max_path_length = 100
    embedding_init_std = 1.0
    embedding_max_std = 2.0
    embedding_min_std = 0.38
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    task_names = sorted(tasks.keys())
    task_args = [tasks[t]['args'] for t in task_names]
    task_kwargs = [tasks[t]['kwargs'] for t in task_names]
    task_envs = [
        GarageEnv(PointEnv(*t_args, **t_kwargs))
        for t_args, t_kwargs in zip(task_args, task_kwargs)
    ]
    self.env = env = MultiEnvWrapper(task_envs,
                                     round_robin_strategy,
                                     mode='vanilla')

    latent_lb = np.zeros(latent_length, )
    latent_ub = np.ones(latent_length, )
    latent_space = akro.Box(latent_lb, latent_ub)

    obs_lb, obs_ub = env.observation_space.bounds
    obs_lb_flat = env.observation_space.flatten(obs_lb)
    obs_ub_flat = env.observation_space.flatten(obs_ub)
    traj_lb = np.stack([obs_lb_flat] * inference_window)
    traj_ub = np.stack([obs_ub_flat] * inference_window)
    traj_space = akro.Box(traj_lb, traj_ub)

    task_embed_spec = InOutSpec(env.task_space, latent_space)
    traj_embed_spec = InOutSpec(traj_space, latent_space)

    self.inference = GaussianMLPEncoder(
        name='inference',
        embedding_spec=traj_embed_spec,
        hidden_sizes=[20, 10],
        std_share_network=True,
        init_std=2.0,
        output_nonlinearity=tf.nn.tanh,
        min_std=embedding_min_std,
    )

    task_encoder = GaussianMLPEncoder(
        name='embedding',
        embedding_spec=task_embed_spec,
        hidden_sizes=[20, 20],
        std_share_network=True,
        init_std=embedding_init_std,
        max_std=embedding_max_std,
        output_nonlinearity=tf.nn.tanh,
        min_std=embedding_min_std,
    )

    self.policy = GaussianMLPTaskEmbeddingPolicy(
        name='policy',
        env_spec=env.spec,
        encoder=task_encoder,
        hidden_sizes=[32, 16],
        std_share_network=True,
        max_std=policy_max_std,
        init_std=policy_init_std,
        min_std=policy_min_std,
    )

    self.baseline = LinearMultiFeatureBaseline(
        env_spec=env.spec, features=['observations', 'tasks', 'latents'])
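# Hedged illustration (not part of the fixture above): the goal layout the
# circle() helper produces with N = 4 and radius 3.0 -- one goal per task,
# evenly spaced around the circle. Values shown are approximate.
example_goals = [(3.0 * np.sin(t), 3.0 * np.cos(t))
                 for t in np.arange(0, 2 * np.pi, 2 * np.pi / 4)]
# example_goals ~= [(0.0, 3.0), (3.0, 0.0), (0.0, -3.0), (-3.0, 0.0)]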
def te_ppo_mt50(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO on the MetaWorld MT50 benchmark.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalTFRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    tasks = MT50.get_train_tasks().all_task_names
    envs = [
        normalize(GymEnv(MT50.from_task(task), max_episode_length=150))
        for task in tasks
    ]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='del-onehot')

    latent_length = 6
    inference_window = 6
    batch_size = batch_size_per_task * len(tasks)
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with LocalTFRunner(snapshot_config=ctxt) as runner:
        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec,
            features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_cls=LocalSampler,
                     sampler_args=None,
                     worker_class=TaskEmbeddingWorker)
        runner.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
def te_ppo_pointenv(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO with PointEnv.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)

    tasks = TASKS
    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task * len(TASKS)
    policy_ent_coeff = 1e-3
    encoder_ent_coeff = 1e-3
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = 2.0
    policy_min_std = None

    task_names = sorted(tasks.keys())
    task_args = [tasks[t]['args'] for t in task_names]
    task_kwargs = [tasks[t]['kwargs'] for t in task_names]

    with TFTrainer(snapshot_config=ctxt) as trainer:
        task_envs = [
            PointEnv(*t_args, **t_kwargs, max_episode_length=100)
            for t_args, t_kwargs in zip(task_args, task_kwargs)
        ]
        env = MultiEnvWrapper(task_envs,
                              round_robin_strategy,
                              mode='vanilla')

        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            std_output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=0.1,
            output_nonlinearity=tf.nn.tanh,
            std_output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec,
            features=['observations', 'tasks', 'latents'])

        sampler = LocalSampler(agents=policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=True,
                               worker_class=TaskEmbeddingWorker)

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     sampler=sampler,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        trainer.setup(algo, env)
        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
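# Hedged post-training sketch (not part of the launcher above): once trained,
# the policy can be queried per task through the interfaces exercised in the
# unit tests earlier in this document. `policy`, `env`, and `obs` are
# assumptions standing in for a trained policy, the multi-task environment,
# and a flattened observation.
task_onehot = np.zeros(env.task_space.flat_dim)
task_onehot[0] = 1

# Sample the learned task embedding for task 0 and act conditioned on it.
latent, latent_info = policy.get_latent(task_onehot)
action, _ = policy.get_action_given_latent(obs, latent)

# Equivalently, append the one-hot task to the flattened observation.
action, _ = policy.get_action(np.concatenate([obs.flatten(), task_onehot]))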
def test_get_action(self, mock_normal, obs_dim, task_num, latent_dim,
                    action_dim):
    mock_normal.return_value = 0.5
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(
            'garage.tf.policies.'
            'gaussian_mlp_task_embedding_policy.GaussianMLPModel',
            new=SimpleGaussianMLPModel):
        embedding_spec = InOutSpec(
            input_space=akro.Box(low=np.zeros(task_num),
                                 high=np.ones(task_num)),
            output_space=akro.Box(low=np.zeros(latent_dim),
                                  high=np.ones(latent_dim)))
        encoder = GaussianMLPEncoder(embedding_spec)
        policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                                encoder=encoder)

        env.reset()
        obs, _, _, _ = env.step(1)
        latent = np.random.random((latent_dim, ))
        task = np.zeros(task_num)
        task[0] = 1

        action1, prob1 = policy.get_action_given_latent(obs, latent)
        action2, prob2 = policy.get_action_given_task(obs, task)
        action3, prob3 = policy.get_action(
            np.concatenate([obs.flatten(), task]))

        expected_action = np.full(action_dim, 0.75)
        expected_mean = np.full(action_dim, 0.5)
        expected_log_std = np.full(action_dim, np.log(0.5))

        assert env.action_space.contains(action1)
        assert np.array_equal(action1, expected_action)
        assert np.array_equal(prob1['mean'], expected_mean)
        assert np.array_equal(prob1['log_std'], expected_log_std)

        assert env.action_space.contains(action2)
        assert np.array_equal(action2, expected_action)
        assert np.array_equal(prob2['mean'], expected_mean)
        assert np.array_equal(prob2['log_std'], expected_log_std)

        assert env.action_space.contains(action3)
        assert np.array_equal(action3, expected_action)
        assert np.array_equal(prob3['mean'], expected_mean)
        assert np.array_equal(prob3['log_std'], expected_log_std)

        obses, latents, tasks = [obs] * 3, [latent] * 3, [task] * 3
        aug_obses = [np.concatenate([obs.flatten(), task])] * 3
        action1n, prob1n = policy.get_actions_given_latents(obses, latents)
        action2n, prob2n = policy.get_actions_given_tasks(obses, tasks)
        action3n, prob3n = policy.get_actions(aug_obses)

        for action, mean, log_std in chain(
                zip(action1n, prob1n['mean'], prob1n['log_std']),
                zip(action2n, prob2n['mean'], prob2n['log_std']),
                zip(action3n, prob3n['mean'], prob3n['log_std'])):
            assert env.action_space.contains(action)
            assert np.array_equal(action, expected_action)
            assert np.array_equal(mean, expected_mean)
            assert np.array_equal(log_std, expected_log_std)
def test_dist_info(self, obs_dim, task_num, latent_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(
            'garage.tf.policies.'
            'gaussian_mlp_task_embedding_policy.GaussianMLPModel',
            new=SimpleGaussianMLPModel):
        embedding_spec = InOutSpec(
            input_space=akro.Box(low=np.zeros(task_num),
                                 high=np.ones(task_num)),
            output_space=akro.Box(low=np.zeros(latent_dim),
                                  high=np.ones(latent_dim)))
        encoder = GaussianMLPEncoder(embedding_spec)
        policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                                encoder=encoder)

        env.reset()
        obs, _, _, _ = env.step(1)
        task = np.zeros(task_num)
        task[0] = 1
        aug_obs = np.concatenate([obs.flatten(), task])
        latent = np.random.random(latent_dim)

        obs_dim = env.spec.observation_space.flat_dim
        obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_dim))
        task_ph = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, task_num))
        latent_ph = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, latent_dim))
        aug_obs_ph = tf.compat.v1.concat([obs_ph, task_ph], axis=1)

        dist0_sym = policy.dist_info_sym(aug_obs_ph, name='p0_sym')
        dist1_sym = policy.dist_info_sym_given_task(obs_ph,
                                                    task_ph,
                                                    name='p1_sym')
        dist2_sym = policy.dist_info_sym_given_latent(obs_ph,
                                                      latent_ph,
                                                      name='p2_sym')

        # Expected flattened outputs from the mocked model.
        expected_mean = [np.full(np.prod(action_dim), 0.5)]
        expected_log_std = [np.full(np.prod(action_dim), np.log(0.5))]

        prob0 = self.sess.run(dist0_sym,
                              feed_dict={aug_obs_ph: [aug_obs.flatten()]})
        prob1 = self.sess.run(dist1_sym,
                              feed_dict={
                                  obs_ph: [obs.flatten()],
                                  task_ph: [task]
                              })
        prob2 = self.sess.run(dist2_sym,
                              feed_dict={
                                  obs_ph: [obs.flatten()],
                                  latent_ph: [latent]
                              })
        prob3 = policy.dist_info(aug_obs)

        assert np.array_equal(prob0['mean'].flatten(), expected_mean[0])
        assert np.array_equal(prob0['log_std'].flatten(),
                              expected_log_std[0])
        assert np.array_equal(prob1['mean'], expected_mean)
        assert np.array_equal(prob1['log_std'], expected_log_std)
        assert np.array_equal(prob2['mean'], expected_mean)
        assert np.array_equal(prob2['log_std'], expected_log_std)
        assert np.array_equal(prob3['mean'].flatten(), expected_mean[0])
        assert np.array_equal(prob3['log_std'].flatten(),
                              expected_log_std[0])