@classmethod
def get_env_spec(cls, env_spec, latent_dim, module,
                 use_information_bottleneck=False):
    """Get environment specs of encoder with latent dimension.

    Args:
        env_spec (garage.envs.EnvSpec): Environment specs.
        latent_dim (int): Latent dimension.
        module (str): Module to get environment specs for, either
            'encoder' or 'vf'.
        use_information_bottleneck (bool): If True, the encoder outputs
            both the mean and the variance of the latent distribution,
            so its output dimension is doubled.

    Returns:
        garage.envs.InOutSpec or garage.envs.EnvSpec: Module environment
            specs with latent dimension.

    """
    obs_dim = int(np.prod(env_spec.observation_space.shape))
    action_dim = int(np.prod(env_spec.action_space.shape))
    if module == 'encoder':
        # The encoder consumes (observation, action, reward) transitions.
        in_dim = obs_dim + action_dim + 1
        out_dim = latent_dim
        if use_information_bottleneck:
            out_dim = out_dim * 2
    elif module == 'vf':
        in_dim = obs_dim
        out_dim = latent_dim
    in_space = akro.Box(low=-1, high=1, shape=(in_dim, ), dtype=np.float32)
    out_space = akro.Box(low=-1,
                         high=1,
                         shape=(out_dim, ),
                         dtype=np.float32)
    if module == 'encoder':
        spec = InOutSpec(in_space, out_space)
    elif module == 'vf':
        spec = EnvSpec(in_space, out_space)
    return spec
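# --- Usage sketch (illustrative, not part of the source above) ---
# Assuming this classmethod lives on an algorithm class such as PEARL and
# `env` is any garage environment (both names are assumptions here), the
# encoder spec could be built like so. With use_information_bottleneck=True
# the output space doubles to hold the mean and variance of q(z | context):
#
# encoder_spec = PEARL.get_env_spec(env.spec,
#                                   latent_dim=5,
#                                   module='encoder',
#                                   use_information_bottleneck=True)
# assert encoder_spec.output_space.flat_dim == 2 * 5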
@classmethod
def augment_env_spec(cls, env_spec, latent_dim):
    """Augment environment by a size of latent dimension.

    Args:
        env_spec (garage.envs.EnvSpec): Environment specs to be augmented.
        latent_dim (int): Latent dimension.

    Returns:
        garage.envs.EnvSpec: Augmented environment specs.

    """
    obs_dim = int(np.prod(env_spec.observation_space.shape))
    action_dim = int(np.prod(env_spec.action_space.shape))
    aug_obs = akro.Box(low=-1,
                       high=1,
                       shape=(obs_dim + latent_dim, ),
                       dtype=np.float32)
    aug_act = akro.Box(low=-1,
                       high=1,
                       shape=(action_dim, ),
                       dtype=np.float32)
    return EnvSpec(aug_obs, aug_act)
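# --- Usage sketch (illustrative, not part of the source above) ---
# The policy conditions on observations concatenated with the inferred
# latent, so the augmented spec simply widens the observation space by
# latent_dim (again assuming a PEARL-style caller and a garage env):
#
# aug_spec = PEARL.augment_env_spec(env.spec, latent_dim=5)
# assert (aug_spec.observation_space.flat_dim
#         == env.spec.observation_space.flat_dim + 5)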
def setup_method(self):
    """Setup for all test methods."""
    self.latent_dim = 5
    self.env_spec = GymEnv(DummyBoxEnv())
    latent_space = akro.Box(low=-1,
                            high=1,
                            shape=(self.latent_dim, ),
                            dtype=np.float32)

    # add latent space to observation space to create a new space
    augmented_obs_space = akro.Tuple(
        (self.env_spec.observation_space, latent_space))
    augmented_env_spec = EnvSpec(augmented_obs_space,
                                 self.env_spec.action_space)

    self.obs_dim = self.env_spec.observation_space.flat_dim
    self.action_dim = self.env_spec.action_space.flat_dim
    reward_dim = 1
    self.encoder_input_dim = self.obs_dim + self.action_dim + reward_dim
    # With an information bottleneck the encoder outputs the mean and
    # variance of the latent distribution, hence twice the latent size.
    encoder_output_dim = self.latent_dim * 2
    encoder_hidden_sizes = (3, 2, encoder_output_dim)

    context_encoder = MLPEncoder(input_dim=self.encoder_input_dim,
                                 output_dim=encoder_output_dim,
                                 hidden_nonlinearity=None,
                                 hidden_sizes=encoder_hidden_sizes,
                                 hidden_w_init=nn.init.ones_,
                                 output_w_init=nn.init.ones_)

    context_policy = TanhGaussianMLPPolicy(env_spec=augmented_env_spec,
                                           hidden_sizes=(3, 5, 7),
                                           hidden_nonlinearity=F.relu,
                                           output_nonlinearity=None)

    self.module = ContextConditionedPolicy(
        latent_dim=self.latent_dim,
        context_encoder=context_encoder,
        policy=context_policy,
        use_information_bottleneck=True,
        use_next_obs=False)
def batch_data():
    # spaces
    obs_space = gym.spaces.Box(low=1,
                               high=np.inf,
                               shape=(4, 3, 2),
                               dtype=np.float32)
    act_space = gym.spaces.MultiDiscrete([2, 5])
    env_spec = EnvSpec(obs_space, act_space)

    # generate data
    batch_size = 2
    obs = np.stack([obs_space.low] * batch_size)
    next_obs = np.stack([obs_space.low] * batch_size)
    act = np.stack([[1, 3]] * batch_size)
    rew = np.arange(batch_size)
    step_types = np.array([StepType.FIRST, StepType.TERMINAL],
                          dtype=StepType)

    # env_infos
    env_infos = dict()
    env_infos['goal'] = np.stack([[1, 1]] * batch_size)
    env_infos['foo'] = np.arange(batch_size)

    # agent_infos
    agent_infos = dict()
    agent_infos['prev_action'] = act
    agent_infos['hidden'] = np.arange(batch_size)

    return {
        'env_spec': env_spec,
        'observations': obs,
        'next_observations': next_obs,
        'actions': act,
        'rewards': rew,
        'env_infos': env_infos,
        'agent_infos': agent_infos,
        'step_types': step_types
    }
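# --- Usage sketch (illustrative, not part of the source above) ---
# The plural keys mirror the constructor arguments of garage's
# TimeStepBatch, so a test consuming this helper would plausibly unpack
# it directly (assuming the fields match this version of garage):
#
# batch = TimeStepBatch(**batch_data())
# assert batch.observations.shape == (2, 4, 3, 2)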
def sample_data():
    # spaces
    obs_space = gym.spaces.Box(low=1,
                               high=10,
                               shape=(4, 3, 2),
                               dtype=np.float32)
    act_space = gym.spaces.MultiDiscrete([2, 5])
    env_spec = EnvSpec(obs_space, act_space)

    # generate data
    obs = obs_space.sample()
    next_obs = obs_space.sample()
    act = act_space.sample()
    rew = 10.0
    step_type = StepType.FIRST

    # env_infos
    env_infos = dict()
    env_infos['goal'] = np.array([[1, 1]])
    env_infos['TimeLimit.truncated'] = (step_type == StepType.TIMEOUT)

    # agent_infos
    agent_infos = dict()
    agent_infos['prev_action'] = act

    return {
        'env_spec': env_spec,
        'observation': obs,
        'next_observation': next_obs,
        'action': act,
        'reward': rew,
        'env_info': env_infos,
        'agent_info': agent_infos,
        'step_type': step_type,
        'episode_info': dict(),
    }
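# --- Usage sketch (illustrative, not part of the source above) ---
# Unlike batch_data(), the keys here are singular and line up with the
# fields of a single garage TimeStep, so a test would plausibly build one
# step as (assuming the fields match this version of garage):
#
# step = TimeStep(**sample_data())
# assert step.step_type is StepType.FIRST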
def __init__(self, env, name=None):
    """Create a DMControlEnv.

    Args:
        env (dm_control.suite.Task): The wrapped dm_control environment.
        name (str): Name of the environment.

    """
    self._env = env
    self._name = name or type(env.task).__name__
    self._viewer = None
    self._step_cnt = None
    self._max_episode_length = self._env._step_limit

    # action space
    action_spec = self._env.action_spec()
    if (len(action_spec.shape) == 1) and (-np.inf in action_spec.minimum
                                          or np.inf in action_spec.maximum):
        # Unbounded 1-D action specs are exposed as a discrete space.
        self._action_space = akro.Discrete(np.prod(action_spec.shape))
    else:
        self._action_space = akro.Box(low=action_spec.minimum,
                                      high=action_spec.maximum,
                                      dtype=np.float32)

    # observation space
    flat_dim = _flat_shape(self._env.observation_spec())
    self._observation_space = akro.Box(low=-np.inf,
                                       high=np.inf,
                                       shape=[flat_dim],
                                       dtype=np.float32)

    # spec
    self._spec = EnvSpec(action_space=self.action_space,
                         observation_space=self.observation_space,
                         max_episode_length=self._max_episode_length)
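# _flat_shape is called above but not shown here. A minimal sketch of
# what it would need to do, assuming the dm_control observation spec is
# a dict mapping names to array specs (the body below is an assumption,
# not the library's implementation):
#
# def _flat_shape(observation_spec):
#     """Sum of the flattened sizes of every entry in the spec."""
#     return int(
#         np.sum([np.prod(v.shape) for v in observation_spec.values()]))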
@property
def spec(self):
    """EnvSpec: the environment specification."""
    return EnvSpec(observation_space=self.observation_space,
                   action_space=self.action_space)
def get_train_test_data():
    # `matrices` is defined earlier in the test module (not shown here).
    data = [np.sin(matrices[i]) for i in range(100)]
    obs = [{'observations': [x], 'returns': [np.mean(x)]} for x in data]
    observations = np.concatenate([p['observations'] for p in obs])
    returns = np.concatenate([p['returns'] for p in obs])
    returns = returns.reshape((-1, 1))

    paths = {
        'observations': [np.sin(matrices[i]) for i in range(100, 110)]
    }
    expected = [[np.mean(x)] for x in paths['observations']]
    return (obs, observations, returns), (paths, expected)


test_env_spec = EnvSpec(observation_space=akro.Box(low=-1,
                                                   high=1,
                                                   shape=(10, 10, 3)),
                        action_space=None)


class TestGaussianCNNBaseline(TfGraphTestCase):

    @pytest.mark.large
    def test_fit_normalized(self):
        gcr = GaussianCNNBaseline(env_spec=test_env_spec,
                                  filters=((3, (3, 3)), (6, (3, 3))),
                                  strides=(1, 1),
                                  padding='SAME',
                                  hidden_sizes=(32, ),
                                  adaptive_std=False,
                                  use_trust_region=True)
        train_data, test_data = get_train_test_data()
@property
def spec(self):
    """EnvSpec: The environment specification."""
    return EnvSpec(self.observation_space, self._env.spec.action_space)
def test_methods():
    """Test PEARLWorker methods."""
    env_spec = GymEnv(DummyBoxEnv())
    latent_dim = 5
    latent_space = akro.Box(low=-1,
                            high=1,
                            shape=(latent_dim, ),
                            dtype=np.float32)

    # add latent space to observation space to create a new space
    augmented_obs_space = akro.Tuple(
        (env_spec.observation_space, latent_space))
    augmented_env_spec = EnvSpec(augmented_obs_space, env_spec.action_space)

    obs_dim = int(np.prod(env_spec.observation_space.shape))
    action_dim = int(np.prod(env_spec.action_space.shape))
    reward_dim = 1
    encoder_input_dim = obs_dim + action_dim + reward_dim
    encoder_output_dim = latent_dim * 2
    encoder_hidden_sizes = (3, 2, encoder_output_dim)

    context_encoder = MLPEncoder(input_dim=encoder_input_dim,
                                 output_dim=encoder_output_dim,
                                 hidden_nonlinearity=None,
                                 hidden_sizes=encoder_hidden_sizes,
                                 hidden_w_init=nn.init.ones_,
                                 output_w_init=nn.init.ones_)

    policy = TanhGaussianMLPPolicy(env_spec=augmented_env_spec,
                                   hidden_sizes=(3, 5, 7),
                                   hidden_nonlinearity=F.relu,
                                   output_nonlinearity=None)

    context_policy = ContextConditionedPolicy(
        latent_dim=latent_dim,
        context_encoder=context_encoder,
        policy=policy,
        use_information_bottleneck=True,
        use_next_obs=False)

    max_episode_length = 20
    worker1 = PEARLWorker(seed=1,
                          max_episode_length=max_episode_length,
                          worker_number=1)
    worker1.update_agent(context_policy)
    worker1.update_env(env_spec)
    episodes = worker1.rollout()

    assert episodes.observations.shape == (max_episode_length, obs_dim)
    assert episodes.actions.shape == (max_episode_length, action_dim)
    assert episodes.rewards.shape == (max_episode_length, )

    worker2 = PEARLWorker(seed=1,
                          max_episode_length=max_episode_length,
                          worker_number=1,
                          deterministic=True,
                          accum_context=True)
    worker2.update_agent(context_policy)
    worker2.update_env(env_spec)
    episodes = worker2.rollout()

    assert context_policy.context.shape == (1, max_episode_length,
                                            encoder_input_dim)
    assert episodes.observations.shape == (max_episode_length, obs_dim)
    assert episodes.actions.shape == (max_episode_length, action_dim)
    assert episodes.rewards.shape == (max_episode_length, )