def test_new_time_step(sample_data):
    s = TimeStep(**sample_data)
    assert s.env_spec is sample_data['env_spec']
    assert s.observation is sample_data['observation']
    assert s.action is sample_data['action']
    assert s.reward is sample_data['reward']
    assert s.terminal is sample_data['terminal']
    assert s.env_info is sample_data['env_info']
    assert s.agent_info is sample_data['agent_info']
    del s

    obs_space = akro.Box(low=-1, high=10, shape=(4, 3, 2), dtype=np.float32)
    act_space = akro.Box(low=-1, high=10, shape=(4, 2), dtype=np.float32)
    env_spec = EnvSpec(obs_space, act_space)
    sample_data['env_spec'] = env_spec

    obs_space = akro.Box(low=-1000,
                         high=1000,
                         shape=(4, 3, 2),
                         dtype=np.float32)
    act_space = akro.Box(low=-1000,
                         high=1000,
                         shape=(4, 2),
                         dtype=np.float32)
    sample_data['observation'] = obs_space.sample()
    sample_data['next_observation'] = obs_space.sample()
    sample_data['action'] = act_space.sample()
    s = TimeStep(**sample_data)

    assert s.observation is sample_data['observation']
    assert s.next_observation is sample_data['next_observation']
    assert s.action is sample_data['action']

def get_env_spec(cls, env_spec, latent_dim, module):
    """Get environment specs of encoder with latent dimension.

    Args:
        env_spec (metarl.envs.EnvSpec): Environment specs.
        latent_dim (int): Latent dimension.
        module (str): Module to get environment specs for, either
            'encoder' or 'vf'.

    Returns:
        metarl.envs.InOutSpec or metarl.envs.EnvSpec: Module environment
            specs with latent dimension. 'encoder' yields an InOutSpec;
            'vf' yields an EnvSpec.

    """
    obs_dim = int(np.prod(env_spec.observation_space.shape))
    action_dim = int(np.prod(env_spec.action_space.shape))
    if module == 'encoder':
        in_dim = obs_dim + action_dim + 1
        out_dim = latent_dim * 2
    elif module == 'vf':
        in_dim = obs_dim
        out_dim = latent_dim
    in_space = akro.Box(low=-1, high=1, shape=(in_dim, ), dtype=np.float32)
    out_space = akro.Box(low=-1, high=1, shape=(out_dim, ), dtype=np.float32)
    if module == 'encoder':
        spec = InOutSpec(in_space, out_space)
    elif module == 'vf':
        spec = EnvSpec(in_space, out_space)
    return spec

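# Hedged sketch (not part of the original source): how the encoder spec
# dimensions above work out for a hypothetical environment with observation
# shape (3, ), action shape (2, ) and latent_dim=5. The reward contributes
# the extra `+ 1` to the input, and the output holds a mean and a std per
# latent dimension. The helper name is illustrative only.
def _example_encoder_spec_dims():
    env_spec = EnvSpec(
        akro.Box(low=-1, high=1, shape=(3, ), dtype=np.float32),
        akro.Box(low=-1, high=1, shape=(2, ), dtype=np.float32))
    latent_dim = 5
    obs_dim = int(np.prod(env_spec.observation_space.shape))
    action_dim = int(np.prod(env_spec.action_space.shape))
    in_dim = obs_dim + action_dim + 1  # observation + action + reward
    out_dim = latent_dim * 2  # latent mean and latent std
    assert (in_dim, out_dim) == (6, 10)
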
def sample_data():
    # spaces
    obs_space = gym.spaces.Box(low=1,
                               high=10,
                               shape=(4, 3, 2),
                               dtype=np.float32)
    act_space = gym.spaces.MultiDiscrete([2, 5])
    env_spec = EnvSpec(obs_space, act_space)

    # generate data
    obs = obs_space.sample()
    next_obs = obs_space.sample()
    act = act_space.sample()
    rew = 10.0
    terms = False

    # env_infos
    env_infos = dict()
    env_infos['goal'] = np.array([[1, 1]])
    env_infos['TimeLimit.truncated'] = not terms

    # agent_infos
    agent_infos = dict()
    agent_infos['prev_action'] = act

    return {
        'env_spec': env_spec,
        'observation': obs,
        'next_observation': next_obs,
        'action': act,
        'reward': rew,
        'terminal': terms,
        'env_info': env_infos,
        'agent_info': agent_infos,
    }

def test_log_multitask_performance_task_id():
    lengths = np.array([10, 5, 1, 1])
    batch = TrajectoryBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        terminals=np.array(
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1], dtype=bool),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool),
            'task_id':
            np.array([1] * 10 + [3] * 5 + [1] + [4])
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_multitask_performance(7, batch, 0.8, {
        1: 'env1',
        3: 'env2',
        4: 'env3',
        5: 'env4'
    })
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}

    assert res['env1/Iteration'] == 7
    assert res['env2/Iteration'] == 7
    assert res['env3/Iteration'] == 7
    assert res['env4/Iteration'] == 7
    assert res['env1/NumTrajs'] == 2
    assert res['env2/NumTrajs'] == 1
    assert res['env3/NumTrajs'] == 1
    assert res['env4/NumTrajs'] == 0
    assert math.isclose(res['env1/SuccessRate'], 0.5)
    assert math.isclose(res['env2/SuccessRate'], 1.0)
    assert math.isclose(res['env3/SuccessRate'], 1.0)
    assert math.isnan(res['env4/SuccessRate'])
    assert math.isnan(res['env4/AverageReturn'])

def __init__(self, env, task_index, n_total_tasks):
    assert 0 <= task_index < n_total_tasks
    super().__init__(env)
    self._task_index = task_index
    self._n_total_tasks = n_total_tasks
    env_lb = self.env.observation_space.low
    env_ub = self.env.observation_space.high
    one_hot_ub = np.ones(self._n_total_tasks)
    one_hot_lb = np.zeros(self._n_total_tasks)
    self.observation_space = akro.Box(np.concatenate([env_lb, one_hot_lb]),
                                      np.concatenate([env_ub, one_hot_ub]))
    self.__spec = EnvSpec(action_space=self.action_space,
                          observation_space=self.observation_space)

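# Hedged sketch (not part of the original source): the bound concatenation the
# wrapper above performs, shown for a hypothetical base env with a
# 2-dimensional observation space in [-1, 1] and n_total_tasks=3. The wrapped
# space appends a [0, 1]-bounded one-hot block to the base bounds, so
# observations are presumably augmented the same way, e.g. via
# np.concatenate([obs, one_hot]). The helper name is illustrative only.
def _example_one_hot_bounds():
    env_lb, env_ub = np.full(2, -1.), np.full(2, 1.)
    n_total_tasks = 3
    low = np.concatenate([env_lb, np.zeros(n_total_tasks)])
    high = np.concatenate([env_ub, np.ones(n_total_tasks)])
    augmented_space = akro.Box(low, high)
    assert augmented_space.shape == (5, )
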
def test_act_env_spec_mismatch_time_step(sample_data):
    with pytest.raises(ValueError,
                       match='action must conform to action_space'):
        sample_data['action'] = sample_data['action'][:-1]
        s = TimeStep(**sample_data)
        del s

    obs_space = akro.Box(low=1, high=10, shape=(4, 3, 2), dtype=np.float32)
    act_space = akro.Discrete(5)
    env_spec = EnvSpec(obs_space, act_space)
    sample_data['env_spec'] = env_spec

    with pytest.raises(ValueError,
                       match='action should have the same dimensionality'):
        sample_data['action'] = sample_data['action'][:-1]
        s = TimeStep(**sample_data)
        del s

def test_obs_env_spec_mismatch_time_step(sample_data):
    with pytest.raises(ValueError,
                       match='observation must conform to observation_space'):
        sample_data['observation'] = sample_data['observation'][:, :, :1]
        s = TimeStep(**sample_data)
        del s

    obs_space = akro.Box(low=1, high=10, shape=(4, 5, 2), dtype=np.float32)
    act_space = gym.spaces.MultiDiscrete([2, 5])
    env_spec = EnvSpec(obs_space, act_space)
    sample_data['env_spec'] = env_spec

    with pytest.raises(
            ValueError,
            match='observation should have the same dimensionality'):
        sample_data['observation'] = sample_data['observation'][:, :, :1]
        s = TimeStep(**sample_data)
        del s

def test_log_performance():
    lengths = np.array([10, 5, 1, 1])
    batch = TrajectoryBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        terminals=np.array(
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1], dtype=bool),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool)
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_performance(7, batch, 0.8, prefix='test_log_performance')
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}

    assert res['test_log_performance/Iteration'] == 7
    assert res['test_log_performance/NumTrajs'] == 4
    assert math.isclose(res['test_log_performance/SuccessRate'], 0.75)
    assert math.isclose(res['test_log_performance/CompletionRate'], 0.5)
    assert math.isclose(res['test_log_performance/AverageDiscountedReturn'],
                        1.1131040640673113)
    assert math.isclose(res['test_log_performance/AverageReturn'],
                        2.1659965525)
    assert math.isclose(res['test_log_performance/StdReturn'],
                        2.354067152038576)

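# Hedged sketch (not part of the original source): how the expected
# AverageDiscountedReturn above can be recomputed by hand. Each trajectory's
# discounted return is sum_t gamma**t * r_t with gamma=0.8 (the discount
# passed to log_performance), and the logged value is assumed to be the mean
# over the 4 trajectories. The helper name is illustrative only.
def _example_average_discounted_return():
    rewards = np.array([
        0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
        0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
        0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
        0.24203526, 0.43328910
    ])
    lengths = np.array([10, 5, 1, 1])
    gamma = 0.8
    returns = []
    start = 0
    for length in lengths:
        traj_rewards = rewards[start:start + length]
        discounts = gamma**np.arange(length)
        returns.append(np.sum(discounts * traj_rewards))
        start += length
    assert math.isclose(np.mean(returns), 1.1131040640673113, rel_tol=1e-6)
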
def traj_data():
    # spaces
    obs_space = gym.spaces.Box(low=1,
                               high=np.inf,
                               shape=(4, 3, 2),
                               dtype=np.float32)
    act_space = gym.spaces.MultiDiscrete([2, 5])
    env_spec = EnvSpec(obs_space, act_space)

    # generate data
    lens = np.array([10, 20, 7, 25, 25, 40, 10, 5])
    n_t = lens.sum()
    obs = np.stack([obs_space.low] * n_t)
    last_obs = np.stack([obs_space.low] * len(lens))
    act = np.stack([[1, 3]] * n_t)
    rew = np.arange(n_t)
    terms = np.zeros(n_t, dtype=bool)
    terms[np.cumsum(lens) - 1] = True  # set terminal bits

    # env_infos
    env_infos = dict()
    env_infos['goal'] = np.stack([[1, 1]] * n_t)
    env_infos['foo'] = np.arange(n_t)

    # agent_infos
    agent_infos = dict()
    agent_infos['prev_action'] = act
    agent_infos['hidden'] = np.arange(n_t)

    return {
        'env_spec': env_spec,
        'observations': obs,
        'last_observations': last_obs,
        'actions': act,
        'rewards': rew,
        'terminals': terms,
        'env_infos': env_infos,
        'agent_infos': agent_infos,
        'lengths': lens,
    }

def augment_env_spec(cls, env_spec, latent_dim):
    """Augment the environment spec's observation space by the latent dimension.

    Args:
        env_spec (metarl.envs.EnvSpec): Environment specs to be augmented.
        latent_dim (int): Latent dimension.

    Returns:
        metarl.envs.EnvSpec: Augmented environment specs.

    """
    obs_dim = int(np.prod(env_spec.observation_space.shape))
    action_dim = int(np.prod(env_spec.action_space.shape))
    aug_obs = akro.Box(low=-1,
                       high=1,
                       shape=(obs_dim + latent_dim, ),
                       dtype=np.float32)
    aug_act = akro.Box(low=-1,
                       high=1,
                       shape=(action_dim, ),
                       dtype=np.float32)
    return EnvSpec(aug_obs, aug_act)

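# Hedged sketch (not part of the original source): the effect of the
# augmentation above for a hypothetical env spec with observation shape (3, ),
# action shape (2, ) and latent_dim=5. Only the flattened observation space
# grows; the action space keeps its flattened dimensionality. The helper name
# is illustrative only.
def _example_augmented_spec():
    env_spec = EnvSpec(
        akro.Box(low=-1, high=1, shape=(3, ), dtype=np.float32),
        akro.Box(low=-1, high=1, shape=(2, ), dtype=np.float32))
    latent_dim = 5
    obs_dim = int(np.prod(env_spec.observation_space.shape))
    action_dim = int(np.prod(env_spec.action_space.shape))
    aug_spec = EnvSpec(
        akro.Box(low=-1, high=1, shape=(obs_dim + latent_dim, ),
                 dtype=np.float32),
        akro.Box(low=-1, high=1, shape=(action_dim, ), dtype=np.float32))
    assert aug_spec.observation_space.shape == (8, )
    assert aug_spec.action_space.shape == (2, )
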
def run_task(snapshot_config, v, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        v (dict): Variant parameters for the experiment.
        *_ (object): Ignored by this function.

    """
    v = SimpleNamespace(**v)

    task_names = sorted(v.tasks.keys())
    task_args = [v.tasks[t]['args'] for t in task_names]
    task_kwargs = [v.tasks[t]['kwargs'] for t in task_names]

    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        task_env_cls = PointEnv
        task_envs = [
            TfEnv(task_env_cls(*t_args, **t_kwargs))
            for t_args, t_kwargs in zip(task_args, task_kwargs)
        ]
        env = MultiEnvWrapper(task_envs, round_robin_strategy)

        # Latent space and embedding specs
        # TODO(gh/10): this should probably be done in Embedding or Algo
        latent_lb = np.zeros(v.latent_length, )
        latent_ub = np.ones(v.latent_length, )
        latent_space = akro.Box(latent_lb, latent_ub)

        # trajectory space is (TRAJ_ENC_WINDOW, act_obs) where act_obs is a
        # stacked vector of flattened actions and observations
        act_lb, act_ub = env.action_space.bounds
        act_lb_flat = env.action_space.flatten(act_lb)
        act_ub_flat = env.action_space.flatten(act_ub)
        obs_lb, obs_ub = env.observation_space.bounds
        obs_lb_flat = env.observation_space.flatten(obs_lb)
        obs_ub_flat = env.observation_space.flatten(obs_ub)
        # act_obs_lb = np.concatenate([act_lb_flat, obs_lb_flat])
        # act_obs_ub = np.concatenate([act_ub_flat, obs_ub_flat])
        act_obs_lb = obs_lb_flat
        act_obs_ub = obs_ub_flat
        # act_obs_lb = act_lb_flat
        # act_obs_ub = act_ub_flat
        traj_lb = np.stack([act_obs_lb] * v.inference_window)
        traj_ub = np.stack([act_obs_ub] * v.inference_window)
        traj_space = akro.Box(traj_lb, traj_ub)

        task_embed_spec = EmbeddingSpec(env.task_space, latent_space)
        traj_embed_spec = EmbeddingSpec(traj_space, latent_space)
        task_obs_space = concat_spaces(env.task_space, env.observation_space)
        env_spec_embed = EnvSpec(task_obs_space, env.action_space)

        inference = GaussianMLPEmbedding(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),  # same size as policy in Karol's paper
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=v.embedding_min_std,
        )

        # Embeddings
        task_embedding = GaussianMLPEmbedding(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=v.embedding_init_std,
            max_std=v.embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=v.embedding_min_std,
        )

        # Multitask policy
        policy = GaussianMLPMultitaskPolicy(
            name='policy',
            env_spec=env.spec,
            task_space=env.task_space,
            embedding=task_embedding,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=v.policy_max_std,
            init_std=v.policy_init_std,
            min_std=v.policy_min_std,
        )

        baseline = MultiTaskLinearFeatureBaseline(env_spec=env.spec)

        algo = PPOTaskEmbedding(env_spec=env.spec,
                                policy=policy,
                                baseline=baseline,
                                inference=inference,
                                max_path_length=v.max_path_length,
                                discount=0.99,
                                lr_clip_range=0.2,
                                policy_ent_coeff=v.policy_ent_coeff,
                                embedding_ent_coeff=v.embedding_ent_coeff,
                                inference_ce_coeff=v.inference_ce_coeff,
                                entropy_method='max',
                                stop_entropy_gradient=True,
                                use_softplus_entropy=True,
                                optimizer_args=dict(
                                    batch_size=32,
                                    max_epochs=10,
                                ),
                                inference_optimizer_args=dict(
                                    batch_size=32,
                                    max_epochs=10,
                                ),
                                center_adv=True,
                                stop_ce_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_cls=LocalSampler,
                     sampler_args=None,
                     worker_class=TaskEmbeddingWorker)
        runner.train(n_epochs=600, batch_size=v.batch_size, plot=False)

def __init__(self, env):
    super().__init__(env)
    action_space = akro.from_gym(self.env.action_space)
    observation_space = self._create_rl2_obs_space()
    self._spec = EnvSpec(action_space=action_space,
                         observation_space=observation_space)

def __init__(self, random=True):
    super().__init__(random)
    self.spec = EnvSpec(action_space=self.action_space,
                        observation_space=self.observation_space)

def test_methods():
    """Test PEARLWorker methods."""
    env_spec = MetaRLEnv(DummyBoxEnv())
    latent_dim = 5
    latent_space = akro.Box(low=-1,
                            high=1,
                            shape=(latent_dim, ),
                            dtype=np.float32)

    # add latent space to observation space to create a new space
    augmented_obs_space = akro.Tuple(
        (env_spec.observation_space, latent_space))
    augmented_env_spec = EnvSpec(augmented_obs_space, env_spec.action_space)

    obs_dim = int(np.prod(env_spec.observation_space.shape))
    action_dim = int(np.prod(env_spec.action_space.shape))
    reward_dim = 1
    encoder_input_dim = obs_dim + action_dim + reward_dim
    encoder_output_dim = latent_dim * 2
    encoder_hidden_sizes = (3, 2, encoder_output_dim)

    context_encoder = MLPEncoder(input_dim=encoder_input_dim,
                                 output_dim=encoder_output_dim,
                                 hidden_nonlinearity=None,
                                 hidden_sizes=encoder_hidden_sizes,
                                 hidden_w_init=nn.init.ones_,
                                 output_w_init=nn.init.ones_)

    policy = TanhGaussianMLPPolicy(env_spec=augmented_env_spec,
                                   hidden_sizes=(3, 5, 7),
                                   hidden_nonlinearity=F.relu,
                                   output_nonlinearity=None)

    context_policy = ContextConditionedPolicy(latent_dim=latent_dim,
                                              context_encoder=context_encoder,
                                              policy=policy,
                                              use_information_bottleneck=True,
                                              use_next_obs=False)

    max_path_length = 20
    worker1 = PEARLWorker(seed=1,
                          max_path_length=max_path_length,
                          worker_number=1)
    worker1.update_agent(context_policy)
    worker1.update_env(env_spec)
    rollouts = worker1.rollout()

    assert rollouts.observations.shape == (max_path_length, obs_dim)
    assert rollouts.actions.shape == (max_path_length, action_dim)
    assert rollouts.rewards.shape == (max_path_length, )

    worker2 = PEARLWorker(seed=1,
                          max_path_length=max_path_length,
                          worker_number=1,
                          deterministic=True,
                          accum_context=True)
    worker2.update_agent(context_policy)
    worker2.update_env(env_spec)
    rollouts = worker2.rollout()

    assert context_policy.context.shape == (1, max_path_length,
                                            encoder_input_dim)
    assert rollouts.observations.shape == (max_path_length, obs_dim)
    assert rollouts.actions.shape == (max_path_length, action_dim)
    assert rollouts.rewards.shape == (max_path_length, )
