def __init__(self, env, max_obs_dim=None, random_init=None):
    super().__init__(env)
    if random_init is not None:
        # Walk down the wrapper chain until the env exposing `active_env`
        # is reached, then forward the random_init flag to it.
        _env = env
        while not hasattr(_env, 'active_env'):
            _env = _env._env
        _env.active_env.random_init = random_init
    self._max_obs_dim = max_obs_dim
    action_space = akro.from_gym(self.env.action_space)
    observation_space = self._create_rl2_obs_space(env)
    self._spec = EnvSpec(action_space=action_space,
                         observation_space=observation_space)
def __init__(self, env, task_index, n_total_tasks):
    assert 0 <= task_index < n_total_tasks
    super().__init__(env)
    self._task_index = task_index
    self._n_total_tasks = n_total_tasks
    env_lb = self.env.observation_space.low
    env_ub = self.env.observation_space.high
    # Extend the observation bounds with bounds for the one-hot task vector.
    one_hot_ub = np.ones(self._n_total_tasks)
    one_hot_lb = np.zeros(self._n_total_tasks)
    self.observation_space = akro.Box(
        np.concatenate([env_lb, one_hot_lb]),
        np.concatenate([env_ub, one_hot_ub]))
    self.__spec = EnvSpec(action_space=self.action_space,
                          observation_space=self.observation_space)
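
A minimal, self-contained sketch of how an observation could be augmented with the one-hot task vector implied by the Box bounds built above. The helper name `append_one_hot` and the concrete dimensions are illustrative assumptions, not part of the wrapper's API.

import numpy as np

def append_one_hot(obs, task_index, n_total_tasks):
    """Return `obs` with a one-hot encoding of `task_index` appended."""
    one_hot = np.zeros(n_total_tasks)
    one_hot[task_index] = 1.0
    return np.concatenate([obs, one_hot])

# A 3-dim observation for task 1 of 4 becomes a 7-dim observation, matching
# the concatenated low/high bounds used for the wrapper's observation_space.
augmented = append_one_hot(np.array([0.1, -0.2, 0.3]),
                           task_index=1,
                           n_total_tasks=4)
assert augmented.shape == (7, )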
def __init__(self, env=None, env_name=''):
    # Needed for deserialization
    self._env_name = env_name
    self._env = env
    if env_name:
        super().__init__(gym.make(env_name))
    else:
        super().__init__(env)
    self.action_space = akro.from_gym(self.env.action_space)
    self.observation_space = akro.from_gym(self.env.observation_space)
    self.__spec = EnvSpec(action_space=self.action_space,
                          observation_space=self.observation_space)
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    # Create multi-task environment and sample tasks.
    env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(params['num_train_tasks'])
    test_env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_test_tasks('push-v1'))))
    test_env = test_env_sampler.sample(params['num_train_tasks'])

    runner = LocalRunner(snapshot_config)

    obs_dim = int(np.prod(env[0]().observation_space.shape))
    action_dim = int(np.prod(env[0]().action_space.shape))
    reward_dim = 1

    # Instantiate networks.
    encoder_in_dim = obs_dim + action_dim + reward_dim
    encoder_out_dim = params['latent_size'] * 2
    net_size = params['net_size']

    context_encoder = MLPEncoder(input_dim=encoder_in_dim,
                                 output_dim=encoder_out_dim,
                                 hidden_sizes=[200, 200, 200])

    # The policy and Q-functions act on observations augmented with the
    # latent context, so build an EnvSpec with the widened observation space.
    space_a = akro.Box(low=-1,
                       high=1,
                       shape=(obs_dim + params['latent_size'], ),
                       dtype=np.float32)
    space_b = akro.Box(low=-1, high=1, shape=(action_dim, ), dtype=np.float32)
    augmented_env = EnvSpec(space_a, space_b)

    qf1 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size, net_size, net_size])

    qf2 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size, net_size, net_size])

    obs_space = akro.Box(low=-1, high=1, shape=(obs_dim, ), dtype=np.float32)
    action_space = akro.Box(low=-1,
                            high=1,
                            shape=(params['latent_size'], ),
                            dtype=np.float32)
    vf_env = EnvSpec(obs_space, action_space)

    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    policy = TanhGaussianMLPPolicy2(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    context_conditioned_policy = ContextConditionedPolicy(
        latent_dim=params['latent_size'],
        context_encoder=context_encoder,
        policy=policy,
        use_ib=params['use_information_bottleneck'],
        use_next_obs=params['use_next_obs_in_context'],
    )

    pearlsac = PEARLSAC(
        env=env,
        test_env=test_env,
        policy=context_conditioned_policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        num_train_tasks=params['num_train_tasks'],
        num_test_tasks=params['num_test_tasks'],
        latent_dim=params['latent_size'],
        meta_batch_size=params['meta_batch_size'],
        num_steps_per_epoch=params['num_steps_per_epoch'],
        num_initial_steps=params['num_initial_steps'],
        num_tasks_sample=params['num_tasks_sample'],
        num_steps_prior=params['num_steps_prior'],
        num_extra_rl_steps_posterior=params['num_extra_rl_steps_posterior'],
        num_evals=params['num_evals'],
        num_steps_per_eval=params['num_steps_per_eval'],
        batch_size=params['batch_size'],
        embedding_batch_size=params['embedding_batch_size'],
        embedding_mini_batch_size=params['embedding_mini_batch_size'],
        max_path_length=params['max_path_length'],
        reward_scale=params['reward_scale'],
    )

    tu.set_gpu_mode(params['use_gpu'], gpu_id=0)
    if params['use_gpu']:
        pearlsac.to()

    runner.setup(algo=pearlsac,
                 env=env,
                 sampler_cls=PEARLSampler,
                 sampler_args=dict(max_path_length=params['max_path_length']))
    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])
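
The context encoder's output width is `params['latent_size'] * 2` because, with the information bottleneck enabled, PEARL-style agents treat the two halves of that vector as the mean and variance of a Gaussian posterior over the latent context. A minimal sketch of that split in plain PyTorch follows; the tensor names and the single-sample batch are illustrative assumptions, not the project's implementation.

import torch
import torch.nn.functional as F

latent_size = 5
# Stand-in for one row of MLPEncoder output with width 2 * latent_size.
encoder_output = torch.randn(1, latent_size * 2)

# First half: posterior mean. Second half: variance, kept positive.
z_mean = encoder_output[..., :latent_size]
z_var = F.softplus(encoder_output[..., latent_size:])

posterior = torch.distributions.Normal(z_mean, torch.sqrt(z_var))
z = posterior.rsample()  # reparameterized sample of the latent context
assert z.shape == (1, latent_size)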
def run_metarl(env, test_env, seed, log_dir):
    """Create metarl model and training."""
    deterministic.set_seed(seed)

    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode='gap',
                                     snapshot_gap=10)
    runner = LocalRunner(snapshot_config)

    obs_dim = int(np.prod(env[0]().observation_space.shape))
    action_dim = int(np.prod(env[0]().action_space.shape))
    reward_dim = 1

    # Instantiate networks.
    encoder_in_dim = obs_dim + action_dim + reward_dim
    encoder_out_dim = params['latent_size'] * 2
    net_size = params['net_size']

    context_encoder = MLPEncoder(input_dim=encoder_in_dim,
                                 output_dim=encoder_out_dim,
                                 hidden_sizes=[200, 200, 200])

    space_a = akro.Box(low=-1,
                       high=1,
                       shape=(obs_dim + params['latent_size'], ),
                       dtype=np.float32)
    space_b = akro.Box(low=-1, high=1, shape=(action_dim, ), dtype=np.float32)
    augmented_env = EnvSpec(space_a, space_b)

    qf1 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size, net_size, net_size])

    qf2 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size, net_size, net_size])

    obs_space = akro.Box(low=-1, high=1, shape=(obs_dim, ), dtype=np.float32)
    action_space = akro.Box(low=-1,
                            high=1,
                            shape=(params['latent_size'], ),
                            dtype=np.float32)
    vf_env = EnvSpec(obs_space, action_space)

    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    policy = TanhGaussianMLPPolicy2(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    context_conditioned_policy = ContextConditionedPolicy(
        latent_dim=params['latent_size'],
        context_encoder=context_encoder,
        policy=policy,
        use_ib=params['use_information_bottleneck'],
        use_next_obs=params['use_next_obs_in_context'],
    )

    train_task_names = ML10.get_train_tasks()._task_names
    test_task_names = ML10.get_test_tasks()._task_names

    pearlsac = PEARLSAC(
        env=env,
        test_env=test_env,
        policy=context_conditioned_policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        num_train_tasks=params['num_train_tasks'],
        num_test_tasks=params['num_test_tasks'],
        latent_dim=params['latent_size'],
        meta_batch_size=params['meta_batch_size'],
        num_steps_per_epoch=params['num_steps_per_epoch'],
        num_initial_steps=params['num_initial_steps'],
        num_tasks_sample=params['num_tasks_sample'],
        num_steps_prior=params['num_steps_prior'],
        num_extra_rl_steps_posterior=params['num_extra_rl_steps_posterior'],
        num_evals=params['num_evals'],
        num_steps_per_eval=params['num_steps_per_eval'],
        batch_size=params['batch_size'],
        embedding_batch_size=params['embedding_batch_size'],
        embedding_mini_batch_size=params['embedding_mini_batch_size'],
        max_path_length=params['max_path_length'],
        reward_scale=params['reward_scale'],
        train_task_names=train_task_names,
        test_task_names=test_task_names,
    )

    tu.set_gpu_mode(params['use_gpu'], gpu_id=0)
    if params['use_gpu']:
        pearlsac.to()

    # Log to stdout, a CSV progress file, and TensorBoard via dowel.
    tabular_log_file = osp.join(log_dir, 'progress.csv')
    tensorboard_log_dir = osp.join(log_dir)
    dowel_logger.add_output(dowel.StdOutput())
    dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
    dowel_logger.add_output(dowel.TensorBoardOutput(tensorboard_log_dir))

    runner.setup(algo=pearlsac,
                 env=env,
                 sampler_cls=PEARLSampler,
                 sampler_args=dict(max_path_length=params['max_path_length']))
    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])

    dowel_logger.remove_all()

    return tabular_log_file
def test_module(self, reward_dim, latent_dim, hidden_sizes, updates):
    """Test all methods."""
    env_spec = TfEnv(DummyBoxEnv())
    latent_space = akro.Box(low=-1,
                            high=1,
                            shape=(latent_dim, ),
                            dtype=np.float32)

    # Add latent space to observation space to create a new space.
    augmented_obs_space = akro.Tuple(
        (env_spec.observation_space, latent_space))
    augmented_env_spec = EnvSpec(augmented_obs_space, env_spec.action_space)

    obs_dim = int(np.prod(env_spec.observation_space.shape))
    action_dim = int(np.prod(env_spec.action_space.shape))
    encoder_input_dim = obs_dim + action_dim + reward_dim
    encoder_output_dim = latent_dim * 2
    encoder_hidden_sizes = (3, 2, encoder_output_dim)

    context_encoder = RecurrentEncoder(input_dim=encoder_input_dim,
                                       output_dim=encoder_output_dim,
                                       hidden_nonlinearity=None,
                                       hidden_sizes=encoder_hidden_sizes,
                                       hidden_w_init=nn.init.ones_,
                                       output_w_init=nn.init.ones_)

    # Policy needs to be able to accept obs_dim + latent_dim as input dim.
    policy = GaussianMLPPolicy(env_spec=augmented_env_spec,
                               hidden_sizes=hidden_sizes,
                               hidden_nonlinearity=F.relu,
                               output_nonlinearity=None)

    module = ContextConditionedPolicy(latent_dim=latent_dim,
                                      context_encoder=context_encoder,
                                      policy=policy,
                                      use_ib=True,
                                      use_next_obs=False)

    # After resetting, the belief over z should be the unit-Gaussian prior.
    expected_shape = [1, latent_dim]
    module.reset_belief()
    assert torch.all(torch.eq(module.z_means, torch.zeros(expected_shape)))
    assert torch.all(torch.eq(module.z_vars, torch.ones(expected_shape)))

    module.sample_from_belief()
    assert all([a == b for a, b in zip(module.z.shape, expected_shape)])

    module.detach_z()
    assert module.z.requires_grad is False

    context_dict = {}
    context_dict['observation'] = np.ones(obs_dim)
    context_dict['action'] = np.ones(action_dim)
    context_dict['reward'] = np.ones(reward_dim)
    context_dict['next_observation'] = np.ones(obs_dim)

    for _ in range(updates):
        module.update_context(context_dict)
    assert torch.all(
        torch.eq(module._context, torch.ones(updates, encoder_input_dim)))

    context = torch.randn(1, 1, encoder_input_dim)
    module.infer_posterior(context)
    assert all([a == b for a, b in zip(module.z.shape, expected_shape)])

    t, b = 1, 2
    obs = torch.randn((t, b, obs_dim), dtype=torch.float32)
    policy_output, task_z_out = module.forward(obs, context)
    assert policy_output is not None
    expected_shape = [b, latent_dim]
    assert all([a == b for a, b in zip(task_z_out.shape, expected_shape)])

    obs = torch.randn(obs_dim)
    action = module.get_action(obs)
    assert len(action) == action_dim

    kl_div = module.compute_kl_div()
    assert kl_div != 0
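
The final assertion checks that `compute_kl_div` is non-zero once a posterior has been inferred from context. A minimal sketch of the quantity being tested, using plain `torch.distributions` and made-up posterior parameters for illustration (the numbers and variable names are assumptions, not values from the module under test):

import torch
from torch.distributions import Normal
from torch.distributions.kl import kl_divergence

# Posterior inferred from context vs. the unit-Gaussian prior used at reset.
posterior = Normal(torch.tensor([0.3, -0.1]), torch.tensor([0.8, 1.2]))
prior = Normal(torch.zeros(2), torch.ones(2))

kl = kl_divergence(posterior, prior).sum()
assert kl > 0  # non-zero whenever the posterior differs from the prior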
def test_pickleable(self):
    env_spec = EnvSpec(akro.Box(-1, 1, (1, )), akro.Box(-2, 2, (2, )))
    round_trip = pickle.loads(pickle.dumps(env_spec))
    assert round_trip
    assert round_trip.action_space == env_spec.action_space
    assert round_trip.observation_space == env_spec.observation_space