class BitFlippingEnv(gym.Env):
    """
    Bit-flipping environment: https://arxiv.org/abs/1707.01495

    * The environment has an n-bit state.
    * The initial state and the goal state are selected randomly.
    * An action is one of 0, ..., n-1 and flips the corresponding bit.
    * The reward is 0 if state == goal, otherwise -1 (sparse binary reward).

    Simple RL algorithms tend to fail for large ``n`` (e.g. ``n > 40``).
    """

    def __init__(self, n):
        seeds = np.random.SeedSequence().spawn(3)
        self.np_random = np.random.default_rng(seeds[0])
        self.observation_space = Box(low=0, high=1, shape=(n,), dtype=int)
        self.action_space = Discrete(n)
        # Seed each space from its own child SeedSequence. Using ``.entropy`` here
        # would hand both spaces the same integer, since spawned children share the
        # parent's entropy.
        self.observation_space.seed(int(seeds[1].generate_state(1)[0]))
        self.action_space.seed(int(seeds[2].generate_state(1)[0]))

    def step(self, action):
        action = int(action)
        self.bit[action] = 1 - self.bit[action]
        done = (self.bit == self.goal).all()
        rew = 0 if done else -1
        return self.bit.copy(), rew, done, {}

    def reset(self):
        self.bit = self.np_random.integers(low=0, high=1, size=self.action_space.n, endpoint=True, dtype=int)
        self.goal = self.np_random.integers(low=0, high=1, size=self.action_space.n, endpoint=True, dtype=int)
        return self.bit.copy()
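# A minimal usage sketch (not part of the original snippet; assumes the same numpy /
# gym imports as the class above): roll out a random policy in BitFlippingEnv to see
# the sparse reward in action. The reward stays at -1 until the goal bit-string is matched.
if __name__ == "__main__":
    env = BitFlippingEnv(n=8)
    obs = env.reset()
    total_reward, done = 0, False
    for _ in range(20):
        obs, rew, done, info = env.step(env.action_space.sample())
        total_reward += rew
        if done:
            break
    print("solved:", bool(done), "return:", total_reward)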
def test_gumbel_softmax(self): """Tests the GumbelSoftmax ActionDistribution (tf + eager only).""" for fw, sess in framework_iterator(frameworks=("tf2", "tf", "tfe"), session=True): batch_size = 1000 num_categories = 5 input_space = Box(-1.0, 1.0, shape=(batch_size, num_categories)) input_space.seed(42) # Batch of size=n and deterministic. inputs = input_space.sample() gumbel_softmax = GumbelSoftmax(inputs, {}, temperature=1.0) expected = softmax(inputs) # Sample n times, expect always mean value (deterministic draw). out = gumbel_softmax.deterministic_sample() check(out, expected) # Batch of size=n and non-deterministic -> expect roughly that # the max-likelihood (argmax) ints are output (most of the time). inputs = input_space.sample() gumbel_softmax = GumbelSoftmax(inputs, {}, temperature=1.0) expected_mean = np.mean(np.argmax(inputs, -1)).astype(np.float32) outs = gumbel_softmax.sample() if sess: outs = sess.run(outs) check(np.mean(np.argmax(outs, -1)), expected_mean, rtol=0.08)
def test_backward() -> None: """ Test backward(). We just want to make sure that the gradient with respect to the i-th task loss is zero for all parameters in output head j != i, and is nonzero for all parameters in output head i. """ # Set up case. dim = SETTINGS["obs_dim"] + SETTINGS["num_tasks"] observation_subspace = Box(low=-np.inf, high=np.inf, shape=(SETTINGS["obs_dim"], )) observation_subspace.seed(DEFAULT_SETTINGS["seed"]) hidden_size = dim # Construct network. network = MultiTaskTrunkNetwork( input_size=dim, output_size=dim, num_tasks=SETTINGS["num_tasks"], num_shared_layers=SETTINGS["num_shared_layers"], num_task_layers=SETTINGS["num_task_layers"], hidden_size=hidden_size, downscale_last_layer=True, device=SETTINGS["device"], ) # Construct batch of observations concatenated with one-hot task vectors. obs, task_indices = get_obs_batch( batch_size=SETTINGS["num_processes"], obs_space=observation_subspace, num_tasks=SETTINGS["num_tasks"], ) # Make sure every task gets at least one process. assert set(task_indices.tolist()) == set(range(SETTINGS["num_tasks"])) # Get output of network. output = network(obs, task_indices) # Compute losses (we just compute the squared network output to keep it simple) and # test gradients. for i in range(SETTINGS["num_tasks"]): # Zero out gradients. network.zero_grad() # Compute loss over outputs from the current task. loss = torch.zeros(1) for process in range(obs.shape[0]): j = task_indices[process].item() if i == j: loss += torch.sum(output[process]**2) # Test gradients. loss.backward(retain_graph=True) check_gradients(network.trunk, nonzero=True) for j in range(SETTINGS["num_tasks"]): nonzero = j == i check_gradients(network.output_heads[j], nonzero=nonzero)
def test_categorical(self): batch_size = 10000 num_categories = 4 # Create categorical distribution with n categories. inputs_space = Box( -1.0, 2.0, shape=(batch_size, num_categories), dtype=np.float32) inputs_space.seed(42) values_space = Box( 0, num_categories - 1, shape=(batch_size, ), dtype=np.int32) values_space.seed(42) inputs = inputs_space.sample() for fw, sess in framework_iterator(session=True): # Create the correct distribution object. cls = JAXCategorical if fw == "jax" else Categorical if \ fw != "torch" else TorchCategorical categorical = cls(inputs, {}) # Do a stability test using extreme NN outputs to see whether # sampling and logp'ing result in NaN or +/-inf values. self._stability_test( cls, inputs_space.shape, fw=fw, sess=sess, bounds=(0, num_categories - 1)) # Batch of size=3 and deterministic (True). expected = np.transpose(np.argmax(inputs, axis=-1)) # Sample, expect always max value # (max likelihood for deterministic draw). out = categorical.deterministic_sample() check(out, expected) # Batch of size=3 and non-deterministic -> expect roughly the mean. out = categorical.sample() check( np.mean(out) if fw == "jax" else tf.reduce_mean(out) if fw != "torch" else torch.mean(out.float()), 1.0, decimals=0) # Test log-likelihood outputs. probs = softmax(inputs) values = values_space.sample() out = categorical.logp(values if fw != "torch" else torch.Tensor(values)) expected = [] for i in range(batch_size): expected.append(np.sum(np.log(np.array(probs[i][values[i]])))) check(out, expected, decimals=4) # Test entropy outputs. out = categorical.entropy() expected_entropy = -np.sum(probs * np.log(probs), -1) check(out, expected_entropy)
def test_beta(self): input_space = Box(-2.0, 1.0, shape=(2000, 10)) input_space.seed(42) low, high = -1.0, 2.0 plain_beta_value_space = Box(0.0, 1.0, shape=(2000, 5)) plain_beta_value_space.seed(42) for fw, sess in framework_iterator(session=True): cls = TorchBeta if fw == "torch" else Beta inputs = input_space.sample() beta_distribution = cls(inputs, {}, low=low, high=high) inputs = beta_distribution.inputs if sess: inputs = sess.run(inputs) else: inputs = inputs.numpy() alpha, beta_ = np.split(inputs, 2, axis=-1) # Mean for a Beta distribution: 1 / [1 + (beta/alpha)] expected = (1.0 / (1.0 + beta_ / alpha)) * (high - low) + low # Sample n times, expect always mean value (deterministic draw). out = beta_distribution.deterministic_sample() check(out, expected, rtol=0.01) # Batch of size=n and non-deterministic -> expect roughly the mean. values = beta_distribution.sample() if sess: values = sess.run(values) else: values = values.numpy() self.assertTrue(np.max(values) <= high) self.assertTrue(np.min(values) >= low) check(np.mean(values), expected.mean(), decimals=1) # Test log-likelihood outputs (against scipy). inputs = input_space.sample() beta_distribution = cls(inputs, {}, low=low, high=high) inputs = beta_distribution.inputs if sess: inputs = sess.run(inputs) else: inputs = inputs.numpy() alpha, beta_ = np.split(inputs, 2, axis=-1) values = plain_beta_value_space.sample() values_scaled = values * (high - low) + low if fw == "torch": values_scaled = torch.Tensor(values_scaled) print(values_scaled) out = beta_distribution.logp(values_scaled) check( out, np.sum(np.log(beta.pdf(values, alpha, beta_)), -1), rtol=0.01)
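# Small numeric cross-check of the Beta-mean identity used in test_beta() above
# (illustrative values, not part of the original test): for a Beta(alpha, beta)
# distribution, mean = alpha / (alpha + beta) = 1 / (1 + beta / alpha), which the
# test then rescales to [low, high]. Assumes numpy and scipy.stats are available.
from scipy.stats import beta as beta_dist
alpha_, beta_ = 2.0, 3.0
assert np.isclose(beta_dist.mean(alpha_, beta_), 1.0 / (1.0 + beta_ / alpha_))  # both 0.4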
def test_forward() -> None: """ Test forward() when each task-specific output head multiplies the shared trunk output by some constant factor, and the task index is included in the input. """ # Set up case. dim = SETTINGS["obs_dim"] + SETTINGS["num_tasks"] observation_subspace = Box(low=-np.inf, high=np.inf, shape=(SETTINGS["obs_dim"], )) observation_subspace.seed(DEFAULT_SETTINGS["seed"]) hidden_size = dim # Construct network and set weights of each output head explicitly. We want to make # it so that f_i(x) = x * i + i (with broadcasted operations), where the i-th output # head is f_i. network = MultiTaskTrunkNetwork( input_size=dim, output_size=dim, num_tasks=SETTINGS["num_tasks"], num_shared_layers=SETTINGS["num_shared_layers"], num_task_layers=SETTINGS["num_task_layers"], hidden_size=hidden_size, downscale_last_layer=True, device=SETTINGS["device"], ) for i in range(SETTINGS["num_tasks"]): # Set weights. state_dict = network.output_heads[i].state_dict() state_dict["0.0.weight"] = torch.Tensor(i * np.identity(dim)) state_dict["0.0.bias"] = torch.Tensor(i * np.ones(dim)) network.output_heads[i].load_state_dict(state_dict) # Construct batch of observations concatenated with one-hot task vectors. obs, task_indices = get_obs_batch( batch_size=SETTINGS["num_processes"], obs_space=observation_subspace, num_tasks=SETTINGS["num_tasks"], ) # Get output of network. output = network(obs, task_indices) # Construct expected output of network. trunk_output = network.trunk(obs) expected_output_list = [] for i, task_index in enumerate(task_indices): expected_output_list.append(trunk_output[i] * task_index + task_index) expected_output = torch.stack(expected_output_list) # Test output of network. assert torch.allclose(output, expected_output)
def meta_forward_template( settings: Dict[str, Any], state_dict: Dict[str, torch.Tensor], splits_args: List[Dict[str, Any]], alpha: List[torch.Tensor], get_expected_output: Callable[[torch.Tensor, torch.Tensor], torch.Tensor], ) -> None: """ Test MetaSplittingNetwork.forward() correct computes network output. """ # Construct multi-task network. multitask_network = BaseMultiTaskSplittingNetwork( input_size=settings["input_size"], output_size=settings["output_size"], num_tasks=settings["num_tasks"], num_layers=settings["num_layers"], hidden_size=settings["hidden_size"], device=settings["device"], ) # Split the network according to `splits_args`. for split_args in splits_args: multitask_network.split(**split_args) # Load state dict. multitask_network.load_state_dict(state_dict) # Construct MetaSplittingNetwork from BaseMultiTaskSplittingNetwork. meta_network = MetaSplittingNetwork( multitask_network, num_test_tasks=settings["num_tasks"], device=settings["device"], ) # Set alpha weights of meta network. for layer in range(meta_network.num_layers): meta_network.alpha[layer].data = alpha[layer] # Construct batch of observations concatenated with one-hot task vectors. observation_subspace = Box(low=-np.inf, high=np.inf, shape=(settings["obs_dim"], )) observation_subspace.seed(settings["seed"]) obs, task_indices = get_obs_batch( batch_size=settings["num_processes"], obs_space=observation_subspace, num_tasks=settings["num_tasks"], ) # Get and test output of network. output = meta_network(obs, task_indices) expected_output = get_expected_output(obs, task_indices) assert torch.allclose(output, expected_output)
def grad_diffs_template(settings: Dict[str, Any], grad_type: str) -> None:
    """
    Test that `get_task_grad_diffs()` correctly computes the pairwise difference
    between task-specific gradients at each region.
    """

    # Set up case.
    dim = settings["obs_dim"] + settings["num_tasks"]
    observation_subspace = Box(low=-np.inf, high=np.inf, shape=(settings["obs_dim"],))
    observation_subspace.seed(DEFAULT_SETTINGS["seed"])
    hidden_size = dim

    # Construct network.
    network = BaseMultiTaskSplittingNetwork(
        input_size=dim,
        output_size=dim,
        num_tasks=settings["num_tasks"],
        num_layers=settings["num_layers"],
        hidden_size=hidden_size,
        device=settings["device"],
    )

    # Construct dummy task gradients.
    if grad_type == "zero":
        task_grads = torch.zeros(
            network.num_tasks, network.num_regions, network.max_region_size
        )
    elif grad_type == "rand_identical":
        task_grads = torch.rand(1, network.num_regions, network.max_region_size)
        task_grads = task_grads.expand(network.num_tasks, -1, -1)
    elif grad_type == "rand":
        task_grads = torch.rand(
            network.num_tasks, network.num_regions, network.max_region_size
        )
    else:
        raise NotImplementedError

    # Compute pairwise differences of task gradients.
    task_grad_diffs = network.get_task_grad_diffs(task_grads)

    # Check computed differences.
    for task1, task2 in product(range(network.num_tasks), range(network.num_tasks)):
        for region in range(network.num_regions):
            expected_diff = torch.sum(
                torch.pow(task_grads[task1, region] - task_grads[task2, region], 2)
            )
            assert torch.allclose(task_grad_diffs[task1, task2, region], expected_diff)
def test_forward_shared() -> None: """ Test forward() when all regions of the splitting network are fully shared. The function computed by the network should be f(x) = 3 * tanh(2 * tanh(x + 1) + 2) + 3. """ # Set up case. dim = BASE_SETTINGS["obs_dim"] + BASE_SETTINGS["num_tasks"] observation_subspace = Box(low=-np.inf, high=np.inf, shape=(BASE_SETTINGS["obs_dim"], )) observation_subspace.seed(DEFAULT_SETTINGS["seed"]) hidden_size = dim # Construct network. network = BaseMultiTaskSplittingNetwork( input_size=dim, output_size=dim, num_tasks=BASE_SETTINGS["num_tasks"], num_layers=BASE_SETTINGS["num_layers"], hidden_size=hidden_size, device=BASE_SETTINGS["device"], ) # Set network weights. state_dict = network.state_dict() for i in range(BASE_SETTINGS["num_layers"]): weight_name = "regions.%d.0.0.weight" % i bias_name = "regions.%d.0.0.bias" % i state_dict[weight_name] = torch.Tensor((i + 1) * np.identity(dim)) state_dict[bias_name] = torch.Tensor((i + 1) * np.ones(dim)) network.load_state_dict(state_dict) # Construct batch of observations concatenated with one-hot task vectors. obs, task_indices = get_obs_batch( batch_size=BASE_SETTINGS["num_processes"], obs_space=observation_subspace, num_tasks=BASE_SETTINGS["num_tasks"], ) # Get output of network. output = network(obs, task_indices) # Computed expected output of network. expected_output = 3 * torch.tanh(2 * torch.tanh(obs + 1) + 2) + 3 # Test output of network. assert torch.allclose(output, expected_output)
class RandomTeacher(AbstractTeacher):
    def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub):
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)
        self.random_task_generator = Box(np.array(mins), np.array(maxs), dtype=np.float32)
        self.random_task_generator.seed(self.seed)

    def sample_task(self):
        return self.random_task_generator.sample()

    def non_exploratory_task_sampling(self):
        return {"task": self.sample_task(),
                "infos": {"bk_index": -1, "task_infos": None}}
def test_split_multiple() -> None: """ Test that split() correctly sets new parameters when we perform multiple splits. """ # Set up case. dim = BASE_SETTINGS["obs_dim"] + BASE_SETTINGS["num_tasks"] observation_subspace = Box(low=-np.inf, high=np.inf, shape=(BASE_SETTINGS["obs_dim"], )) observation_subspace.seed(DEFAULT_SETTINGS["seed"]) hidden_size = dim # Construct network. network = BaseMultiTaskSplittingNetwork( input_size=dim, output_size=dim, num_tasks=BASE_SETTINGS["num_tasks"], num_layers=BASE_SETTINGS["num_layers"], hidden_size=hidden_size, device=BASE_SETTINGS["device"], ) # Split the network at the first layer once and the last layer twice. network.split(0, 0, [0, 1], [2, 3]) network.split(2, 0, [0, 2], [1, 3]) network.split(2, 1, [1], [3]) # Check the parameters of the network. param_names = [name for name, param in network.named_parameters()] # Construct expected parameters of network. region_copies = {i: [0] for i in range(BASE_SETTINGS["num_layers"])} region_copies[0].extend([1]) region_copies[2].extend([1, 2]) expected_params = [] for region, copies in region_copies.items(): for copy in copies: expected_params.append("regions.%d.%d.0.weight" % (region, copy)) expected_params.append("regions.%d.%d.0.bias" % (region, copy)) # Test actual parameter names. assert set(param_names) == set(expected_params)
class RandomTeacher():
    def __init__(self, mins, maxs, seed=None):
        self.seed = seed
        if self.seed is None:  # `if not seed` would also discard a legitimate seed of 0
            self.seed = np.random.randint(42, 424242)
        np.random.seed(self.seed)

        self.mins = mins
        self.maxs = maxs
        self.random_task_generator = Box(np.array(mins), np.array(maxs), dtype=np.float32)
        self.random_task_generator.seed(self.seed)

    def update(self, task, competence):
        pass

    def sample_task(self):
        return self.random_task_generator.sample()

    def dump(self, dump_dict):
        return dump_dict
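# Hedged usage sketch for the plain RandomTeacher above (bounds and loop are
# illustrative, not from the original code): tasks are drawn uniformly from the
# Box [mins, maxs], and update() is a no-op for this teacher.
teacher = RandomTeacher(mins=[0.0, -1.0], maxs=[1.0, 1.0], seed=7)
for _ in range(3):
    task = teacher.sample_task()          # float32 vector inside the box
    teacher.update(task, competence=0.0)  # ignored by the random teacher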
def test_seed_Dict(): test_space = Dict( { "a": Box(low=0, high=1, shape=(3, 3)), "b": Dict( { "b_1": Box(low=-100, high=100, shape=(2,)), "b_2": Box(low=-1, high=1, shape=(2,)), } ), "c": Discrete(5), } ) seed_dict = { "a": 0, "b": { "b_1": 1, "b_2": 2, }, "c": 3, } test_space.seed(seed_dict) # "Unpack" the dict sub-spaces into individual spaces a = Box(low=0, high=1, shape=(3, 3)) a.seed(0) b_1 = Box(low=-100, high=100, shape=(2,)) b_1.seed(1) b_2 = Box(low=-1, high=1, shape=(2,)) b_2.seed(2) c = Discrete(5) c.seed(3) for i in range(10): test_s = test_space.sample() a_s = a.sample() assert (test_s["a"] == a_s).all() b_1_s = b_1.sample() assert (test_s["b"]["b_1"] == b_1_s).all() b_2_s = b_2.sample() assert (test_s["b"]["b_2"] == b_2_s).all() c_s = c.sample() assert test_s["c"] == c_s
def test_multi_categorical(self): batch_size = 100 num_categories = 3 num_sub_distributions = 5 # Create 5 categorical distributions of 3 categories each. inputs_space = Box(-1.0, 2.0, shape=(batch_size, num_sub_distributions * num_categories)) inputs_space.seed(42) values_space = Box( 0, num_categories - 1, shape=(num_sub_distributions, batch_size), dtype=np.int32, ) values_space.seed(42) inputs = inputs_space.sample() input_lengths = [num_categories] * num_sub_distributions inputs_split = np.split(inputs, num_sub_distributions, axis=1) for fw, sess in framework_iterator(session=True): # Create the correct distribution object. cls = MultiCategorical if fw != "torch" else TorchMultiCategorical multi_categorical = cls(inputs, None, input_lengths) # Do a stability test using extreme NN outputs to see whether # sampling and logp'ing result in NaN or +/-inf values. self._stability_test( cls, inputs_space.shape, fw=fw, sess=sess, bounds=(0, num_categories - 1), extra_kwargs={"input_lens": input_lengths}, ) # Batch of size=3 and deterministic (True). expected = np.transpose(np.argmax(inputs_split, axis=-1)) # Sample, expect always max value # (max likelihood for deterministic draw). out = multi_categorical.deterministic_sample() check(out, expected) # Batch of size=3 and non-deterministic -> expect roughly the mean. out = multi_categorical.sample() check( tf.reduce_mean(out) if fw != "torch" else torch.mean(out.float()), 1.0, decimals=0, ) # Test log-likelihood outputs. probs = softmax(inputs_split) values = values_space.sample() out = multi_categorical.logp(values if fw != "torch" else [ torch.Tensor(values[i]) for i in range(num_sub_distributions) ]) # v in np.stack(values, 1)]) expected = [] for i in range(batch_size): expected.append( np.sum( np.log( np.array([ probs[j][i][values[j][i]] for j in range(num_sub_distributions) ])))) check(out, expected, decimals=4) # Test entropy outputs. out = multi_categorical.entropy() expected_entropy = -np.sum(np.sum(probs * np.log(probs), 0), -1) check(out, expected_entropy)
class ALPGMM(AbstractTeacher): def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub, gmm_fitness_func="aic", warm_start=False, nb_em_init=1, fit_rate=250, alp_max_size=None, alp_buffer_size=500, potential_ks=np.arange(2, 11, 1), random_task_ratio=0.2, nb_bootstrap=None, initial_dist=None): ''' Absolute Learning Progress - Gaussian Mixture Model (https://arxiv.org/abs/1910.07224). Args: gmm_fitness_func: Fitness criterion when selecting best GMM among range of GMMs varying in number of Gaussians. warm_start: Restart new fit by initializing with last fit nb_em_init: Number of Expectation-Maximization trials when fitting fit_rate: Number of episodes between two fit of the GMM alp_max_size: Maximum number of episodes stored alp_buffer_size: Maximal number of episodes to account for when computing ALP potential_ks: Range of number of Gaussians to try when fitting the GMM random_task_ratio: Ratio of randomly sampled tasks VS tasks sampling using GMM nb_bootstrap: Number of bootstrapping episodes, must be >= to fit_rate initial_dist: Initial Gaussian distribution. If None, bootstrap with random tasks ''' AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed) # Range of number of Gaussians to try when fitting the GMM self.potential_ks = potential_ks # Restart new fit by initializing with last fit self.warm_start = warm_start # Fitness criterion when selecting best GMM among range of GMMs varying in number of Gaussians. self.gmm_fitness_func = gmm_fitness_func # Number of Expectation-Maximization trials when fitting self.nb_em_init = nb_em_init # Number of episodes between two fit of the GMM self.fit_rate = fit_rate self.nb_bootstrap = nb_bootstrap if nb_bootstrap is not None else fit_rate # Number of bootstrapping episodes, must be >= to fit_rate self.initial_dist = initial_dist # Initial Gaussian distribution. If None, bootstrap with random tasks # Ratio of randomly sampled tasks VS tasks sampling using GMM self.random_task_ratio = random_task_ratio self.random_task_generator = Box(self.mins, self.maxs, dtype=np.float32) self.random_task_generator.seed(self.seed) # Maximal number of episodes to account for when computing ALP alp_max_size = alp_max_size alp_buffer_size = alp_buffer_size # Init ALP computer self.alp_computer = EmpiricalALPComputer(len(mins), max_size=alp_max_size, buffer_size=alp_buffer_size) self.tasks = [] self.alps = [] self.tasks_alps = [] # Init GMMs self.potential_gmms = [self.init_gmm(k) for k in self.potential_ks] self.gmm = None # Boring book-keeping self.bk = { 'weights': [], 'covariances': [], 'means': [], 'tasks_alps': [], 'tasks_lps': [], 'episodes': [], 'tasks_origin': [] } def init_gmm(self, nb_gaussians): ''' Init the GMM given the number of gaussians. ''' return GMM(n_components=nb_gaussians, covariance_type='full', random_state=self.seed, warm_start=self.warm_start, n_init=self.nb_em_init) def get_nb_gmm_params(self, gmm): ''' Assumes full covariance. 
See https://stats.stackexchange.com/questions/229293/the-number-of-parameters-in-gaussian-mixture-model ''' nb_gmms = gmm.get_params()['n_components'] d = len(self.mins) params_per_gmm = (d * d - d) / 2 + 2 * d + 1 return nb_gmms * params_per_gmm - 1 def episodic_update(self, task, reward, is_success): self.tasks.append(task) is_update_time = False # Compute corresponding ALP alp, lp = self.alp_computer.compute_alp(task, reward) self.alps.append(alp) # Concatenate task vector with ALP dimension self.tasks_alps.append(np.array(task.tolist() + [self.alps[-1]])) if len(self.tasks ) >= self.nb_bootstrap: # If initial bootstrapping is done if (len(self.tasks) % self.fit_rate) == 0: # Time to fit is_update_time = True # 1 - Retrieve last <fit_rate> (task, reward) pairs cur_tasks_alps = np.array(self.tasks_alps[-self.fit_rate:]) # 2 - Fit batch of GMMs with varying number of Gaussians self.potential_gmms = [ g.fit(cur_tasks_alps) for g in self.potential_gmms ] # 3 - Compute fitness and keep best GMM fitnesses = [] if self.gmm_fitness_func == 'bic': # Bayesian Information Criterion fitnesses = [ m.bic(cur_tasks_alps) for m in self.potential_gmms ] elif self.gmm_fitness_func == 'aic': # Akaike Information Criterion fitnesses = [ m.aic(cur_tasks_alps) for m in self.potential_gmms ] elif self.gmm_fitness_func == 'aicc': # Modified AIC n = self.fit_rate fitnesses = [] for l, m in enumerate(self.potential_gmms): k = self.get_nb_gmm_params(m) penalty = (2 * k * (k + 1)) / (n - k - 1) fitnesses.append(m.aic(cur_tasks_alps) + penalty) else: raise NotImplementedError exit(1) self.gmm = self.potential_gmms[np.argmin(fitnesses)] # book-keeping self.bk['weights'].append(self.gmm.weights_.copy()) self.bk['covariances'].append(self.gmm.covariances_.copy()) self.bk['means'].append(self.gmm.means_.copy()) self.bk['tasks_alps'] = self.tasks_alps self.bk['tasks_lps'].append(lp) self.bk['episodes'].append(len(self.tasks)) return is_update_time def sample_task(self): task_origin = None if len(self.tasks) < self.nb_bootstrap or self.random_state.random( ) < self.random_task_ratio or self.gmm is None: if self.initial_dist and len( self.tasks ) < self.nb_bootstrap: # bootstrap in initial dist # Expert bootstrap Gaussian task sampling new_task = self.random_state.multivariate_normal( self.initial_dist['mean'], self.initial_dist['variance']) new_task = np.clip(new_task, self.mins, self.maxs).astype(np.float32) task_origin = -2 # -2 = task originates from initial bootstrap gaussian sampling else: # Random task sampling new_task = self.random_task_generator.sample() task_origin = -1 # -1 = task originates from random sampling else: # ALP-based task sampling # 1 - Retrieve the mean ALP value of each Gaussian in the GMM self.alp_means = [] for pos, _, w in zip(self.gmm.means_, self.gmm.covariances_, self.gmm.weights_): self.alp_means.append(pos[-1]) # 2 - Sample Gaussian proportionally to its mean ALP idx = proportional_choice(self.alp_means, self.random_state, eps=0.0) task_origin = idx # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension new_task = self.random_state.multivariate_normal( self.gmm.means_[idx], self.gmm.covariances_[idx])[:-1] new_task = np.clip(new_task, self.mins, self.maxs).astype(np.float32) # boring book-keeping self.bk['tasks_origin'].append(task_origin) return new_task def is_non_exploratory_task_sampling_available(self): return self.gmm is not None def non_exploratory_task_sampling(self): # 1 - Retrieve the mean ALP value of each Gaussian in the GMM alp_means = [] for pos, _, w in 
zip(self.gmm.means_, self.gmm.covariances_, self.gmm.weights_): alp_means.append(pos[-1]) # 2 - Sample Gaussian proportionally to its mean ALP idx = proportional_choice(alp_means, self.random_state, eps=0.0) # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension new_task = self.random_state.multivariate_normal( self.gmm.means_[idx], self.gmm.covariances_[idx])[:-1] new_task = np.clip(new_task, self.mins, self.maxs).astype(np.float32) return { "task": new_task, "infos": { "bk_index": len(self.bk[list(self.bk.keys())[0]]) - 1, "task_infos": idx } }
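# Hand check of the parameter count used by ALPGMM.get_nb_gmm_params() above
# (a worked example, not part of the original code). For a full-covariance GMM in
# d dimensions, each component has d(d-1)/2 off-diagonal covariance terms, d
# variances, d means and 1 mixture weight; one weight is redundant overall because
# the weights sum to 1.
d, k = 3, 2  # dimensionality and number of Gaussians
params_per_component = (d * d - d) / 2 + 2 * d + 1   # 3 + 6 + 1 = 10
total_params = k * params_per_component - 1          # 2 * 10 - 1 = 19
assert total_params == 19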
def backward_template(settings: Dict[str, Any], splits_args: List[Dict[str, Any]]) -> None: """ Template to test that the backward() function correctly computes gradients. We don't actually compare the gradients against baseline values, instead we just check that the gradient of the loss for task i is non-zero for all copies that i is assigned to, and zero for all copies i isn't assigned to, for each i. To keep things simple, we define each task loss as the squared norm of the output for inputs from the given task. """ # Set up case. dim = settings["obs_dim"] + settings["num_tasks"] observation_subspace = Box(low=-np.inf, high=np.inf, shape=(settings["obs_dim"], )) observation_subspace.seed(DEFAULT_SETTINGS["seed"]) hidden_size = dim # Construct network. network = BaseMultiTaskSplittingNetwork( input_size=dim, output_size=dim, num_tasks=settings["num_tasks"], num_layers=settings["num_layers"], hidden_size=hidden_size, device=settings["device"], ) # Split the network according to `splits_args`. for split_args in splits_args: network.split(**split_args) # Re-initialize the new copies so different tasks will actually have different # corresponding functions. state_dict = network.state_dict() for region in range(network.num_regions): for copy in range(1, int(network.splitting_map.num_copies[region])): weight_name = "regions.%d.%d.0.weight" % (region, copy) bias_name = "regions.%d.%d.0.bias" % (region, copy) state_dict[weight_name] = torch.rand(state_dict[weight_name].shape) state_dict[bias_name] = torch.rand(state_dict[bias_name].shape) network.load_state_dict(state_dict) # Construct batch of observations concatenated with one-hot task vectors. obs, task_indices = get_obs_batch( batch_size=settings["num_processes"], obs_space=observation_subspace, num_tasks=settings["num_tasks"], ) # Get output of network and compute task losses. output = network(obs, task_indices) task_losses = {i: None for i in range(settings["num_tasks"])} for task in range(settings["num_tasks"]): for current_out, current_task in zip(output, task_indices): if current_task == task: if task_losses[task] is not None: task_losses[task] += torch.sum(current_out**2) else: task_losses[task] = torch.sum(current_out**2) # Test gradients. for task in range(settings["num_tasks"]): network.zero_grad() if task_losses[task] is None: continue task_losses[task].backward(retain_graph=True) for region in range(len(network.regions)): for copy in range(int(network.splitting_map.num_copies[region])): for param in network.regions[region][copy].parameters(): zero = torch.zeros(param.grad.shape) if network.splitting_map.copy[region, task] == copy: assert not torch.allclose(param.grad, zero) else: assert torch.allclose(param.grad, zero)
def test_actor_worker(config: Dict, seeds: List[np.random.SeedSequence]): """ Function to run an isolated actor workers and manually test it out in the cli. """ from gym.spaces import Box, Discrete from asrel.core.utils import get_actor_args_from_config, take_tensor_from_dict from asrel.core.workers.actor import ActorWorker import asrel.core.workers.events as events actor_args = get_actor_args_from_config(config["actor"]) print(f"Testing Actor Worker with args: {actor_args}") num_workers = actor_args.get("num_workers", 1) input_queue_len = 8 input_space = Box(-10, 10, (6, ), np.float32) input_space.seed(0) output_space = Discrete(3) print("Creating workers...") actor_input_queues = [ mp.Queue(maxsize=input_queue_len) for _ in range(num_workers) ] actor_shared_output_queue = mp.Queue(maxsize=num_workers * input_queue_len) actor_worker_seed_seqs = seeds["actor"].spawn(num_workers) actor_workers = [ ActorWorker( input_queue=actor_input_queues[idx], output_queue=actor_shared_output_queue, seed_seq=actor_worker_seed_seqs[idx], input_space=input_space, output_space=output_space, index=idx, **actor_args, ) for idx in range(num_workers) ] for worker in actor_workers: worker.start() try: while True: task = int( input( "0 - Choose Action, 1 - Sync Networks, 2 - Update Params: " )) if task == 0: worker_idx = int(input(" worker: ", )) num_obs = int(input(" # of obs: ")) obs = torch.tensor( [input_space.sample() for _ in range(num_obs)]).cuda() print(f" obs:\n{obs}") env_worker_idx = int(input(" env worker: ")) env_sub_idx = int(input(" subenv: ")) greedy = input(" greedy (y/n): ").lower() == "y" actor_input_queues[worker_idx].put({ "type": events.ACTOR_CHOOSE_ACTION_TASK, "observation": obs, "greedy": greedy, "env_idx": (env_worker_idx, env_sub_idx), }) out = actor_shared_output_queue.get() out_action = take_tensor_from_dict(out, "action") print(f"worker {worker_idx}:") print({**out, "action": out_action}) elif task == 1: state_dicts = json.loads(input("State Dictionaries: ")) for q in actor_input_queues: q.put({ "type": events.ACTOR_SYNC_NETWORKS_TASK, "state_dicts": state_dicts, }) elif task == 2: params = json.loads(input("Params: ")) for q in actor_input_queues: q.put({ "type": events.ACTOR_UPDATE_PARAMS_TASK, **params, }) except (KeyboardInterrupt, Exception) as e: print() print("Terminating workers...") for worker in actor_workers: worker.terminate() print(e) else: print("Closing worker...") for worker in actor_workers: worker.close() for worker in actor_workers: worker.join()
class ContinuousGridEnv(gym.Env):
    def __init__(self, r=None, size_x=4, size_y=4, T=50, random_born=False,
                 state_indices=None, random_act_prob=0.0, sigma=1.0,
                 terminal_states=None, seed=0, add_time=False, **kwargs):
        self.size_x = size_x
        self.size_y = size_y
        # Avoid a mutable default argument for the list of terminal states.
        self.terminal_states = terminal_states if terminal_states is not None else []
        self.r = r
        self.range_x = (0, size_x)
        self.range_y = (0, size_y)
        self.random_act_prob = random_act_prob
        self.sigma = sigma
        self.state_indices = state_indices
        self.T = T
        self.observation_space = Box(low=np.array([0, 0]), high=np.array([size_x, size_y]), dtype=np.float32)
        self.action_space = Box(low=np.array([-1, -1]), high=np.array([1, 1]), dtype=np.float32)
        self.seed(seed)
        self.action_space.seed(seed)
        self.random_born = random_born

    def set_reward_function(self, r):
        self.r = r

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self, n=1):
        if self.random_born:
            self.s = np.random.uniform((0, 0), (self.size_x, self.size_y), size=(n, 2))
        else:
            self.s = np.zeros((n, 2), dtype=np.float32)
        self.n = n
        self.t = 0
        return self.s.copy()

    def step(self, action):
        change_action_prob = (np.random.uniform(0, 1, size=(self.n)) < self.random_act_prob).reshape(-1, 1)
        action = change_action_prob * (action + self.sigma * np.random.randn(self.n, 2)) \
            + (1 - change_action_prob) * action
        self.s += action
        self.s[:, 0] = np.clip(self.s[:, 0], 0, self.size_x)
        self.s[:, 1] = np.clip(self.s[:, 1], 0, self.size_y)
        self.t += 1
        done = (self.t >= self.T)
        if self.r is None:  # for adv IRL
            r = np.zeros((self.n, ))
        else:  # for SMM IRL
            r = self.r(self.s)
        # Return an (empty) info dict rather than None, as expected by the gym API.
        return self.s.copy(), r, done, {}
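# Minimal rollout sketch for ContinuousGridEnv above (illustrative; with r=None the
# environment returns zero rewards, and states are batched over n=2 trajectories).
env = ContinuousGridEnv(size_x=4, size_y=4, T=5, seed=0)
states = env.reset(n=2)                                   # (2, 2) array of positions
for _ in range(env.T):
    actions = np.stack([env.action_space.sample() for _ in range(2)])
    states, rewards, done, info = env.step(actions)
print("final states:\n", states, "\ndone:", done)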
def test_forward_single() -> None: """ Test forward() when all regions of the splitting network are fully shared except one. The function computed by the network should be f(x) = 3 * tanh(2 * tanh(x + 1) + 2) + 3 for tasks 0 and 1 and f(x) = 3 * tanh(-2 * tanh(x + 1) - 2) + 3 for tasks 2 and 3. """ # Set up case. dim = BASE_SETTINGS["obs_dim"] + BASE_SETTINGS["num_tasks"] observation_subspace = Box(low=-np.inf, high=np.inf, shape=(BASE_SETTINGS["obs_dim"], )) observation_subspace.seed(DEFAULT_SETTINGS["seed"]) hidden_size = dim # Construct network. network = BaseMultiTaskSplittingNetwork( input_size=dim, output_size=dim, num_tasks=BASE_SETTINGS["num_tasks"], num_layers=BASE_SETTINGS["num_layers"], hidden_size=hidden_size, device=BASE_SETTINGS["device"], ) # Split the network at the second layer. Tasks 0 and 1 stay assigned to the original # copy and tasks 2 and 3 are assigned to the new copy. network.split(1, 0, [0, 1], [2, 3]) # Set network weights. state_dict = network.state_dict() for i in range(BASE_SETTINGS["num_layers"]): weight_name = "regions.%d.0.0.weight" % i bias_name = "regions.%d.0.0.bias" % i state_dict[weight_name] = torch.Tensor((i + 1) * np.identity(dim)) state_dict[bias_name] = torch.Tensor((i + 1) * np.ones(dim)) weight_name = "regions.1.1.0.weight" bias_name = "regions.1.1.0.bias" state_dict[weight_name] = torch.Tensor(-2 * np.identity(dim)) state_dict[bias_name] = torch.Tensor(-2 * np.ones(dim)) network.load_state_dict(state_dict) # Construct batch of observations concatenated with one-hot task vectors. obs, task_indices = get_obs_batch( batch_size=BASE_SETTINGS["num_processes"], obs_space=observation_subspace, num_tasks=BASE_SETTINGS["num_tasks"], ) # Get output of network. output = network(obs, task_indices) # Computed expected output of network. expected_output = torch.zeros(obs.shape) for i, (ob, task) in enumerate(zip(obs, task_indices)): if task in [0, 1]: expected_output[i] = 3 * torch.tanh(2 * torch.tanh(ob + 1) + 2) + 3 elif task in [2, 3]: expected_output[i] = 3 * torch.tanh(-2 * torch.tanh(ob + 1) - 2) + 3 else: raise NotImplementedError # Test output of network. assert torch.allclose(output, expected_output)
def gradients_template(settings: Dict[str, Any], splits_args: List[Dict[str, Any]]) -> None: """ Template to test that `get_task_grads()` correctly computes task-specific gradients at each region of the network. For simplicity we compute the loss as half of the squared norm of the output, and we make the following assumptions: each layer has the same size, the activation function is Tanh for each layer, and the final layer has no activation. """ # Set up case. dim = settings["obs_dim"] + settings["num_tasks"] observation_subspace = Box(low=-np.inf, high=np.inf, shape=(settings["obs_dim"], )) observation_subspace.seed(DEFAULT_SETTINGS["seed"]) hidden_size = dim # Construct network. network = BaseMultiTaskSplittingNetwork( input_size=dim, output_size=dim, num_tasks=settings["num_tasks"], num_layers=settings["num_layers"], hidden_size=hidden_size, device=settings["device"], ) # Split the network according to `splits_args`. for split_args in splits_args: network.split(**split_args) # Re-initialize the new copies so different tasks will actually have different # corresponding functions. state_dict = network.state_dict() for region in range(network.num_regions): for copy in range(1, int(network.splitting_map.num_copies[region])): weight_name = "regions.%d.%d.0.weight" % (region, copy) bias_name = "regions.%d.%d.0.bias" % (region, copy) state_dict[weight_name] = torch.rand(state_dict[weight_name].shape) state_dict[bias_name] = torch.rand(state_dict[bias_name].shape) network.load_state_dict(state_dict) # Register forward hooks to get activations later from each copy of each region. activation = {} def get_activation(name): def hook(model, ins, outs): activation[name] = outs.detach() return hook for region in range(network.num_regions): for copy in range(int(network.splitting_map.num_copies[region])): name = "regions.%d.%d" % (region, copy) network.regions[region][copy].register_forward_hook( get_activation(name)) # Construct batch of observations concatenated with one-hot task vectors. obs, task_indices = get_obs_batch( batch_size=settings["num_processes"], obs_space=observation_subspace, num_tasks=settings["num_tasks"], ) # Get output of network and compute task gradients. output = network(obs, task_indices) task_losses = torch.zeros(settings["num_tasks"]) for task in range(settings["num_tasks"]): for current_out, current_task in zip(output, task_indices): if current_task == task: task_losses[task] += 0.5 * torch.sum(current_out**2) task_grads = network.get_task_grads(task_losses) def get_task_activations(r, t, tasks): """ Helper function to get activations from specific regions. """ c = network.splitting_map.copy[r, t] copy_indices = network.splitting_map.copy[r, tasks] sorted_copy_indices, copy_permutation = torch.sort(copy_indices) sorted_tasks = tasks[copy_permutation] batch_indices = (sorted_copy_indices == c).nonzero().squeeze(-1) task_batch_indices = sorted_tasks[batch_indices] current_task_indices = (task_batch_indices == t).nonzero().squeeze(-1) activations = activation["regions.%d.%d" % (r, c)][current_task_indices] return activations # Compute expected gradients. state_dict = network.state_dict() expected_task_grads = torch.zeros( (settings["num_tasks"], network.num_regions, network.max_region_size)) for task in range(settings["num_tasks"]): # Get output from current task. task_input_indices = (task_indices == task).nonzero().squeeze(-1) task_output = output[task_input_indices] # Clear local gradients. 
local_grad = {} for region in reversed(range(network.num_regions)): # Get copy index and layer input. copy = network.splitting_map.copy[region, task] if region > 0: layer_input = get_task_activations(region - 1, task, task_indices) else: layer_input = obs[task_input_indices] # Compute local gradient first. if region == network.num_regions - 1: local_grad[region] = -task_output else: layer_output = get_task_activations(region, task, task_indices) local_grad[region] = torch.zeros(len(layer_output), dim) next_copy = network.splitting_map.copy[region + 1, task] weights = state_dict["regions.%d.%d.0.weight" % (region + 1, next_copy)] for i in range(dim): for j in range(dim): local_grad[region][:, i] += ( local_grad[region + 1][:, j] * weights[j, i]) local_grad[region] = local_grad[region] * (1 - layer_output**2) # Compute gradient from local gradients. grad = torch.zeros(dim, dim + 1) for i in range(dim): for j in range(dim): grad[i, j] = torch.sum(-local_grad[region][:, i] * layer_input[:, j]) grad[i, dim] = torch.sum(-local_grad[region][:, i]) # Rearrange weights and biases. Should be all weights, then all biases. weights = torch.reshape(grad[:, :-1], (-1, )) biases = torch.reshape(grad[:, -1], (-1, )) grad = torch.cat([weights, biases]) expected_task_grads[task, region, :len(grad)] = grad # Test gradients. assert torch.allclose(task_grads, expected_task_grads, atol=2e-5)
def meta_backward_template( settings: Dict[str, Any], splits_args: List[Dict[str, Any]], alpha: List[torch.Tensor], ) -> None: """ Template to test that the backward() function correctly computes gradients. We don't actually compare the gradients against baseline values, instead we just check that the gradients are non-zero for each of the alpha values and zero for the parameters in each region. """ # Construct multi-task network. multitask_network = BaseMultiTaskSplittingNetwork( input_size=settings["input_size"], output_size=settings["output_size"], num_tasks=settings["num_tasks"], num_layers=settings["num_layers"], hidden_size=settings["hidden_size"], device=settings["device"], ) # Split the network according to `splits_args`. for split_args in splits_args: multitask_network.split(**split_args) # Construct MetaSplittingNetwork from BaseMultiTaskSplittingNetwork. meta_network = MetaSplittingNetwork( multitask_network, num_test_tasks=settings["num_tasks"], device=settings["device"], ) # Set alpha weights of meta network. for layer in range(meta_network.num_layers): meta_network.alpha[layer].data = alpha[layer] # Construct batch of observations concatenated with one-hot task vectors. observation_subspace = Box(low=-np.inf, high=np.inf, shape=(settings["obs_dim"], )) observation_subspace.seed(settings["seed"]) obs, task_indices = get_obs_batch( batch_size=settings["num_processes"], obs_space=observation_subspace, num_tasks=settings["num_tasks"], ) # Get output, compute a dummy loss, and perform backwards call. output = meta_network(obs, task_indices) loss = torch.sum(output**2) meta_network.zero_grad() loss.backward() # Check that gradients of alpha values are non-zero. batch_tasks = task_indices.tolist() for layer in range(meta_network.num_layers): for task in range(meta_network.num_test_tasks): grad = meta_network.alpha[layer].grad[:, task] assert grad is not None if task in batch_tasks: assert torch.all(grad != 0) else: assert torch.all(grad == 0) # Check that gradients of regions are zero. for region in range(meta_network.num_regions): for copy in range(int(meta_network.splitting_map.num_copies[region])): for param in meta_network.regions[region][copy].parameters(): assert param.grad is None
class ADR(AbstractTeacher): def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub, step_size, max_reward_thr, min_reward_thr, initial_dist=None, boundary_sampling_p=0.5, queue_len=10, scale_reward=False): ''' Automatic Domain Randomization (https://arxiv.org/abs/1910.07113). Args: step_size: Size of the growth (or decrease) of a bound at update max_reward_thr: Upper reward threshold used to inflate distribution min_reward_thr: Lowers reward threshold used to deflate distribution initial_dist: The mean of this initial distribution is used as the initial task used by ADR boundary_sampling_p: Probability to sample a dimension at a bound queue_len: Size of the queue associated to each bound. Once reached, ADR increases or decreases the bound. ''' AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed) self.nb_dims = len(self.mins) # Boundary sampling probability p_r self.bound_sampling_p = boundary_sampling_p # ADR step size self.step_size = step_size # Max reward threshold, sampling distribution inflates if mean reward above this self.max_reward_threshold = max_reward_thr if scale_reward: self.max_reward_threshold = np.interp( self.max_reward_threshold, (self.env_reward_lb, self.env_reward_ub), (0, 1)) # Min reward threshold, sampling distribution deflates if mean reward below this self.min_reward_threshold = min_reward_thr if scale_reward: self.min_reward_threshold = np.interp( self.min_reward_threshold, (self.env_reward_lb, self.env_reward_ub), (0, 1)) # max queue length self.window_len = queue_len # Set initial task space to predefined calibrated task initial_mean, initial_variance = self.get_or_create_dist(initial_dist, mins, maxs, subspace=True) # Single task version (as the original paper) self.cur_mins = initial_mean self.cur_maxs = initial_mean self.cur_mins = np.array(self.cur_mins, dtype=np.float32) # current min bounds self.cur_maxs = np.array(self.cur_maxs, dtype=np.float32) # current max bounds self.task_space = Box(self.cur_mins, self.cur_maxs, dtype=np.float32) self.task_space.seed(self.seed) # Init queues, one per task space dimension self.min_queues = [ deque(maxlen=self.window_len) for _ in range(self.nb_dims) ] self.max_queues = [ deque(maxlen=self.window_len) for _ in range(self.nb_dims) ] # Boring book-keeping self.episode_nb = 0 self.bk = { 'task_space': [(self.cur_mins.copy(), self.cur_maxs.copy())], 'episodes': [] } def episodic_update(self, task, reward, is_success): self.episode_nb += 1 # check for updates for i, (min_q, max_q, cur_min, cur_max) in enumerate( zip(self.min_queues, self.max_queues, self.cur_mins, self.cur_maxs)): if task[i] == cur_min: # if the proposed task has the i^th dimension set to min boundary min_q.append(reward) if len(min_q) == self.window_len: if np.mean( min_q ) >= self.max_reward_threshold: # decrease min boundary (inflate sampling space) self.cur_mins[i] = max( self.cur_mins[i] - self.step_size, self.mins[i]) elif np.mean( min_q ) <= self.min_reward_threshold: # increase min boundary (deflate sampling space) self.cur_mins[i] = min( self.cur_mins[i] + self.step_size, self.cur_maxs[i]) self.min_queues[i] = deque( maxlen=self.window_len) # reset queue if task[i] == cur_max: # if the proposed task has the i^th dimension set to max boundary max_q.append(reward) if len(max_q ) == self.window_len: # queue is full, time to update if np.mean( max_q ) >= self.max_reward_threshold: # increase max boundary self.cur_maxs[i] = min( self.cur_maxs[i] + self.step_size, self.maxs[i]) elif np.mean( max_q ) <= 
self.min_reward_threshold: # decrease max boundary self.cur_maxs[i] = max( self.cur_maxs[i] - self.step_size, self.cur_mins[i]) self.max_queues[i] = deque( maxlen=self.window_len) # reset queue prev_cur_mins, prev_cur_maxs = self.bk['task_space'][-1] if (prev_cur_mins != self.cur_mins).any() or ( prev_cur_maxs != self.cur_maxs).any(): # were boundaries changed ? self.task_space = Box(self.cur_mins, self.cur_maxs, dtype=np.float32) self.task_space.seed(self.seed) # book-keeping only if boundaries were updates self.bk['task_space'].append( (self.cur_mins.copy(), self.cur_maxs.copy())) self.bk['episodes'].append(self.episode_nb) def sample_task(self): new_task = self.non_exploratory_task_sampling()["task"] if self.random_state.random( ) < self.bound_sampling_p: # set random dimension to min or max bound idx = self.random_state.randint(0, self.nb_dims) is_min_max_capped = np.array([ self.cur_mins[idx] == self.mins[idx], self.cur_maxs[idx] == self.maxs[idx] ]) if not is_min_max_capped.all( ): # both min and max bounds can increase, choose extremum randomly if self.random_state.random( ) < 0.5: # skip min bound if already new_task[idx] = self.cur_mins[idx] else: new_task[idx] = self.cur_maxs[idx] elif not is_min_max_capped[0]: new_task[idx] = self.cur_mins[idx] elif not is_min_max_capped[1]: new_task[idx] = self.cur_maxs[idx] return new_task def non_exploratory_task_sampling(self): return { "task": self.task_space.sample(), "infos": { "bk_index": len(self.bk[list(self.bk.keys())[0]]) - 1, "task_infos": None } }
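# Illustrative sketch of the ADR boundary update implemented in episodic_update()
# above (numbers are made up): once the queue attached to a bound is full, its mean
# reward decides whether that bound is pushed outwards (inflating the sampling space)
# or pulled back in (deflating it).
step_size, max_thr, min_thr = 0.1, 0.8, 0.2
cur_max, absolute_max, cur_min = 0.5, 1.0, 0.0
max_queue = [0.9, 0.85, 0.95]                 # rewards observed at the current max bound
mean_reward = float(np.mean(max_queue))
if mean_reward >= max_thr:                    # agent does well at the boundary -> inflate
    cur_max = min(cur_max + step_size, absolute_max)
elif mean_reward <= min_thr:                  # agent struggles at the boundary -> deflate
    cur_max = max(cur_max - step_size, cur_min)
assert abs(cur_max - 0.6) < 1e-8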
class ALPGMM(): def __init__(self, mins, maxs, seed=None, params=dict()): self.seed = seed if not seed: self.seed = np.random.randint(42, 424242) np.random.seed(self.seed) # Task space boundaries self.mins = np.array(mins) self.maxs = np.array(maxs) # Range of number of Gaussians to try when fitting the GMM self.potential_ks = np.arange( 2, 11, 1) if "potential_ks" not in params else params["potential_ks"] # Restart new fit by initializing with last fit self.warm_start = False if "warm_start" not in params else params[ "warm_start"] # Fitness criterion when selecting best GMM among range of GMMs varying in number of Gaussians. self.gmm_fitness_fun = "aic" if "gmm_fitness_fun" not in params else params[ "gmm_fitness_fun"] # Number of Expectation-Maximization trials when fitting self.nb_em_init = 1 if "nb_em_init" not in params else params[ 'nb_em_init'] # Number of episodes between two fit of the GMM self.fit_rate = 250 if "fit_rate" not in params else params['fit_rate'] self.nb_random = self.fit_rate # Number of bootstrapping episodes # Ratio of randomly sampled tasks VS tasks sampling using GMM self.random_task_ratio = 0.2 if "random_task_ratio" not in params else params[ "random_task_ratio"] self.random_task_generator = Box(self.mins, self.maxs, dtype=np.float32) self.random_task_generator.seed(self.seed) # Maximal number of episodes to account for when computing ALP alp_max_size = None if "alp_max_size" not in params else params[ "alp_max_size"] alp_buffer_size = 500 if "alp_buffer_size" not in params else params[ "alp_buffer_size"] # Init ALP computer self.alp_computer = EmpiricalALPComputer(len(mins), max_size=alp_max_size, buffer_size=alp_buffer_size) self.tasks = [] self.alps = [] self.tasks_alps = [] # Init GMMs self.potential_gmms = [self.init_gmm(k) for k in self.potential_ks] self.gmm = None # Boring book-keeping self.bk = { 'weights': [], 'covariances': [], 'means': [], 'tasks_alps': [], 'tasks_lps': [], 'episodes': [], 'tasks_origin': [] } def init_gmm(self, nb_gaussians): return GMM(n_components=nb_gaussians, covariance_type='full', random_state=self.seed, warm_start=self.warm_start, n_init=self.nb_em_init) def get_nb_gmm_params(self, gmm): # assumes full covariance # see https://stats.stackexchange.com/questions/229293/the-number-of-parameters-in-gaussian-mixture-model nb_gmms = gmm.get_params()['n_components'] d = len(self.mins) params_per_gmm = (d * d - d) / 2 + 2 * d + 1 return nb_gmms * params_per_gmm - 1 def update(self, task, reward): self.tasks.append(task) is_update_time = False # Compute corresponding ALP alp, lp = self.alp_computer.compute_alp(task, reward) self.alps.append(alp) # Concatenate task vector with ALP dimension self.tasks_alps.append(np.array(task.tolist() + [self.alps[-1]])) if len(self.tasks ) >= self.nb_random: # If initial bootstrapping is done if (len(self.tasks) % self.fit_rate) == 0: # Time to fit is_update_time = True # 1 - Retrieve last <fit_rate> (task, reward) pairs cur_tasks_alps = np.array(self.tasks_alps[-self.fit_rate:]) # 2 - Fit batch of GMMs with varying number of Gaussians self.potential_gmms = [ g.fit(cur_tasks_alps) for g in self.potential_gmms ] # 3 - Compute fitness and keep best GMM fitnesses = [] if self.gmm_fitness_fun == 'bic': # Bayesian Information Criterion fitnesses = [ m.bic(cur_tasks_alps) for m in self.potential_gmms ] elif self.gmm_fitness_fun == 'aic': # Akaike Information Criterion fitnesses = [ m.aic(cur_tasks_alps) for m in self.potential_gmms ] elif self.gmm_fitness_fun == 'aicc': # Modified AIC n = self.fit_rate 
fitnesses = [] for l, m in enumerate(self.potential_gmms): k = self.get_nb_gmm_params(m) penalty = (2 * k * (k + 1)) / (n - k - 1) fitnesses.append(m.aic(cur_tasks_alps) + penalty) else: raise NotImplementedError exit(1) self.gmm = self.potential_gmms[np.argmin(fitnesses)] # book-keeping self.bk['weights'].append(self.gmm.weights_.copy()) self.bk['covariances'].append(self.gmm.covariances_.copy()) self.bk['means'].append(self.gmm.means_.copy()) self.bk['tasks_alps'] = self.tasks_alps self.bk['tasks_lps'].append(lp) self.bk['episodes'].append(len(self.tasks)) return is_update_time def sample_task(self): task_origin = None if (len(self.tasks) < self.nb_random) or (np.random.random() < self.random_task_ratio): # Random task sampling new_task = self.random_task_generator.sample() task_origin = -1 # -1 = task originates from random sampling else: # ALP-based task sampling # 1 - Retrieve the mean ALP value of each Gaussian in the GMM self.alp_means = [] for pos, _, w in zip(self.gmm.means_, self.gmm.covariances_, self.gmm.weights_): self.alp_means.append(pos[-1]) # 2 - Sample Gaussian proportionally to its mean ALP idx = proportional_choice(self.alp_means, eps=0.0) task_origin = idx # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension new_task = np.random.multivariate_normal( self.gmm.means_[idx], self.gmm.covariances_[idx])[:-1] new_task = np.clip(new_task, self.mins, self.maxs).astype(np.float32) # boring book-keeping self.bk['tasks_origin'].append(task_origin) return new_task def dump(self, dump_dict): dump_dict.update(self.bk) return dump_dict
def test_squashed_gaussian(self): """Tests the SquashedGaussian ActionDistribution for all frameworks.""" input_space = Box(-2.0, 2.0, shape=(2000, 10)) input_space.seed(42) low, high = -2.0, 1.0 for fw, sess in framework_iterator(session=True): cls = SquashedGaussian if fw != "torch" else TorchSquashedGaussian # Do a stability test using extreme NN outputs to see whether # sampling and logp'ing result in NaN or +/-inf values. self._stability_test(cls, input_space.shape, fw=fw, sess=sess, bounds=(low, high)) # Batch of size=n and deterministic. inputs = input_space.sample() means, _ = np.split(inputs, 2, axis=-1) squashed_distribution = cls(inputs, {}, low=low, high=high) expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low # Sample n times, expect always mean value (deterministic draw). out = squashed_distribution.deterministic_sample() check(out, expected) # Batch of size=n and non-deterministic -> expect roughly the mean. inputs = input_space.sample() means, log_stds = np.split(inputs, 2, axis=-1) squashed_distribution = cls(inputs, {}, low=low, high=high) expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low values = squashed_distribution.sample() if sess: values = sess.run(values) else: values = values.numpy() self.assertTrue(np.max(values) <= high) self.assertTrue(np.min(values) >= low) check(np.mean(values), expected.mean(), decimals=1) # Test log-likelihood outputs. sampled_action_logp = squashed_distribution.logp( values if fw != "torch" else torch.Tensor(values)) if sess: sampled_action_logp = sess.run(sampled_action_logp) else: sampled_action_logp = sampled_action_logp.numpy() # Convert to parameters for distr. stds = np.exp( np.clip(log_stds, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT)) # Unsquash values, then get log-llh from regular gaussian. # atanh_in = np.clip((values - low) / (high - low) * 2.0 - 1.0, # -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER) normed_values = (values - low) / (high - low) * 2.0 - 1.0 save_normed_values = np.clip(normed_values, -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER) unsquashed_values = np.arctanh(save_normed_values) log_prob_unsquashed = np.sum( np.log(norm.pdf(unsquashed_values, means, stds)), -1) log_prob = log_prob_unsquashed - np.sum( np.log(1 - np.tanh(unsquashed_values)**2), axis=-1) check(np.sum(sampled_action_logp), np.sum(log_prob), rtol=0.05) # NN output. means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0], [-0.1, -0.2, -0.3, -0.4, -1.0]]) log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]]) squashed_distribution = cls( inputs=np.concatenate([means, log_stds], axis=-1), model={}, low=low, high=high, ) # Convert to parameters for distr. stds = np.exp(log_stds) # Values to get log-likelihoods for. values = np.array([[0.9, 0.2, 0.4, -0.1, -1.05], [-0.9, -0.2, 0.4, -0.1, -1.05]]) # Unsquash values, then get log-llh from regular gaussian. unsquashed_values = np.arctanh((values - low) / (high - low) * 2.0 - 1.0) log_prob_unsquashed = np.sum( np.log(norm.pdf(unsquashed_values, means, stds)), -1) log_prob = log_prob_unsquashed - np.sum( np.log(1 - np.tanh(unsquashed_values)**2), axis=-1) outs = squashed_distribution.logp( values if fw != "torch" else torch.Tensor(values)) if sess: outs = sess.run(outs) check(outs, log_prob, decimals=4)
def test_diag_gaussian(self): """Tests the DiagGaussian ActionDistribution for all frameworks.""" input_space = Box(-2.0, 1.0, shape=(2000, 10)) input_space.seed(42) for fw, sess in framework_iterator(session=True): cls = DiagGaussian if fw != "torch" else TorchDiagGaussian # Do a stability test using extreme NN outputs to see whether # sampling and logp'ing result in NaN or +/-inf values. self._stability_test(cls, input_space.shape, fw=fw, sess=sess) # Batch of size=n and deterministic. inputs = input_space.sample() means, _ = np.split(inputs, 2, axis=-1) diag_distribution = cls(inputs, {}) expected = means # Sample n times, expect always mean value (deterministic draw). out = diag_distribution.deterministic_sample() check(out, expected) # Batch of size=n and non-deterministic -> expect roughly the mean. inputs = input_space.sample() means, log_stds = np.split(inputs, 2, axis=-1) diag_distribution = cls(inputs, {}) expected = means values = diag_distribution.sample() if sess: values = sess.run(values) else: values = values.numpy() check(np.mean(values), expected.mean(), decimals=1) # Test log-likelihood outputs. sampled_action_logp = diag_distribution.logp( values if fw != "torch" else torch.Tensor(values)) if sess: sampled_action_logp = sess.run(sampled_action_logp) else: sampled_action_logp = sampled_action_logp.numpy() # NN output. means = np.array( [[0.1, 0.2, 0.3, 0.4, 50.0], [-0.1, -0.2, -0.3, -0.4, -1.0]], dtype=np.float32, ) log_stds = np.array( [[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]], dtype=np.float32, ) diag_distribution = cls(inputs=np.concatenate([means, log_stds], axis=-1), model={}) # Convert to parameters for distr. stds = np.exp(log_stds) # Values to get log-likelihoods for. values = np.array([[0.9, 0.2, 0.4, -0.1, -1.05], [-0.9, -0.2, 0.4, -0.1, -1.05]]) # get log-llh from regular gaussian. log_prob = np.sum(np.log(norm.pdf(values, means, stds)), -1) outs = diag_distribution.logp( values if fw != "torch" else torch.Tensor(values)) if sess: outs = sess.run(outs) check(outs, log_prob, decimals=4)
def test_forward_obs_only() -> None: """ Test forward() when each task-specific output head multiplies the shared trunk output by some constant factor, and the task index is not included in the input. """ # Set up case. dim = SETTINGS["obs_dim"] observation_subspace = Box(low=-np.inf, high=np.inf, shape=(SETTINGS["obs_dim"], )) observation_subspace.seed(DEFAULT_SETTINGS["seed"]) hidden_size = dim num_shared_layers = 1 include_task_index = False # Construct network and set weights of each output head explicitly. We want to make # it so that each layer in the shared trunk computes an identity function (plus the # nonlinearity), f_i(x) = x * i + i (with broadcasted operations), where the i-th # output head is f_i. network = MultiTaskTrunkNetwork( input_size=dim, output_size=dim, num_tasks=SETTINGS["num_tasks"], num_shared_layers=num_shared_layers, num_task_layers=SETTINGS["num_task_layers"], hidden_size=hidden_size, downscale_last_layer=True, device=SETTINGS["device"], ) # Set shared trunk weights. trunk_state_dict = network.trunk.state_dict() trunk_state_dict["0.0.weight"] = torch.Tensor(np.identity(hidden_size)) trunk_state_dict["0.0.bias"] = torch.zeros(hidden_size) network.trunk.load_state_dict(trunk_state_dict) # Set task-specific weights. for i in range(SETTINGS["num_tasks"]): # Set weights. state_dict = network.output_heads[i].state_dict() state_dict["0.0.weight"] = torch.Tensor(i * np.identity(hidden_size)) state_dict["0.0.bias"] = i * torch.ones(hidden_size) network.output_heads[i].load_state_dict(state_dict) # Construct batch of observations concatenated with one-hot task vectors. obs, task_indices = get_obs_batch( batch_size=SETTINGS["num_processes"], obs_space=observation_subspace, num_tasks=SETTINGS["num_tasks"], ) obs_only = obs[:, :dim] # Get output of network. output = network(obs_only, task_indices) # Construct expected action distribution of network. expected_output_list = [] for i, task_index in enumerate(task_indices): expected_output_list.append( torch.tanh(obs_only[i]) * task_index + task_index) expected_output = torch.stack(expected_output_list) # Test output of network. assert torch.allclose(output, expected_output)
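# Standalone illustration (hypothetical layer layout, not the network class above) of
# the weight-setting trick used in test_forward_obs_only: loading c * I as the weight
# and c * 1 as the bias of a Linear layer followed by Tanh makes the layer compute
# tanh(c * x + c), which is what the expected outputs rely on.
import numpy as np
import torch

dim, c = 4, 2.0
layer = torch.nn.Sequential(torch.nn.Linear(dim, dim), torch.nn.Tanh())
with torch.no_grad():
    layer[0].weight.copy_(torch.tensor(c * np.identity(dim), dtype=torch.float32))
    layer[0].bias.copy_(c * torch.ones(dim))
x = torch.randn(3, dim)
assert torch.allclose(layer(x), torch.tanh(c * x + c))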
def test_multi_action_distribution(self): """Tests the MultiActionDistribution (across all frameworks).""" batch_size = 1000 input_space = Tuple([ Box(-10.0, 10.0, shape=(batch_size, 4)), Box( -2.0, 2.0, shape=( batch_size, 6, ), ), Dict({"a": Box(-1.0, 1.0, shape=(batch_size, 4))}), ]) input_space.seed(42) std_space = Box( -0.05, 0.05, shape=( batch_size, 3, ), ) std_space.seed(42) low, high = -1.0, 1.0 value_space = Tuple([ Box(0, 3, shape=(batch_size, ), dtype=np.int32), Box(-2.0, 2.0, shape=(batch_size, 3), dtype=np.float32), Dict({"a": Box(0.0, 1.0, shape=(batch_size, 2), dtype=np.float32)}), ]) value_space.seed(42) for fw, sess in framework_iterator(session=True): if fw == "torch": cls = TorchMultiActionDistribution child_distr_cls = [ TorchCategorical, TorchDiagGaussian, partial(TorchBeta, low=low, high=high), ] else: cls = MultiActionDistribution child_distr_cls = [ Categorical, DiagGaussian, partial(Beta, low=low, high=high), ] inputs = list(input_space.sample()) distr = cls( np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), model={}, action_space=value_space, child_distributions=child_distr_cls, input_lens=[4, 6, 4], ) # Adjust inputs for the Beta distr just as Beta itself does. inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER)) inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 # Sample deterministically. expected_det = [ np.argmax(inputs[0], axis=-1), inputs[1][:, :3], # [:3]=Mean values. # Mean for a Beta distribution: # 1 / [1 + (beta/alpha)] * range + low (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, 0:2])) * (high - low) + low, ] out = distr.deterministic_sample() if sess: out = sess.run(out) check(out[0], expected_det[0]) check(out[1], expected_det[1]) check(out[2]["a"], expected_det[2]) # Stochastic sampling -> expect roughly the mean. inputs = list(input_space.sample()) # Fix categorical inputs (not needed for distribution itself, but # for our expectation calculations). inputs[0] = softmax(inputs[0], -1) # Fix std inputs (shouldn't be too large for this test). inputs[1][:, 3:] = std_space.sample() # Adjust inputs for the Beta distr just as Beta itself does. inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER)) inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 distr = cls( np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), model={}, action_space=value_space, child_distributions=child_distr_cls, input_lens=[4, 6, 4], ) expected_mean = [ np.mean(np.sum(inputs[0] * np.array([0, 1, 2, 3]), -1)), inputs[1][:, :3], # [:3]=Mean values. # Mean for a Beta distribution: # 1 / [1 + (beta/alpha)] * range + low (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, :2])) * (high - low) + low, ] out = distr.sample() if sess: out = sess.run(out) out = list(out) if fw == "torch": out[0] = out[0].numpy() out[1] = out[1].numpy() out[2]["a"] = out[2]["a"].numpy() check(np.mean(out[0]), expected_mean[0], decimals=1) check(np.mean(out[1], 0), np.mean(expected_mean[1], 0), decimals=1) check(np.mean(out[2]["a"], 0), np.mean(expected_mean[2], 0), decimals=1) # Test log-likelihood outputs. # Make sure beta-values are within 0.0 and 1.0 for the numpy # calculation (which doesn't have scaling). inputs = list(input_space.sample()) # Adjust inputs for the Beta distr just as Beta itself does. 
inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER)) inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 distr = cls( np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), model={}, action_space=value_space, child_distributions=child_distr_cls, input_lens=[4, 6, 4], ) inputs[0] = softmax(inputs[0], -1) values = list(value_space.sample()) log_prob_beta = np.log( beta.pdf(values[2]["a"], inputs[2]["a"][:, :2], inputs[2]["a"][:, 2:])) # Now do the up-scaling for [2] (beta values) to be between # low/high. values[2]["a"] = values[2]["a"] * (high - low) + low inputs[1][:, 3:] = np.exp(inputs[1][:, 3:]) expected_log_llh = np.sum( np.concatenate( [ np.expand_dims( np.log([ i[values[0][j]] for j, i in enumerate(inputs[0]) ]), -1, ), np.log( norm.pdf(values[1], inputs[1][:, :3], inputs[1][:, 3:])), log_prob_beta, ], -1, ), -1, ) values[0] = np.expand_dims(values[0], -1) if fw == "torch": values = tree.map_structure(lambda s: torch.Tensor(s), values) # Test all flattened input. concat = np.concatenate(tree.flatten(values), -1).astype(np.float32) out = distr.logp(concat) if sess: out = sess.run(out) check(out, expected_log_llh, atol=15) # Test structured input. out = distr.logp(values) if sess: out = sess.run(out) check(out, expected_log_llh, atol=15) # Test flattened input. out = distr.logp(tree.flatten(values)) if sess: out = sess.run(out) check(out, expected_log_llh, atol=15)
class GymEnvWrapper(gym.Env): """Wraps an OpenAI Gym environment to be able to modify its dimensions corresponding to MDP Playground. The documentation for the supported dimensions below can be found in mdp_playground/envs/rl_toy_env.py. Currently supported dimensions: transition noise (discrete) reward delay reward noise Also supports wrapping with AtariPreprocessing from OpenAI Gym or wrap_deepmind from Ray Rllib. """ # Should not be a gym.Wrapper because 1) gym.Wrapper has member variables observation_space and action_space while here with irrelevant_features we would have multiple observation_spaces and this could cause conflict with code that assumes any subclass of gym.Wrapper should have these member variables. # However, it _should_ be at least a gym.Env # Does it need to be a subclass of base_class because some external code # may check if it's an AtariEnv, for instance, and do further stuff based # on that? def __init__(self, env, **config): self.config = copy.deepcopy(config) # self.env = config["env"] self.env = env seed_int = None if "seed" in config: seed_int = config["seed"] self.seed(seed_int) # seed # IMP Move below code from here to seed()? Because if seed is called # during the run of an env, the expectation is that all obs., act. space, # etc. seeds are set? Only Atari in Gym seems to do something similar, the # others I saw there don't seem to set seed for obs., act. spaces. self.env.seed( seed_int ) # seed ###IMP Apparently Atari also has a seed. :/ Without this, for beam_rider(?), about 1 in 5 times I got reward of 88.0 and 44.0 the remaining times with the same action sequence!! With setting this seed, I got the same reward of 44.0 when I ran about 20 times.; ##TODO If this is really a wrapper, should it be modifying the seed of the env? obs_space_seed = self.np_random.randint(sys.maxsize) # random act_space_seed = self.np_random.randint(sys.maxsize) # random self.env.observation_space.seed(obs_space_seed) # seed self.env.action_space.seed(act_space_seed) # seed # if "dummy_eval" in config: #hack # del config["dummy_eval"] if "delay" in config: self.delay = config["delay"] assert config["delay"] >= 0 self.reward_buffer = [0.0] * (self.delay) else: self.delay = 0 if "transition_noise" in config: self.transition_noise = config["transition_noise"] if config["state_space_type"] == "continuous": assert callable(self.transition_noise), ( "transition_noise must be a function when env is continuous, it was of type:" + str(type(self.transition_noise))) else: assert self.transition_noise <= 1.0 and self.transition_noise >= 0.0, ( "transition_noise must be a value in [0.0, 1.0] when env is discrete, it was:" + str(self.transition_noise)) else: if config["state_space_type"] == "discrete": self.transition_noise = 0.0 else: self.transition_noise = lambda a: 0.0 if "reward_noise" in config: if callable(config["reward_noise"]): self.reward_noise = config["reward_noise"] else: reward_noise_std = config["reward_noise"] self.reward_noise = lambda a: a.normal(0, reward_noise_std) else: self.reward_noise = None if ("wrap_deepmind_ray" in config and config["wrap_deepmind_ray"]): # hack ##TODO remove? 
self.env = wrap_deepmind(self.env, dim=42, framestack=True) elif "atari_preprocessing" in config and config["atari_preprocessing"]: self.frame_skip = 4 # default for AtariPreprocessing if "frame_skip" in config: self.frame_skip = config["frame_skip"] self.grayscale_obs = False if "grayscale_obs" in config: self.grayscale_obs = config["grayscale_obs"] # Use AtariPreprocessing with frame_skip # noop_max set to 1 because we want to keep the vanilla env as # deterministic as possible and setting it to 0 was not allowed. ##TODO # noop_max=0 is possible in newer Gym versions, so update the Gym version. self.env = AtariPreprocessing( self.env, frame_skip=self.frame_skip, grayscale_obs=self.grayscale_obs, noop_max=1, ) print("self.env.noop_max set to: ", self.env.noop_max) if "irrelevant_features" in config: # self.irrelevant_features = config["irrelevant_features"] irr_toy_env_conf = config["irrelevant_features"] if "seed" not in irr_toy_env_conf: irr_toy_env_conf["seed"] = self.np_random.randint( sys.maxsize) # random self.irr_toy_env = RLToyEnv(**irr_toy_env_conf) if config["state_space_type"] == "discrete": self.action_space = Tuple( (self.env.action_space, self.irr_toy_env.action_space)) self.observation_space = Tuple( (self.env.observation_space, self.irr_toy_env.observation_space) ) # TODO for image observations, concatenate to 1 obs. space here and in step() and reset()? else: # TODO Check the test case added for cont. irr features case and code for it in run_experiments.py. env_obs_low = self.env.observation_space.low env_obs_high = self.env.observation_space.high env_obs_dtype = env_obs_low.dtype env_obs_shape = env_obs_low.shape irr_env_obs_low = self.irr_toy_env.observation_space.low irr_env_obs_high = self.irr_toy_env.observation_space.high irr_env_obs_dtype = self.irr_toy_env.observation_space.low.dtype assert env_obs_dtype == irr_env_obs_dtype, ( "Datatypes of base env and irrelevant toy env should match. Were: " + str(env_obs_dtype) + ", " + str(irr_env_obs_dtype)) ext_low = np.concatenate((env_obs_low, irr_env_obs_low)) ext_high = np.concatenate((env_obs_high, irr_env_obs_high)) self.observation_space = Box(low=ext_low, high=ext_high, dtype=env_obs_dtype) env_act_low = self.env.action_space.low env_act_high = self.env.action_space.high env_act_dtype = env_act_low.dtype self.env_act_shape = env_act_low.shape assert (len(self.env_act_shape) == 1 ), "Length of shape of action space should be 1." irr_env_act_low = self.irr_toy_env.action_space.low irr_env_act_high = self.irr_toy_env.action_space.high irr_env_act_dtype = irr_env_act_low.dtype # assert env_obs_dtype == env_act_dtype, "Datatypes of obs. and act. of # base env should match. Were: " + str(env_obs_dtype) + ", " + # str(env_act_dtype) #TODO Apparently, observations are np.float64 and # actions np.float32 for Mujoco. ext_low = np.concatenate((env_act_low, irr_env_act_low)) ext_high = np.concatenate((env_act_high, irr_env_act_high)) self.action_space = Box( low=ext_low, high=ext_high, dtype=env_act_dtype ) # TODO Use BoxExtended here and above?
self.observation_space.seed(obs_space_seed) # seed self.action_space.seed(act_space_seed) # seed else: self.action_space = self.env.action_space self.observation_space = self.env.observation_space self.total_episodes = 0 # if "action_loss_weight" in config: #hack # del config["action_loss_weight"] # if "action_space_max" in config: #hack # action_space_max = config["action_space_max"] # del config["action_space_max"] # if "time_unit" in config: #hack # time_unit = config["time_unit"] # del config["time_unit"] # if "dummy_seed" in config: #hack # del config["dummy_seed"] super(GymEnvWrapper, self).__init__() # if "action_space_max" in locals(): # print("Setting Mujoco self.action_space.low, self.action_space.high from:", self.action_space.low, self.action_space.high) # self.action_space.low *= action_space_max # self.action_space.high *= action_space_max # print("to:", self.action_space.low, self.action_space.high) # if base_class == HalfCheetahEnv and action_space_max >= 4: #hack # self.model.opt.timestep /= 2 # 0.005 # self.frame_skip *= 2 # print("Setting Mujoco timestep to", self.model.opt.timestep, "half of the usual to avoid instabilities. At the same time action repeat increased to twice its usual.") # if "time_unit" in locals(): #hack In HalfCheetah, this is needed because the reward function is dependent on the time_unit: it depends on the velocity achieved, which depends on the amount of time torque was applied. In Pusher, Reacher, it is also needed because the reward is similar to the distance from current position to goal at _each_ step, which means if we calculate the reward multiple times in the same amount of "real" time, we'd need to average out the reward the more times we calculate the reward in the same amount of "real" time (i.e., when we have shorter acting timesteps). This is not the case with the toy environments because there the reward is the amount of distance moved from current position to goal in the current timestep, so it's dependent on "real" time and not on acting timesteps. # self.frame_skip *= time_unit # self.frame_skip = int(self.frame_skip) # self._ctrl_cost_weight *= time_unit # self._forward_reward_weight *= time_unit # print("Setting Mujoco self.frame_skip, self._ctrl_cost_weight, self._forward_reward_weight to", self.frame_skip, self._ctrl_cost_weight, self._forward_reward_weight, "corresponding to time_unit in config.") def step(self, action): # next_state, reward, done, info = super(GymEnvWrapper, self).step(action) self.total_transitions_episode += 1 if (self.config["state_space_type"] == "discrete" and self.transition_noise > 0.0): probs = (np.ones(shape=(self.env.action_space.n, )) * self.transition_noise / (self.env.action_space.n - 1)) probs[action] = 1 - self.transition_noise old_action = action action = int( self.np_random.choice(self.env.action_space.n, size=1, p=probs)) # random if old_action != action: # print("NOISE inserted", old_action, action) self.total_noisy_transitions_episode += 1 else: # cont.
envs pass # TODO # self.total_abs_noise_in_transition_episode += np.abs(noise_in_transition) if "irrelevant_features" in self.config: if self.config["state_space_type"] == "discrete": next_state, reward, done, info = self.env.step(action[0]) next_state_irr, _, done_irr, _ = self.irr_toy_env.step( action[1]) next_state = tuple([next_state, next_state_irr]) else: next_state, reward, done, info = self.env.step( action[:self.env_act_shape[0]]) next_state_irr, _, done_irr, _ = self.irr_toy_env.step( action[self.env_act_shape[0]:]) next_state = np.concatenate((next_state, next_state_irr)) else: next_state, reward, done, info = self.env.step(action) if done: # if episode is finished return the rewards that were delayed and not # handed out before ##TODO add test case for this reward = np.sum(self.reward_buffer) else: self.reward_buffer.append(reward) old_reward = reward reward = self.reward_buffer[0] # print("rewards:", self.reward_buffer, old_reward, reward) del self.reward_buffer[0] # random ###TODO Would be better to parameterise this in terms of state, # action and time_step as well. Would need to change implementation to # have a queue for the rewards achieved and then pick the reward that was # generated delay timesteps ago. noise_in_reward = (self.reward_noise(self.np_random) if self.reward_noise else 0) self.total_abs_noise_in_reward_episode += np.abs(noise_in_reward) self.total_reward_episode += reward reward += noise_in_reward return next_state, reward, done, info def reset(self): # on episode "end" stuff (to not be invoked when reset() called when # self.total_episodes = 0; end is in quotes because it may not be a true # episode end reached by reaching a terminal state, but reset() may have # been called in the middle of an episode): if not self.total_episodes == 0: print( "Noise stats for previous episode num.: " + str(self.total_episodes) + " (total abs. noise in rewards, total abs. noise in transitions, total reward, total noisy transitions, total transitions): " + str(self.total_abs_noise_in_reward_episode) + " " + str(self.total_abs_noise_in_transition_episode) + " " + str(self.total_reward_episode) + " " + str(self.total_noisy_transitions_episode) + " " + str(self.total_transitions_episode)) # on episode start stuff: self.reward_buffer = [0.0] * (self.delay) self.total_episodes += 1 self.total_abs_noise_in_reward_episode = 0 self.total_abs_noise_in_transition_episode = ( 0 # only present in continuous spaces ) self.total_noisy_transitions_episode = 0 # only present in discrete spaces self.total_reward_episode = 0 self.total_transitions_episode = 0 if "irrelevant_features" in self.config: if self.config["state_space_type"] == "discrete": reset_state = self.env.reset() reset_state_irr = self.irr_toy_env.reset() reset_state = tuple([reset_state, reset_state_irr]) else: reset_state = self.env.reset() reset_state_irr = self.irr_toy_env.reset() reset_state = np.concatenate((reset_state, reset_state_irr)) else: reset_state = self.env.reset() return reset_state # return super(GymEnvWrapper, self).reset() def seed(self, seed=None): """Initialises the Numpy RNG for the environment by calling a utility for this in Gym. Parameters ---------- seed : int seed to initialise the np_random instance held by the environment. Cannot use numpy.int64 or similar because Gym doesn't accept it. Returns ------- int The seed returned by Gym """ # If seed is None, you get a randomly generated seed from gym.utils... 
self.np_random, self.seed_ = gym.utils.seeding.np_random( seed) # random print("Env SEED set to: " + str(seed) + ". Returned seed from Gym: " + str(self.seed_)) return self.seed_
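# Two small standalone sketches (not part of GymEnvWrapper) of the bookkeeping the
# wrapper above implements.
#
# 1) Discrete transition noise: keep the chosen action with probability 1 - p and
#    spread p uniformly over the remaining actions.
import numpy as np

n_actions, p, action = 4, 0.2, 1
probs = np.ones(n_actions) * p / (n_actions - 1)
probs[action] = 1.0 - p
assert np.isclose(probs.sum(), 1.0)

# 2) Reward delay: rewards enter a FIFO buffer and the reward handed out at step t is
#    the one generated `delay` steps earlier; on episode end the buffered remainder is
#    summed (note the wrapper above does not append the final step's raw reward first).
class RewardDelay:
    def __init__(self, delay):
        self.delay = delay
        self.buffer = [0.0] * delay

    def __call__(self, raw_reward, done):
        if done:
            return sum(self.buffer)
        self.buffer.append(raw_reward)
        return self.buffer.pop(0)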
class Imagination: def __init__(self, model, n_actors, horizon, measure): """ Imaginary MDP Args: model: models.Model object n_actors: number of parallel episodes horizon: length of the episode measure: the reward function """ self.model = model self.n_actors = n_actors self.horizon = horizon self.measure = measure self.ensemble_size = model.ensemble_size self.action_space = Box(low=-1.0, high=1.0, shape=(n_actors, self.model.d_action), dtype=np.float32) self.action_space.seed(np.random.randint(np.iinfo(np.uint32).max)) self.init_state = None self.states = None self.steps = None def step(self, actions): n_act = self.n_actors es = self.ensemble_size actions = actions.to(self.model.device) # get next state distribution for all models with torch.no_grad(): next_state_means, next_state_vars = self.model.forward_all(self.states, actions) # shape: (n_actors, ensemble_size, d_state) i = torch.arange(n_act).to(self.model.device) j = torch.randint(es, size=(n_act,)).to(self.model.device) next_states = self.model.sample(next_state_means[i, j], next_state_vars[i, j]) # shape: (n_actors, d_state) #print (next_state_vars[i, j]) if torch.any(torch.isnan(next_states)).item(): warnings.warn("NaN in sampled next states!") if torch.any(torch.isinf(next_states)).item(): warnings.warn("Inf in sampled next states!") # compute measure measures = self.measure(self.states, # shape: (n_actors, d_state) actions, # shape: (n_actors, d_action) next_states, # shape: (n_actors, d_state) next_state_means, # shape: (n_actors, ensemble_size, d_state) next_state_vars, # shape: (n_actors, ensemble_size, d_state) self.model) self.states = next_states self.steps += 1 done = False if self.steps >= self.horizon: done = True return next_states, measures, done, {} def reset(self): states = torch.from_numpy(self.init_state).float() states = states.unsqueeze(0) states = states.repeat(self.n_actors, 1) states = states.to(self.model.device) self.steps = 0 self.states = states # shape: (n_actors, d_state) return states def update_init_state(self, state): self.init_state = state
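# Standalone sketch of the per-actor ensemble pick in Imagination.step() above: for
# each of the n_actors rows, one ensemble member is drawn uniformly at random and its
# predicted mean/variance row is selected via advanced indexing. The shapes below are
# assumptions matching the comments in the class.
import torch

n_actors, ensemble_size, d_state = 5, 3, 4
means = torch.randn(n_actors, ensemble_size, d_state)
variances = torch.rand(n_actors, ensemble_size, d_state)
i = torch.arange(n_actors)
j = torch.randint(ensemble_size, size=(n_actors,))
picked_means = means[i, j]      # shape: (n_actors, d_state)
picked_vars = variances[i, j]   # shape: (n_actors, d_state)
assert picked_means.shape == (n_actors, d_state)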
def test_forward_multiple() -> None: """ Test forward() when none of the layers are fully shared. The function computed by the network should be: - f(x) = 3 * tanh(2 * tanh(x + 1) + 2) + 3 for task 0 - f(x) = -3 * tanh(-2 * tanh(x + 1) - 2) - 3 for task 1 - f(x) = -3 * tanh(1/2 * tanh(-x - 1) + 1/2) - 3 for task 2 - f(x) = 3 * tanh(-2 * tanh(-x - 1) - 2) + 3 for task 3 """ # Set up case. dim = BASE_SETTINGS["obs_dim"] + BASE_SETTINGS["num_tasks"] observation_subspace = Box(low=-np.inf, high=np.inf, shape=(BASE_SETTINGS["obs_dim"], )) observation_subspace.seed(DEFAULT_SETTINGS["seed"]) hidden_size = dim # Construct network. network = BaseMultiTaskSplittingNetwork( input_size=dim, output_size=dim, num_tasks=BASE_SETTINGS["num_tasks"], num_layers=BASE_SETTINGS["num_layers"], hidden_size=hidden_size, device=BASE_SETTINGS["device"], ) # Split the network at multiple layers so that no layer remains fully shared: layer 0 # splits tasks [0, 1] from [2, 3], layer 1 is split twice (leaving task 0, tasks [1, 3], # and task 2 on separate copies), and layer 2 splits tasks [0, 3] from [1, 2]. network.split(0, 0, [0, 1], [2, 3]) network.split(1, 0, [0, 2], [1, 3]) network.split(1, 0, [0], [2]) network.split(2, 0, [0, 3], [1, 2]) # Set network weights. state_dict = network.state_dict() for i in range(BASE_SETTINGS["num_layers"]): for j in range(3): weight_name = "regions.%d.%d.0.weight" % (i, j) bias_name = "regions.%d.%d.0.bias" % (i, j) if weight_name not in state_dict: continue if j == 0: state_dict[weight_name] = torch.Tensor( (i + 1) * np.identity(dim)) state_dict[bias_name] = torch.Tensor((i + 1) * np.ones(dim)) elif j == 1: state_dict[weight_name] = torch.Tensor(-(i + 1) * np.identity(dim)) state_dict[bias_name] = torch.Tensor(-(i + 1) * np.ones(dim)) elif j == 2: state_dict[weight_name] = torch.Tensor(1 / (i + 1) * np.identity(dim)) state_dict[bias_name] = torch.Tensor(1 / (i + 1) * np.ones(dim)) else: raise NotImplementedError network.load_state_dict(state_dict) # Construct batch of observations concatenated with one-hot task vectors. obs, task_indices = get_obs_batch( batch_size=BASE_SETTINGS["num_processes"], obs_space=observation_subspace, num_tasks=BASE_SETTINGS["num_tasks"], ) # Get output of network. output = network(obs, task_indices) # Compute expected output of network. expected_output = torch.zeros(obs.shape) for i, (ob, task) in enumerate(zip(obs, task_indices)): if task == 0: expected_output[i] = 3 * torch.tanh(2 * torch.tanh(ob + 1) + 2) + 3 elif task == 1: expected_output[i] = -3 * torch.tanh(-2 * torch.tanh(ob + 1) - 2) - 3 elif task == 2: expected_output[i] = ( -3 * torch.tanh(1 / 2 * torch.tanh(-ob - 1) + 1 / 2) - 3) elif task == 3: expected_output[i] = 3 * torch.tanh(-2 * torch.tanh(-ob - 1) - 2) + 3 else: raise NotImplementedError # Test output of network. assert torch.allclose(output, expected_output)
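# Standalone numeric check (assuming, as the expected outputs above imply, a tanh after
# every layer except the last) that task 0's path through the split network, with layer
# i carrying weight (i + 1) * I and bias (i + 1) * 1, composes to
# 3 * tanh(2 * tanh(x + 1) + 2) + 3.
import torch

x = torch.randn(6)
h = torch.tanh(1.0 * x + 1.0)   # layer 0, copy 0
h = torch.tanh(2.0 * h + 2.0)   # layer 1, copy 0
y = 3.0 * h + 3.0               # layer 2 (output layer), copy 0, no nonlinearity
assert torch.allclose(y, 3 * torch.tanh(2 * torch.tanh(x + 1) + 2) + 3)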