import unittest

import numpy as np
import torch

from normalizer import TransitionNormalizer   # local module; adjust the import path if the class lives elsewhere


class TestNormalizer(unittest.TestCase):
    def setUp(self):
        self.n_samples = 1000
        self.d_state = 10
        self.d_action = 5

        self.normalizer = TransitionNormalizer()

        # random transitions; deltas are next_state - state
        self.states = [np.random.random(self.d_state) for _ in range(self.n_samples)]
        self.actions = [np.random.random(self.d_action) for _ in range(self.n_samples)]
        self.next_states = [np.random.random(self.d_state) for _ in range(self.n_samples)]
        self.state_deltas = [next_state - state for state, next_state in zip(self.states, self.next_states)]

        for state, action, state_delta in zip(self.states, self.actions, self.state_deltas):
            state = torch.from_numpy(state).float().clone()
            action = torch.from_numpy(action).float().clone()
            state_delta = torch.from_numpy(state_delta).float().clone()
            self.normalizer.update(state, action, state_delta)

    def test_stats(self):
        # running statistics must match the batch statistics of the data fed to update()
        self.assertTrue(np.allclose(np.array(self.states).mean(axis=0), self.normalizer.state_mean))
        self.assertTrue(np.allclose(np.array(self.actions).mean(axis=0), self.normalizer.action_mean))
        self.assertTrue(np.allclose(np.array(self.state_deltas).mean(axis=0), self.normalizer.state_delta_mean))
        self.assertTrue(np.allclose(np.array(self.states).std(axis=0), self.normalizer.state_stdev))
        self.assertTrue(np.allclose(np.array(self.actions).std(axis=0), self.normalizer.action_stdev))
        self.assertTrue(np.allclose(np.array(self.state_deltas).std(axis=0), self.normalizer.state_delta_stdev))

    def test_tensor_shape_handling(self):
        # normalization must be invariant to extra leading singleton dimensions
        x = torch.rand(self.d_state)
        a = self.normalizer.normalize_states(x)
        y = x.clone()
        y = y.unsqueeze(0).unsqueeze(0).unsqueeze(0)
        b = self.normalizer.normalize_states(y)
        self.assertTrue(np.allclose(a, b))
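# --- illustrative sketch, not part of the repository ---------------------------
# The assertions above only make sense if TransitionNormalizer maintains running
# means and standard deviations across update() calls. A minimal stand-in using
# Welford's online algorithm is sketched below to show the kind of bookkeeping
# the test verifies; the real class may differ in detail (e.g. it tracks states,
# actions and state deltas separately and handles tensors of arbitrary shape).

class RunningStats:
    """Running mean / population stdev of fixed-dimension vectors (sketch only)."""

    def __init__(self, dim):
        self.count = 0
        self.mean = torch.zeros(dim)
        self.m2 = torch.zeros(dim)            # sum of squared deviations from the mean

    def update(self, x):
        self.count += 1
        delta = x - self.mean
        self.mean += delta / self.count
        self.m2 += delta * (x - self.mean)

    @property
    def stdev(self):
        # population standard deviation, matching numpy's default std(axis=0)
        return torch.sqrt(self.m2 / self.count)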
def do_exploitation(seed, normalize_data, n_exploration_steps, buffer_file, ensemble_size, benchmark_utility, _log, _run):
    if len(buffer_file):
        with gzip.open(buffer_file, 'rb') as f:
            buffer = pickle.load(f)
        buffer.ensemble_size = ensemble_size
    else:
        env = get_env()
        env.seed(seed)
        atexit.register(lambda: env.close())

        buffer = get_buffer()
        if normalize_data:
            normalizer = TransitionNormalizer()
            buffer.setup_normalizer(normalizer)

        state = env.reset()
        for step_num in range(1, n_exploration_steps + 1):
            action = env.action_space.sample()
            next_state, reward, done, info = env.step(action)
            buffer.add(state, action, next_state)

            if done:
                _log.info(f"step: {step_num}\tepisode complete")
                next_state = env.reset()

            state = next_state

    if benchmark_utility:
        return evaluate_utility(buffer=buffer)
    else:
        return evaluate_tasks(buffer=buffer, step_num=0)
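# --- illustrative sketch, not part of the repository ---------------------------
# do_exploitation expects buffer_file to be a gzip-compressed pickle of a Buffer.
# The write side would look roughly like this; the helper name and path are only
# examples (presumably the checkpoint() calls below produce such files).

def save_buffer(buffer, path):
    with gzip.open(path, 'wb') as f:
        pickle.dump(buffer, f, protocol=pickle.HIGHEST_PROTOCOL)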
def do_random_exploration(seed, normalize_data, n_exploration_steps, n_warm_up_steps, eval_freq, _log):
    env = get_env()
    env.seed(seed)
    atexit.register(lambda: env.close())

    buffer = get_buffer()
    if normalize_data:
        normalizer = TransitionNormalizer()
        buffer.setup_normalizer(normalizer)

    average_performances = []
    state = env.reset()
    for step_num in range(1, n_exploration_steps + 1):
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        buffer.add(state, action, next_state)

        if done:
            _log.info(f"step: {step_num}\tepisode complete")
            next_state = env.reset()

        state = next_state

        time_to_evaluate = ((step_num % eval_freq) == 0)
        just_finished_warm_up = (step_num == n_warm_up_steps)
        if time_to_evaluate or just_finished_warm_up:
            average_performance = evaluate_tasks(buffer=buffer, step_num=step_num)
            average_performances.append(average_performance)

    checkpoint(buffer=buffer, step_num=n_exploration_steps)

    return max(average_performances)
def get_buffer(d_state, d_action, n_total_steps, normalize_data, device):
    data_buffer_size = n_total_steps
    buffer = Buffer(d_action=d_action, d_state=d_state, size=data_buffer_size)
    if normalize_data:
        buffer.setup_normalizer(TransitionNormalizer(d_state, d_action, device))
    return buffer
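# --- illustrative usage, not part of the repository -----------------------------
# In the exploration routines above get_buffer() is called without arguments, so
# these parameters are presumably supplied by the experiment configuration. Calling
# it directly would look roughly like this, assuming a Gym-style environment with
# Box observation/action spaces; the step count and device are placeholder values.

env = get_env()
buffer = get_buffer(d_state=env.observation_space.shape[0],
                    d_action=env.action_space.shape[0],
                    n_total_steps=20000,
                    normalize_data=True,
                    device='cuda:0')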
def do_max_exploration(seed, action_noise_stdev, n_exploration_steps, n_warm_up_steps, model_train_freq,
                       exploring_model_epochs, eval_freq, checkpoint_frequency, render, record, dump_dir,
                       _config, _log, _run):
    env = get_env()
    env.seed(seed)
    atexit.register(lambda: env.close())

    buffer = get_buffer()
    exploration_measure = get_utility_measure()

    if _config['normalize_data']:
        normalizer = TransitionNormalizer()
        buffer.setup_normalizer(normalizer)

    model = None
    mdp = None
    agent = None
    average_performances = []

    if record:
        video_filename = f"{dump_dir}/exploration_0.mp4"
        state = env.reset(filename=video_filename)
    else:
        state = env.reset()

    for step_num in range(1, n_exploration_steps + 1):
        if step_num > n_warm_up_steps:
            # act greedily w.r.t. the exploration utility under the current model
            action, mdp, agent, policy_value = act(state=state, agent=agent, mdp=mdp, buffer=buffer,
                                                   model=model, measure=exploration_measure, mode='explore')
            _run.log_scalar("action_norm", np.sum(np.square(action)), step_num)
            _run.log_scalar("exploration_policy_value", policy_value, step_num)

            if action_noise_stdev:
                action = action + np.random.normal(scale=action_noise_stdev, size=action.shape)
        else:
            # warm-up phase: uniformly random actions
            action = env.action_space.sample()

        next_state, reward, done, info = env.step(action)
        buffer.add(state, action, next_state)

        if step_num > n_warm_up_steps:
            _run.log_scalar("experience_novelty",
                            transition_novelty(state, action, next_state, model=model),
                            step_num)

        if render:
            env.render()

        if done:
            _log.info(f"step: {step_num}\tepisode complete")
            agent = None
            mdp = None

            if record:
                new_video_filename = f"{dump_dir}/exploration_{step_num}.mp4"
                next_state = env.reset(filename=new_video_filename)
                _run.add_artifact(video_filename)
                video_filename = new_video_filename
            else:
                next_state = env.reset()

        state = next_state

        if step_num < n_warm_up_steps:
            continue

        episode_done = done
        train_at_end_of_episode = (model_train_freq is np.inf)
        time_to_update = ((step_num % model_train_freq) == 0)
        just_finished_warm_up = (step_num == n_warm_up_steps)
        if (train_at_end_of_episode and episode_done) or time_to_update or just_finished_warm_up:
            model = fit_model(buffer=buffer, n_epochs=exploring_model_epochs, step_num=step_num, mode='explore')

            # discard old solution and MDP as models changed
            mdp = None
            agent = None

        time_to_evaluate = ((step_num % eval_freq) == 0)
        if time_to_evaluate or just_finished_warm_up:
            average_performance = evaluate_tasks(buffer=buffer, step_num=step_num)
            average_performances.append(average_performance)

        time_to_checkpoint = ((step_num % checkpoint_frequency) == 0)
        if time_to_checkpoint:
            checkpoint(buffer=buffer, step_num=step_num)

    if record:
        _run.add_artifact(video_filename)

    return max(average_performances)
class Model(nn.Module):
    # bounds for the predicted log-variance (soft-thresholded via a sigmoid below)
    min_log_var = -5
    max_log_var = -1

    def __init__(self, dev_name, d_action, d_state, n_hidden, n_layers, ensemble_size, non_linearity='leaky_relu'):
        """
        state space forward model.
        predicts mean and variance of the next state given state and action, i.e. independent
        gaussians for each dimension of the next state.
        using state and action, the delta of the state is computed.
        the mean of the delta is added to the current state to get the mean of the next state.

        there is a soft threshold on the output variance, forcing it to be in the same range as
        the variance of the training data. the thresholds are learnt in the form of bounds on
        variance and a small penalty is used to contract the distance between the lower and
        upper bounds.

        loss components:
            1. minimize negative log-likelihood of data
            2. (small weight) try to contract lower and upper bounds of variance

        Args:
            dev_name (str): device of the model
            d_action (int): dimensionality of action
            d_state (int): dimensionality of state
            n_hidden (int): size or width of hidden layers
            n_layers (int): number of hidden layers (number of non-linearities). should be >= 2
            ensemble_size (int): number of models in the ensemble
            non_linearity (str): 'linear', 'swish' or 'leaky_relu'
        """
        assert n_layers >= 2, "minimum depth of model is 2"

        super().__init__()

        layers = []
        for lyr_idx in range(n_layers + 1):
            if lyr_idx == 0:
                lyr = EnsembleDenseLayer(d_action + d_state, n_hidden, ensemble_size, non_linearity=non_linearity)
            elif 0 < lyr_idx < n_layers:
                lyr = EnsembleDenseLayer(n_hidden, n_hidden, ensemble_size, non_linearity=non_linearity)
            elif lyr_idx == n_layers:
                # final layer outputs [delta mean, log variance] for each state dimension
                lyr = EnsembleDenseLayer(n_hidden, d_state + d_state, ensemble_size, non_linearity='linear')
            layers.append(lyr)

        self.layers = nn.Sequential(*layers)

        device = torch.device(dev_name if torch.cuda.is_available() else 'cpu')
        self.to(device)
        self.device = device

        self.normalizer = None

        self.d_action = d_action
        self.d_state = d_state
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        self.ensemble_size = ensemble_size

    def setup_normalizer(self, normalizer):
        # keep a private copy of the normalizer's statistics
        self.normalizer = TransitionNormalizer()
        self.normalizer.set_state(normalizer.get_state())

    def _pre_process_model_inputs(self, states, actions):
        states = states.to(self.device)
        actions = actions.to(self.device)

        if self.normalizer is None:
            return states, actions

        states = self.normalizer.normalize_states(states)
        actions = self.normalizer.normalize_actions(actions)
        return states, actions

    def _pre_process_model_targets(self, state_deltas):
        state_deltas = state_deltas.to(self.device)

        if self.normalizer is None:
            return state_deltas

        state_deltas = self.normalizer.normalize_state_deltas(state_deltas)
        return state_deltas

    def _post_process_model_outputs(self, delta_mean, var):
        # denormalize to return in raw state space
        if self.normalizer is not None:
            delta_mean = self.normalizer.denormalize_state_delta_means(delta_mean)
            var = self.normalizer.denormalize_state_delta_vars(var)
        return delta_mean, var

    def _propagate_network(self, states, actions):
        inp = torch.cat((states, actions), dim=2)
        op = self.layers(inp)
        delta_mean, log_var = torch.split(op, op.size(2) // 2, dim=2)

        log_var = torch.sigmoid(log_var)                                             # in [0, 1]
        log_var = self.min_log_var + (self.max_log_var - self.min_log_var) * log_var
        var = torch.exp(log_var)                                                     # normal scale, not log

        return delta_mean, var

    def forward(self, states, actions):
        """
        predict next state mean and variance.
        takes in raw states and actions and internally normalizes them.

        Args:
            states (torch tensor): (ensemble_size, batch size, dim_state)
            actions (torch tensor): (ensemble_size, batch size, dim_action)

        Returns:
            next state means (torch tensor): (ensemble_size, batch size, dim_state)
            next state variances (torch tensor): (ensemble_size, batch size, dim_state)
        """
        normalized_states, normalized_actions = self._pre_process_model_inputs(states, actions)
        normalized_delta_mean, normalized_var = self._propagate_network(normalized_states, normalized_actions)
        delta_mean, var = self._post_process_model_outputs(normalized_delta_mean, normalized_var)
        next_state_mean = delta_mean + states.to(self.device)
        return next_state_mean, var

    def forward_all(self, states, actions):
        """
        predict next state mean and variance of a batch of states and actions for all models.
        takes in raw states and actions and internally normalizes them.

        Args:
            states (torch tensor): (batch size, dim_state)
            actions (torch tensor): (batch size, dim_action)

        Returns:
            next state means (torch tensor): (batch size, ensemble_size, dim_state)
            next state variances (torch tensor): (batch size, ensemble_size, dim_state)
        """
        states = states.unsqueeze(0).repeat(self.ensemble_size, 1, 1)
        actions = actions.unsqueeze(0).repeat(self.ensemble_size, 1, 1)
        next_state_means, next_state_vars = self(states, actions)
        return next_state_means.transpose(0, 1), next_state_vars.transpose(0, 1)

    def sample(self, mean, var):
        """
        sample next state, given next state mean and variance

        Args:
            mean (torch tensor): any shape
            var (torch tensor): any shape

        Returns:
            next state (torch tensor): same shape as inputs
        """
        return Normal(mean, torch.sqrt(var)).sample()

    def loss(self, states, actions, state_deltas, training_noise_stdev=0):
        """
        compute loss given states, actions and state_deltas

        the loss is actually computed between predicted state delta and actual state delta,
        both in normalized space

        Args:
            states (torch tensor): (ensemble_size, batch size, dim_state)
            actions (torch tensor): (ensemble_size, batch size, dim_action)
            state_deltas (torch tensor): (ensemble_size, batch size, dim_state)
            training_noise_stdev (float): noise to add to normalized state, action inputs and state delta outputs

        Returns:
            loss (torch 0-dim tensor): `.backward()` can be called on it to compute gradients
        """
        states, actions = self._pre_process_model_inputs(states, actions)
        targets = self._pre_process_model_targets(state_deltas)

        if not np.allclose(training_noise_stdev, 0):
            states += torch.randn_like(states) * training_noise_stdev
            actions += torch.randn_like(actions) * training_noise_stdev
            targets += torch.randn_like(targets) * training_noise_stdev

        mu, var = self._propagate_network(states, actions)      # delta means and variances

        # negative log-likelihood of a diagonal gaussian (up to additive constants)
        loss = (mu - targets) ** 2 / var + torch.log(var)
        loss = torch.mean(loss)
        return loss

    def likelihood(self, states, actions, next_states):
        """
        compute the log-likelihood of raw (un-normalized) states, actions and next_states

        Args:
            states (torch tensor): (ensemble_size, batch size, dim_state)
            actions (torch tensor): (ensemble_size, batch size, dim_action)
            next_states (torch tensor): (ensemble_size, batch size, dim_state)

        Returns:
            likelihood (torch tensor): (batch size)
        """
        next_states = next_states.to(self.device)

        with torch.no_grad():
            mu, var = self(states, actions)                     # next state means and variances

        pdf = Normal(mu, torch.sqrt(var))
        log_likelihood = pdf.log_prob(next_states)

        # mean over all state components and models
        log_likelihood = log_likelihood.mean(dim=2).mean(dim=0)

        return log_likelihood
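# --- illustrative sketch, not part of the repository ---------------------------
# Typical training and prediction with the ensemble model above, using random
# tensors in place of buffer samples (each ensemble member would normally see its
# own batch drawn from the replay buffer). Hyperparameters are placeholder values;
# only Model's own methods (loss, forward_all, sample) come from the class above.

d_state, d_action, ensemble_size, batch_size = 10, 5, 8, 256

model = Model(dev_name='cuda:0', d_action=d_action, d_state=d_state,
              n_hidden=256, n_layers=3, ensemble_size=ensemble_size)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for _ in range(100):
    states = torch.rand(ensemble_size, batch_size, d_state)
    actions = torch.rand(ensemble_size, batch_size, d_action)
    state_deltas = torch.rand(ensemble_size, batch_size, d_state)

    optimizer.zero_grad()
    loss = model.loss(states, actions, state_deltas, training_noise_stdev=1e-3)
    loss.backward()
    optimizer.step()

# prediction on raw (un-normalized) inputs of shape (batch_size, dim)
with torch.no_grad():
    means, variances = model.forward_all(torch.rand(32, d_state), torch.rand(32, d_action))
    next_states = model.sample(means, variances)      # (32, ensemble_size, d_state)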