def _get_consecutive_observations(self, start_idx, num_steps):
    # `np.int` was a deprecated alias of `int`; `np.integer` catches NumPy
    # integer scalars, which is what this check is meant to exclude.
    if num_steps == 0 and not isinstance(start_idx, (int, np.integer)):
        # A batch of indexes: stack the observations and add a time dimension.
        observation = stack_list_of_tuples(self.memory[start_idx])
        return Observation(*map(lambda x: x.unsqueeze(1), observation))
    num_steps = max(1, num_steps)
    if start_idx + num_steps <= self.max_len:
        obs_list = self.memory[start_idx:start_idx + num_steps]
    else:  # The trajectory is split by the circular buffer.
        delta_idx = start_idx + num_steps - self.max_len
        obs_list = np.concatenate(
            (self.memory[start_idx:self.max_len], self.memory[:delta_idx]))
    return stack_list_of_tuples(obs_list)

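# A minimal, self-contained sketch (not part of this codebase) of the
# wraparound branch above: when start_idx + num_steps runs past max_len,
# the window is assembled from the tail and the head of the ring buffer.
import numpy as np

memory = np.arange(10)  # Toy circular buffer with max_len = 10.
max_len, start_idx, num_steps = 10, 8, 4
delta_idx = start_idx + num_steps - max_len
window = np.concatenate((memory[start_idx:max_len], memory[:delta_idx]))
assert (window == np.array([8, 9, 0, 1])).all()
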
def simulate(
    self, initial_state, policy, initial_action=None, logger=None, stack_obs=True
):
    """Simulate a set of particles starting from `state' and following `policy'."""
    if self.num_samples > 0:
        initial_state = repeat_along_dimension(
            initial_state, number=self.num_samples, dim=0
        )
        initial_state = initial_state.reshape(-1, *self.dynamical_model.dim_state)
        if initial_action is not None:
            initial_action = repeat_along_dimension(
                initial_action, number=self.num_samples, dim=0
            )
            initial_action = initial_action.reshape(*initial_state.shape[:-1], -1)

    trajectory = rollout_model(
        dynamical_model=self.dynamical_model,
        reward_model=self.reward_model,
        policy=policy,
        initial_state=initial_state,
        initial_action=initial_action,
        max_steps=self.num_steps,
        termination_model=self.termination_model,
    )

    if not stack_obs:
        self._log_trajectory(trajectory)
        return trajectory
    else:
        observation = stack_list_of_tuples(trajectory, dim=initial_state.ndim - 1)
        self._log_observation(observation)
        return observation

def forward(self, observation):
    """Compute the losses.

    Given an Observation, it will compute the losses directly.
    Given a list of Trajectories, it tries to stack them to vectorize
    operations. If stacking fails, it will iterate over the trajectories.
    """
    if isinstance(observation, Observation):
        trajectories = [observation]
    elif len(observation) > 1:
        try:
            # When possible, stack to parallelize the trajectories.
            # This requires all trajectories to be of equal length.
            trajectories = [stack_list_of_tuples(observation)]
        except RuntimeError:
            trajectories = observation
    else:
        trajectories = observation

    self.reset_info()

    loss = Loss()
    for trajectory in trajectories:
        loss += self.actor_loss(trajectory)
        loss += self.critic_loss(trajectory)
        loss += self.regularization_loss(trajectory, len(trajectories))

    return loss / len(trajectories)

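# Why the try/except above works: torch.stack raises a RuntimeError when the
# trajectories are ragged, so equal-length batches get vectorized and unequal
# ones fall back to a per-trajectory loop. A small sketch:
import torch

equal = [torch.zeros(3), torch.zeros(3)]
ragged = [torch.zeros(3), torch.zeros(2)]
torch.stack(equal)  # Fine: equal lengths stack into a (2, 3) tensor.
try:
    torch.stack(ragged)
except RuntimeError:
    pass  # Handled above by iterating over the trajectories instead.
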
def end_episode(self):
    """See `AbstractAgent.end_episode'.

    If the agent is training, and the base model is a GP Model, then add the
    transitions to the GP, and summarize and sparsify the GP Model.

    Then train the agent.
    """
    if self.training:
        if isinstance(self.dynamical_model.base_model, ExactGPModel):
            observation = stack_list_of_tuples(self.last_trajectory)
            for transform in self.dataset.transformations:
                observation = transform(observation)
            print(colorize("Add data to GP Model", "yellow"))
            self.dynamical_model.base_model.add_data(
                observation.state,
                observation.action[
                    ..., :self.dynamical_model.base_model.dim_action[0]],
                observation.next_state,
            )

            print(colorize("Summarize GP Model", "yellow"))
            self.dynamical_model.base_model.summarize_gp()

            for i, gp in enumerate(self.dynamical_model.base_model.gp):
                self.logger.update(
                    **{f"gp{i} num inputs": len(gp.train_targets)})
                if isinstance(gp, SparseGP):
                    self.logger.update(
                        **{f"gp{i} num inducing inputs": gp.xu.shape[0]})

        self.learn()

    super().end_episode()

def test_stack_list_of_lists():
    trajectory = [[1, 2, 3, 4], [20, 30, 40, 50], [3, 4, 5, 6], [40, 50, 60, 70]]
    stacked_trajectory = stack_list_of_tuples(trajectory)

    np.testing.assert_allclose(stacked_trajectory[0], np.array([1, 20, 3, 40]))
    np.testing.assert_allclose(stacked_trajectory[1], np.array([2, 30, 4, 50]))
    np.testing.assert_allclose(stacked_trajectory[2], np.array([3, 40, 5, 60]))
    np.testing.assert_allclose(stacked_trajectory[3], np.array([4, 50, 6, 70]))

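# A hedged sketch of what `stack_list_of_tuples` does for plain lists, matching
# the test above (the real implementation also handles Observation tuples and
# torch tensors): transpose the list of tuples and stack each field.
import numpy as np

def stack_list_of_lists_sketch(trajectory):
    """Stack the i-th entry of every tuple into the i-th output array."""
    return [np.stack(field) for field in zip(*trajectory)]

stacked = stack_list_of_lists_sketch([[1, 2], [20, 30], [3, 4]])
np.testing.assert_allclose(stacked[0], np.array([1, 20, 3]))
np.testing.assert_allclose(stacked[1], np.array([2, 30, 4]))
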
def test_update(self, trajectory, preserve_origin):
    transformer = StateNormalizer(preserve_origin)
    trajectory = stack_list_of_tuples(trajectory)
    mean = torch.mean(trajectory.state, 0)
    var = torch.var(trajectory.state, 0)

    transformer.update(trajectory)
    torch.testing.assert_allclose(transformer._normalizer.mean, mean)
    torch.testing.assert_allclose(transformer._normalizer.variance, var)

def test_inverse(self, trajectory, preserve_origin):
    transformer = ActionNormalizer(preserve_origin)
    trajectory = stack_list_of_tuples(trajectory)
    transformer.update(trajectory)

    observation = get_observation()
    obs = observation.clone()
    inverse_observation = transformer.inverse(transformer(observation))
    for x, y in zip(obs, inverse_observation):
        if x.shape == y.shape:
            torch.testing.assert_allclose(x, y)

def test_stack_list_of_observations():
    trajectory = get_trajectory()
    stacked_trajectory = stack_list_of_tuples(trajectory)
    stacked_trajectory = stacked_trajectory.to_torch()

    assert type(stacked_trajectory) is Observation
    assert stacked_trajectory.state.shape == (3, 4)
    assert stacked_trajectory.action.shape == (3, 2)
    assert stacked_trajectory.next_state.shape == (3, 4)
    assert stacked_trajectory.reward.shape == (3,)
    assert stacked_trajectory.done.shape == (3,)

    for val in stacked_trajectory:
        assert val.dtype is torch.get_default_dtype()

def _update_model_posterior(self, last_trajectory):
    """Update model posterior of GP-models with new data."""
    if isinstance(self.dynamical_model.base_model, ExactGPModel):
        observation = stack_list_of_tuples(last_trajectory)  # Parallelize.
        if observation.action.shape[-1] > self.dynamical_model.dim_action[0]:
            observation.action = observation.action[
                ..., :self.dynamical_model.dim_action[0]]
        for transform in self.train_set.transformations:
            observation = transform(observation)
        print(colorize("Add data to GP Model", "yellow"))
        self.dynamical_model.base_model.add_data(
            observation.state, observation.action, observation.next_state)

        print(colorize("Summarize GP Model", "yellow"))
        self.dynamical_model.base_model.summarize_gp()

def learn(self):
    """Train Policy Gradient Agent."""
    trajectories = [stack_list_of_tuples(t).clone() for t in self.trajectories]

    def closure():
        """Gradient calculation."""
        self.optimizer.zero_grad()
        losses = self.algorithm(trajectories)
        losses.combined_loss.backward()
        torch.nn.utils.clip_grad_norm_(
            self.algorithm.parameters(), self.clip_gradient_val)
        return losses

    self._learn_steps(closure)

def simulate_model(self):
    """Simulate the model.

    The simulation is initialized by concatenating samples from:
        - The empirical initial state distribution.
        - A learned or fixed initial state distribution.
        - The empirical state distribution.
    """
    # Samples from the empirical initial state distribution.
    initial_states = self.initial_states.sample_batch(
        self.sim_initial_states_num_trajectories)

    # Samples from the initial distribution.
    if self.sim_initial_dist_num_trajectories > 0:
        initial_states_ = self.initial_distribution.sample(
            (self.sim_initial_dist_num_trajectories,))
        initial_states = torch.cat((initial_states, initial_states_), dim=0)

    # Samples from the experience-replay empirical distribution.
    if self.sim_memory_num_trajectories > 0:
        obs, *_ = self.dataset.sample_batch(self.sim_memory_num_trajectories)
        for transform in self.dataset.transformations:
            obs = transform.inverse(obs)
        # obs holds n-step transitions; take the first state of each.
        initial_states_ = obs.state[:, 0, :]
        initial_states = torch.cat((initial_states, initial_states_), dim=0)

    initial_states = initial_states.unsqueeze(0)
    self.policy.reset()
    trajectory = rollout_model(
        dynamical_model=self.dynamical_model,
        reward_model=self.reward_model,
        policy=self.policy,
        initial_state=initial_states,
        max_steps=self.sim_num_steps,
        termination_model=self.termination_model,
    )
    self.sim_trajectory = stack_list_of_tuples(trajectory)
    states = self.sim_trajectory.state.reshape(
        -1, *self.dynamical_model.dim_state)
    self.sim_dataset.append(states[::self.sim_num_subsample])

def evaluate_action_sequence(self, action_sequence, state):
    """Evaluate action sequence by performing a rollout."""
    trajectory = stack_list_of_tuples(
        rollout_actions(
            self.dynamical_model,
            self.reward_model,
            self.action_scale * action_sequence,  # Scale actions.
            state,
            self.termination_model,
        ),
        dim=-2,
    )

    returns = discount_sum(trajectory.reward, self.gamma)
    if self.terminal_reward:
        terminal_reward = self.terminal_reward(trajectory.next_state[..., -1, :])
        returns = returns + self.gamma ** self.horizon * terminal_reward
    return returns

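# Minimal sketch of the return computation above, assuming `discount_sum`
# reduces the last (time) dimension as sum_t gamma^t * r_t:
import torch

def discount_sum_sketch(rewards, gamma):
    """Discounted sum over the last (time) dimension."""
    discounts = gamma ** torch.arange(rewards.shape[-1], dtype=rewards.dtype)
    return (rewards * discounts).sum(-1)

rewards = torch.tensor([1.0, 0.5, 2.0])
torch.testing.assert_allclose(
    discount_sum_sketch(rewards, 0.9), torch.tensor(3.07))
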
def test_call(self, trajectory, preserve_origin):
    transformer = StateNormalizer(preserve_origin)
    trajectory = stack_list_of_tuples(trajectory)
    transformer.update(trajectory)

    observation = get_observation()
    obs = observation.clone()
    transformed = transformer(observation)
    if preserve_origin:
        mean = 0
        scale = torch.sqrt(
            transformer._normalizer.variance + transformer._normalizer.mean ** 2)
    else:
        mean = transformer._normalizer.mean
        scale = torch.sqrt(transformer._normalizer.variance)
    torch.testing.assert_allclose(transformed.state, (obs.state - mean) / scale)
    torch.testing.assert_allclose(transformed.next_state, obs.next_state)
    torch.testing.assert_allclose(transformed.action, obs.action)
    torch.testing.assert_allclose(transformed.reward, obs.reward)
    assert transformed.done == obs.done

def test_correctness(self, gamma, value_function, entropy_reg):
    trajectory = [
        Observation(0, 0, reward=1, done=False, entropy=0.2).to_torch(),
        Observation(0, 0, reward=0.5, done=False, entropy=0.3).to_torch(),
        Observation(0, 0, reward=2, done=False, entropy=0.5).to_torch(),
        Observation(0, 0, reward=-0.2, done=False, entropy=-0.2).to_torch(),
    ]
    r0 = 1 + entropy_reg * 0.2
    r1 = 0.5 + entropy_reg * 0.3
    r2 = 2 + entropy_reg * 0.5
    r3 = -0.2 - entropy_reg * 0.2
    v = 0.01 if value_function is not None else 0

    reward = mc_return(
        stack_list_of_tuples(trajectory, -2),
        gamma,
        value_function=value_function,
        entropy_regularization=entropy_reg,
        reduction="min",
    )
    torch.testing.assert_allclose(
        reward,
        torch.tensor(
            [r0 + r1 * gamma + r2 * gamma ** 2 + r3 * gamma ** 3 + v * gamma ** 4]
        ),
    )

    assert (
        mc_return(
            Observation(state=0, reward=0).to_torch(),
            gamma,
            value_function,
            entropy_reg,
        )
        == 0
    )

def all_raw(self):
    """Get all the un-transformed data."""
    all_raw = stack_list_of_tuples(self.memory[self.valid_indexes])
    return all_raw

def plot_pendulum_trajectories(agent, environment, episode: int):
    """Plot GP inputs and trajectory in a Pendulum environment."""
    model = agent.dynamical_model.base_model
    trajectory = stack_list_of_tuples(agent.last_trajectory)
    sim_obs = agent.sim_trajectory

    for transformation in agent.dataset.transformations:
        trajectory = transformation(trajectory)
        sim_obs = transformation(sim_obs)

    if isinstance(model, ExactGPModel):
        fig, axes = plt.subplots(
            1 + model.dim_state[0] // 2, 2, sharex="col", sharey="row"
        )
    else:
        fig, axes = plt.subplots(1, 2, sharex="col", sharey="row")
        axes = axes[np.newaxis]
    fig.set_size_inches(5.5, 2.0)

    # Plot real trajectory.
    sin, cos = torch.sin(trajectory.state[:, 0]), torch.cos(trajectory.state[:, 0])
    axes[0, 0].scatter(
        torch.atan2(sin, cos) * 180 / np.pi,
        trajectory.state[:, 1],
        c=trajectory.action[:, 0],
        cmap="jet",
        vmin=-1,
        vmax=1,
    )
    axes[0, 0].set_title("Real Trajectory")

    # Plot simulated trajectory.
    sin = torch.sin(sim_obs.state[:, 0, 0, 0])
    cos = torch.cos(sim_obs.state[:, 0, 0, 0])
    axes[0, 1].scatter(
        torch.atan2(sin, cos) * 180 / np.pi,
        sim_obs.state[:, 0, 0, 1],
        c=sim_obs.action[:, 0, 0, 0],
        cmap="jet",
        vmin=-1,
        vmax=1,
    )
    axes[0, 1].set_title("Optimistic Trajectory")

    if isinstance(model, ExactGPModel):
        for i in range(model.dim_state[0]):
            inputs = model.gp[i].train_inputs[0]
            sin, cos = inputs[:, 1], inputs[:, 0]
            axes[1 + i // 2, i % 2].scatter(
                torch.atan2(sin, cos) * 180 / np.pi,
                inputs[:, 2],
                c=inputs[:, 3],
                cmap="jet",
                vmin=-1,
                vmax=1,
            )
            axes[1 + i // 2, i % 2].set_title(f"GP {i} data.")

            if hasattr(model.gp[i], "xu"):
                inducing_points = model.gp[i].xu
                sin, cos = inducing_points[:, 1], inducing_points[:, 0]
                axes[1 + i // 2, i % 2].scatter(
                    torch.atan2(sin, cos) * 180 / np.pi,
                    inducing_points[:, 2],
                    c=inducing_points[:, 3],
                    cmap="jet",
                    marker="*",
                    vmin=-1,
                    vmax=1,
                )

    for ax_row in axes:
        for ax in ax_row:
            ax.set_xlim([-180, 180])
            ax.set_ylim([-15, 15])

    for i in range(axes.shape[0]):
        axes[i, 0].set_ylabel("Angular Velocity [rad/s]")
    for j in range(axes.shape[1]):
        axes[-1, j].set_xlabel("Angle [degree]")

    # img_name = f"{agent.comment.title()}"
    if "optimistic" in agent.comment.lower():
        name = "H-UCRL"
    elif "expected" in agent.comment.lower():
        name = "Greedy"
    elif "thompson" in agent.comment.lower():
        name = "Thompson"
    else:
        raise NotImplementedError

    plt.suptitle(f"{name} Episode {episode + 1}", x=0.53, y=0.96)
    plt.tight_layout()
    plt.savefig(f"{agent.logger.log_dir}/{episode + 1}.pdf")
    if "DISPLAY" in os.environ:
        plt.show()
    plt.close(fig)

def _log_trajectory(self, trajectory):
    """Log the simulated trajectory."""
    observation = stack_list_of_tuples(
        trajectory, dim=trajectory[0].state.ndim - 1)
    self._log_observation(observation)

def mb_return(
    state,
    dynamical_model,
    reward_model,
    policy,
    num_steps=1,
    gamma=1.0,
    value_function=None,
    num_samples=1,
    entropy_reg=0.0,
    reward_transformer=RewardTransformer(),
    termination_model=None,
    reduction="none",
):
    r"""Estimate the value of a state by propagating it with a model for N steps.

    Rolls out the model for `num_steps` steps and sums up the rewards. After
    this, it bootstraps using the value function. With :math:`T` = `num_steps`:

    .. math::
        V(s) = \sum_{t=0}^{T} \gamma^t r(s_t, \pi(s_t)) + \gamma^{T+1} V(s_{T+1})

    Note that `num_steps=0` still means that the model is used to predict the
    next state.

    Parameters
    ----------
    state: torch.Tensor
        Initial state from which planning starts. It accepts a batch of
        initial states.
    dynamical_model: AbstractModel
        The model predicts a distribution over next states given states and
        actions.
    reward_model: AbstractReward
        The model predicts a distribution over rewards given states and
        actions.
    policy: AbstractPolicy
        The policy predicts a distribution over actions given the state.
    num_steps: int, optional. (default=1).
        Number of steps predicted with the model before (optionally)
        bootstrapping.
    gamma: float, optional. (default=1.0).
        Discount factor.
    value_function: AbstractValueFunction, optional. (default=None).
        The value function used for bootstrapping; takes states as input.
    num_samples: int, optional. (default=1).
        The states are repeated `num_samples` times in order to estimate the
        expected value by MC sampling of the policy, rewards, and dynamics
        (jointly).
    entropy_reg: float, optional. (default=0.0).
        Entropy regularization parameter.
    reward_transformer: RewardTransformer, optional.
        Transformation applied to the rewards before summing them.
    termination_model: AbstractModel, optional. (default=None).
        Callable that returns True if the transition yields a terminal state.
    reduction: str, optional. (default="none").
        Reduction applied to the Monte-Carlo return.

    Returns
    -------
    mb_value_return: MBValueReturn
        Named tuple with the MC estimate of the value and the stacked
        observation of the sampled trajectory that produced it.

    References
    ----------
    Lowrey, K., Rajeswaran, A., Kakade, S., Todorov, E., & Mordatch, I. (2018).
    Plan online, learn offline: Efficient learning and exploration via
    model-based control. ICLR.

    Sutton, R. S. (1991). Dyna, an integrated architecture for learning,
    planning, and reacting. ACM.

    Silver, D., Sutton, R. S., & Müller, M. (2008). Sample-based learning and
    search with permanent and transient memories. ICML.
    """
    # Repeat states to get a better estimate of the expected value.
    state = repeat_along_dimension(state, number=num_samples, dim=0)
    trajectory = rollout_model(
        dynamical_model=dynamical_model,
        reward_model=reward_model,
        policy=policy,
        initial_state=state,
        max_steps=num_steps,
        termination_model=termination_model,
    )
    observation = stack_list_of_tuples(trajectory, dim=state.ndim - 1)
    value = mc_return(
        observation=observation,
        gamma=gamma,
        value_function=value_function,
        entropy_regularization=entropy_reg,
        reward_transformer=reward_transformer,
        reduction=reduction,
    )
    return MBValueReturn(value, observation)
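
# A worked numeric check of the bootstrapped return in the docstring above,
# using rewards [1.0, 0.5, 2.0], gamma = 0.9, and a terminal value of 0.01:
gamma, terminal_value = 0.9, 0.01
rewards = [1.0, 0.5, 2.0]
value = sum(gamma ** t * r for t, r in enumerate(rewards))
value += gamma ** len(rewards) * terminal_value  # gamma^{T+1} V(s_{T+1}).
assert abs(value - 3.07729) < 1e-6  # 1 + 0.45 + 1.62 + 0.00729.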