def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=256,
        reward_multiplier=1,
        lr=5e-3,
    )
    policy = FNNPolicy(reference_env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    dr = create_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(fmt="dict", dtype="numpy")
    reference_sampler = ParallelRolloutSampler(reference_env, policy, num_workers=1, min_steps=1000)
    random_sampler = ParallelRolloutSampler(random_env, policy, num_workers=1, min_steps=1000)

    losses = []
    for i in range(200):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
def push(self, ros: Union[list, StepSequence], truncate_last: bool = True):
    """
    Save a sequence of steps and drop old steps if the capacity is exceeded.

    :param ros: list of rollouts or one concatenated rollout
    :param truncate_last: remove the last step from each rollout, forwarded to `StepSequence.concat`
    """
    if isinstance(ros, list):
        # Concatenate given rollouts if necessary
        ros = StepSequence.concat(ros)
    elif isinstance(ros, StepSequence):
        pass
    else:
        raise pyrado.TypeErr(given=ros, expected_type=[list, StepSequence])

    # Add new steps
    if self.isempty:
        self._memory = deepcopy(ros)  # on the very first call
    else:
        self._memory = StepSequence.concat([self._memory, ros], truncate_last=truncate_last)

    num_surplus = self._memory.length - self.capacity
    if num_surplus > 0:
        # Drop surplus of old steps
        self._memory = self._memory[num_surplus:]
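A minimal usage sketch of the ring-buffer behavior documented above. It is not part of the library's test suite; the import paths and the rollout construction (one more observation than rewards/actions per rollout) are assumptions based on the surrounding snippets.

# Hypothetical usage sketch of ReplayMemory.push; not taken from the library.
import numpy as np

from pyrado.algorithms.utils import ReplayMemory  # assumed import path
from pyrado.sampling.step_sequence import StepSequence  # assumed import path

# Two fake rollouts of length 5 (observations carry the final state, hence 6 entries)
ro_a = StepSequence(rewards=np.random.randn(5), observations=np.random.randn(6), actions=np.random.randn(5))
ro_b = StepSequence(rewards=np.random.randn(5), observations=np.random.randn(6), actions=np.random.randn(5))

rm = ReplayMemory(7)
rm.push(ro_a)  # memory holds ro_a completely (5 <= 7 steps)
rm.push(ro_b)  # capacity exceeded, so the oldest steps of ro_a are dropped
assert len(rm) <= 7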
def test_concat(data_format):
    # Create some rollouts with random rewards
    ros = [
        StepSequence(
            rewards=np.random.randn(5),
            observations=np.random.randn(6),
            actions=np.random.randn(5),
            policy_infos={'mean': np.random.randn(5)},
            hidden=(np.random.randn(5), np.random.randn(5)),
            data_format=data_format,
        ),
        StepSequence(
            rewards=np.random.randn(5),
            observations=np.random.randn(6),
            actions=np.random.randn(5),
            policy_infos={'mean': np.random.randn(5)},
            hidden=(np.random.randn(5), np.random.randn(5)),
            data_format=data_format,
        ),
    ]

    # Perform concatenation
    cat = StepSequence.concat(ros)

    assert cat.continuous
    assert cat.rollout_count == 2

    # Check steps
    for step_ro, step_cat in zip(itertools.chain.from_iterable(ros), cat):
        assert step_ro.reward == step_cat.reward
        assert step_ro.observation == step_cat.observation
        assert step_ro.done == step_cat.done
def test_replay_memory(capacity):
    rm = ReplayMemory(capacity)

    # Create fake rollouts (of length 5)
    ro1 = StepSequence(rewards=rewards, observations=observations, actions=actions, hidden=hidden)
    ro2 = StepSequence(rewards=rewards, observations=observations, actions=actions, hidden=hidden)
    # Concatenate them for testing only
    ros = StepSequence.concat([ro1, ro2], truncate_last=True)  # same truncate_last behavior as the push function

    # Check the lengths
    rm.push(ro1)
    assert len(rm) == len(ro1) or len(rm) == capacity
    rm.push(ro2)
    assert len(rm) == len(ro1) + len(ro2) or len(rm) == capacity

    # Check the elements
    shift = len(ros) - capacity
    if shift < len(ro1):
        assert all(rm.memory.observations[0] == ros.observations[shift])
        assert all(rm.memory.observations[-1] == ro2.observations[-2])  # -2 since one was truncated
def update(self, *args: Any, **kwargs: Any):
    """Update the policy's (and value functions') parameters based on the collected rollout data."""
    obss = []
    losses = []
    for t in range(self.num_teachers):
        concat_ros = StepSequence.concat(kwargs["rollouts"][t])
        concat_ros.torch(data_type=to.get_default_dtype())
        obss.append(concat_ros.get_data_values("observations")[: self.min_steps])

    # Train student
    for epoch in range(self.num_epochs):
        self.optimizer.zero_grad()
        loss = 0
        for t_idx, teacher in enumerate(self.teacher_policies):
            s_dist = self.expl_strat.action_dist_at(self.policy(obss[t_idx]))
            s_act = s_dist.sample()
            t_dist = self.teacher_expl_strats[t_idx].action_dist_at(teacher(obss[t_idx]))
            l = self.teacher_weights[t_idx] * self.criterion(t_dist.log_prob(s_act), s_dist.log_prob(s_act))
            loss += l
            losses.append([t_idx, l.item()])

        print(f"Epoch {epoch} Loss: {loss.item()}")
        loss.backward()
        self.optimizer.step()
def test_basic_policy_evaluate_packed_padded_sequences(env: Env, policy: RecurrentPolicy):
    # Test packed padded sequence implementation against old implementation

    def old_evaluate(rollout: StepSequence, hidden_states_name: str = "hidden_states") -> to.Tensor:
        # Set policy, i.e. PyTorch nn.Module, to evaluation mode
        policy.eval()

        # The passed sample collection might contain multiple rollouts.
        act_list = []
        for ro in rollout.iterate_rollouts():
            if hidden_states_name in rollout.data_names:
                # Get initial hidden state from first step
                hidden = policy._unpack_hidden(ro[0][hidden_states_name])
            else:
                # Let the network pick the default hidden state
                hidden = None

            # Reshape observations to match PyTorch's RNN sequence protocol
            obs = ro.get_data_values("observations", True).unsqueeze(1)
            obs = obs.to(device=policy.device, dtype=to.get_default_dtype())

            # Pass the input through hidden RNN layers
            out, _ = policy.rnn_layers(obs, hidden)

            # And through the output layer
            act = policy.output_layer(out.squeeze(1))
            if policy.output_nonlin is not None:
                act = policy.output_nonlin(act)

            # Collect the actions
            act_list.append(act)

        # Set policy, i.e. PyTorch nn.Module, back to training mode
        policy.train()

        return to.cat(act_list)

    # Get some rollouts
    ros = []
    for i in range(5):
        ro = rollout(env, policy, eval=True, render_mode=RenderMode())
        ro.torch(to.get_default_dtype())
        ros.append(ro)

    # Perform concatenation
    cat = StepSequence.concat(ros)

    # Evaluate old and new approaches
    act_old = old_evaluate(cat)
    act_new = policy.evaluate(cat)

    to.testing.assert_allclose(act_old, act_new)
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=100,
        reward_multiplier=1,
        logger=None,
    )
    policy = FNNPolicy(reference_env.spec, hidden_sizes=[32], hidden_nonlin=to.tanh)
    dr = get_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(format='dict', dtype='numpy')
    reference_sampler = ParallelSampler(reference_env, policy, num_envs=4, min_steps=10000)
    random_sampler = ParallelSampler(random_env, policy, num_envs=4, min_steps=10000)

    losses = []
    for i in range(50):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
def update(self, rollouts: Sequence[StepSequence]): r""" Train the particles $mu$. :param rollouts: rewards collected from the rollout """ policy_grads = [] parameters = [] for i in range(self.num_particles): # Get the rollouts associated to the i-th particle concat_ros = StepSequence.concat(rollouts[i]) concat_ros.torch() act_stats = compute_action_statistics(concat_ros, self.expl_strats[i]) act_stats_fixed = compute_action_statistics( concat_ros, self.fixed_expl_strats[i]) klds = to.distributions.kl_divergence(act_stats.act_distr, act_stats_fixed.act_distr) entropy = act_stats.act_distr.entropy() log_prob = act_stats.log_probs concat_ros.rewards = concat_ros.rewards - ( 0.1 * klds.mean(1)).view(-1) - 0.1 * entropy.mean(1).view(-1) # Update the advantage estimator's parameters and return advantage estimates adv = self.particles[i].critic.update(rollouts[i], use_empirical_returns=True) # Estimate policy gradients self.optimizers[i].zero_grad() policy_grad = -to.mean(log_prob * adv.detach()) policy_grad.backward() # step comes later than usual # Collect flattened parameter and gradient vectors policy_grads.append(self.expl_strats[i].param_grad) parameters.append(self.expl_strats[i].param_values) parameters = to.stack(parameters) policy_grads = to.stack(policy_grads) Kxx, dx_Kxx = self.kernel(parameters) grad_theta = (to.mm(Kxx, policy_grads / self.temperature) + dx_Kxx) / self.num_particles for i in range(self.num_particles): self.expl_strats[i].param_grad = grad_theta[i] self.optimizers[i].step() self.updatecount += 1
def test_potential_policy_evaluate_packed_padded_sequences(env: Env, policy: RecurrentPolicy):
    # Test packed padded sequence implementation for custom recurrent neural networks

    # Get some rollouts
    ros = []
    for i in range(5):
        ro = rollout(env, policy, eval=True, render_mode=RenderMode())
        ro.torch(to.get_default_dtype())
        ros.append(ro)

    # Perform concatenation
    cat = StepSequence.concat(ros)

    # Evaluate the new approach
    act_new = policy.evaluate(cat)
    assert act_new is not None
def test_twoheaded_policy_evaluate_packed_padded_sequences(env: Env, policy: RecurrentPolicy):
    # Test packed padded sequence implementation for custom recurrent neural networks

    def old_evaluate(rollout: StepSequence, hidden_states_name: str = "hidden_states") -> to.Tensor:
        # Set policy, i.e. PyTorch nn.Module, to evaluation mode
        policy.eval()

        act_list = []
        head2_list = []
        for ro in rollout.iterate_rollouts():
            if hidden_states_name in rollout.data_names:
                # Get initial hidden state from first step
                hidden = ro[0][hidden_states_name]
            else:
                # Let the network pick the default hidden state
                hidden = None

            # Run steps consecutively reusing the hidden state
            for step in ro:
                act, head2, hidden = policy(step.observation, hidden)
                act_list.append(act)
                head2_list.append(head2)

        # Set policy, i.e. PyTorch nn.Module, back to training mode
        policy.train()

        return to.stack(act_list), to.stack(head2_list)

    # Get some rollouts
    ros = []
    for i in range(5):
        ro = rollout(env, policy, eval=True, render_mode=RenderMode())
        ro.torch(to.get_default_dtype())
        ros.append(ro)

    # Perform concatenation
    cat = StepSequence.concat(ros)

    # Evaluate old and new approaches
    output_1_old, output_2_old = old_evaluate(cat)
    output_1_new, output_2_new = policy.evaluate(cat)

    to.testing.assert_allclose(output_1_old, output_1_new)
    to.testing.assert_allclose(output_2_old, output_2_new)
def test_concat_rollouts(env, expl_strat):
    ro1 = rollout(env, expl_strat)
    ro2 = rollout(env, expl_strat)
    ro_cat = StepSequence.concat([ro1, ro2])
    assert isinstance(ro_cat, StepSequence)
    assert ro_cat.length == ro1.length + ro2.length
def plot_rollouts_segment_wise(
    plot_type: str,
    segments_ground_truth: List[List[StepSequence]],
    segments_multiple_envs: List[List[List[StepSequence]]],
    segments_nominal: List[List[StepSequence]],
    use_rec_str: bool,
    idx_iter: int,
    idx_round: int,
    state_labels: Optional[Iterable[str]] = None,
    act_labels: Optional[Iterable[str]] = None,
    x_limits: Optional[Tuple[int]] = None,
    plot_act: bool = False,
    data_field: str = "states",
    cmap_samples: Optional[colors.Colormap] = None,
    save_dir: Optional[pyrado.PathLike] = None,
    file_format: Iterable[str] = ("pdf", "pgf", "png"),
) -> List[plt.Figure]:
    r"""
    Plot the different rollouts in separate figures and the different state dimensions along the columns.

    :param plot_type: type of plot, pass "samples" to plot the rollouts of the most likely domain parameters as
                      individual lines, or pass "confidence" to plot the most likely one, and the mean $\pm$ 1 std
    :param segments_ground_truth: list of lists containing rollout segments from the ground truth environment
    :param segments_multiple_envs: list of lists of lists containing rollout segments from different environment
                                   instances, e.g. samples from a posterior coming from `NDPR`
    :param segments_nominal: list of lists containing rollout segments from the nominal environment
    :param use_rec_str: `True` if pre-recorded actions have been used to generate the rollouts
    :param idx_iter: selected iteration
    :param idx_round: selected round
    :param state_labels: y-axes labels for the state trajectories, no label by default
    :param act_labels: y-axes labels for the action trajectories, no label by default
    :param x_limits: tuple containing the lower and upper limits for the x-axis
    :param plot_act: if `True`, also plot the actions
    :param data_field: data field of the rollout, e.g. "states" or "observations"
    :param cmap_samples: color map for the trajectories resulting from different domain parameter samples
    :param save_dir: if not `None` create a subfolder plots in `save_dir` and save the plots in there
    :param file_format: select the file format to store the plots
    :return: list of handles to the created figures
    """
    if plot_type not in ["samples", "confidence"]:
        raise pyrado.ValueErr(given=plot_type, eq_constraint="samples or confidence")
    if data_field not in ["states", "observations"]:
        raise pyrado.ValueErr(given=data_field, eq_constraint="states or observations")

    # Extract the state dimension, and the number of most likely samples from the data
    dim_state = segments_ground_truth[0][0].get_data_values(data_field)[0, :].size
    dim_act = segments_ground_truth[0][0].get_data_values("actions")[0, :].size
    num_samples = len(segments_multiple_envs[0][0])

    # Validate the labels
    if state_labels is None:
        state_labels = [""] * dim_state
    else:
        if len(state_labels) != dim_state:
            raise pyrado.ShapeErr(given=state_labels, expected_match=(dim_state,))
    if act_labels is None:
        act_labels = [""] * dim_act
    else:
        if len(act_labels) != dim_act:
            raise pyrado.ShapeErr(given=act_labels, expected_match=(dim_act,))

    if cmap_samples is None:
        cmap_samples = plt.get_cmap("Reds")(np.linspace(0.6, 0.8, num_samples))

    fig_list = []
    label_samples = "ml" if plot_type == "confidence" else "samples"

    # Plot
    for idx_r in range(len(segments_ground_truth)):
        num_rows = dim_state + dim_act if plot_act else dim_state
        fig, axs = plt.subplots(nrows=num_rows, figsize=(16, 9), tight_layout=True, sharex="col")
        axs = np.atleast_1d(axs)

        # Plot the states
        for idx_state in range(dim_state):
            # Plot the real segments
            cnt_step = [0]
            for segment_gt in segments_ground_truth[idx_r]:
                axs[idx_state].plot(
                    np.arange(cnt_step[-1], cnt_step[-1] + segment_gt.length),
                    segment_gt.get_data_values(data_field, truncate_last=True)[:, idx_state],
                    zorder=0,
                    c="black",
                    lw=1.0,
                    label="target" if cnt_step[-1] == 0 else "",  # print once
                )
                cnt_step.append(cnt_step[-1] + segment_gt.length)

            # Plot the maximum likely simulated segments
            for idx_seg, sml in enumerate(segments_multiple_envs[idx_r]):
                for idx_dp, sdp in enumerate(sml):
                    axs[idx_state].plot(
                        np.arange(cnt_step[idx_seg], cnt_step[idx_seg] + sdp.length),
                        sdp.get_data_values(data_field, truncate_last=True)[:, idx_state],
                        zorder=2 if idx_dp == 0 else 0,  # most likely on top
                        c=cmap_samples[idx_dp],
                        ls="-",
                        lw=1.5 if plot_type == "confidence" or idx_dp == 0 else 0.5,
                        alpha=1.0 if plot_type == "confidence" or idx_dp == 0 else 0.1,
                        label=label_samples if cnt_step[idx_seg] == idx_seg == idx_dp == 0 else "",  # print once
                    )

                    if plot_type != "samples":
                        # Stop here, unless the rollouts of all domain parameters should be plotted
                        break

            if plot_type == "confidence":
                assert check_all_lengths_equal(segments_multiple_envs[idx_r][0])  # all segments need to be equally long
                states_all = []
                for idx_dp in range(num_samples):
                    # Reconstruct the step sequences for all domain parameters
                    ss_dp = StepSequence.concat(
                        [seg_dp[idx_dp] for seg_dp in [segs_ro for segs_ro in segments_multiple_envs[idx_r]]]
                    )
                    states = ss_dp.get_data_values(data_field, truncate_last=True)
                    states_all.append(states)
                states_all = np.stack(states_all, axis=0)
                states_mean = np.mean(states_all, axis=0)
                states_std = np.std(states_all, axis=0)

                for idx_seg in range(len(segments_multiple_envs[idx_r])):
                    len_segs = min([len(seg) for seg in segments_multiple_envs[idx_r][idx_seg]])  # use shortest
                    m_i = states_mean[cnt_step[idx_seg]:cnt_step[idx_seg] + len_segs, idx_state]
                    s_i = states_std[cnt_step[idx_seg]:cnt_step[idx_seg] + len_segs, idx_state]
                    draw_curve(
                        "std",
                        axs[idx_state],
                        pd.DataFrame(dict(mean=m_i, std=s_i)),
                        x_grid=np.arange(cnt_step[idx_seg], cnt_step[idx_seg] + len_segs),
                        show_legend=False,
                        area_label="2 std" if idx_seg == 0 else None,
                        plot_kwargs=dict(color=cmap_samples[0]),
                    )

            # Plot the nominal simulation's segments
            for idx_seg, sn in enumerate(segments_nominal[idx_r]):
                axs[idx_state].plot(
                    np.arange(cnt_step[idx_seg], cnt_step[idx_seg] + sn.length),
                    sn.get_data_values(data_field, truncate_last=True)[:, idx_state],
                    zorder=2,
                    c="green",  # former: "steelblue"
                    ls="--",
                    lw=1.0,
                    label="nom sim" if cnt_step[idx_seg] == 0 else "",  # print once
                )

            axs[idx_state].set_ylabel(state_labels[idx_state])

        if plot_act:
            # Plot the actions
            for idx_act in range(dim_act):
                # Plot the real segments
                cnt_step = [0]
                for segment_gt in segments_ground_truth[idx_r]:
                    axs[dim_state + idx_act].plot(
                        np.arange(cnt_step[-1], cnt_step[-1] + segment_gt.length),
                        segment_gt.get_data_values("actions", truncate_last=False)[:, idx_act],
                        zorder=0,
                        c="black",
                        label="target" if cnt_step[-1] == 0 else "",  # print once
                    )
                    cnt_step.append(cnt_step[-1] + segment_gt.length)

                # Plot the maximum likely simulated segments
                for idx_seg, sml in enumerate(segments_multiple_envs[idx_r]):
                    for idx_dp, sdp in enumerate(sml):
                        axs[dim_state + idx_act].plot(
                            np.arange(cnt_step[idx_seg], cnt_step[idx_seg] + sdp.length),
                            sdp.get_data_values("actions", truncate_last=False)[:, idx_act],
                            zorder=2 if idx_dp == 0 else 0,  # most likely on top
                            c=cmap_samples[idx_dp],
                            ls="--",
                            lw=1.5 if plot_type == "confidence" or idx_dp == 0 else 0.5,
                            alpha=1.0 if plot_type == "confidence" or idx_dp == 0 else 0.4,
                            label=label_samples if cnt_step[idx_seg] == idx_seg == idx_dp == 0 else "",  # print once
                        )

                        if plot_type != "samples":
                            # Stop here, unless the rollouts of all domain parameters should be plotted
                            break

                if plot_type == "confidence":
                    len_segs = len(segments_multiple_envs[idx_r][0][0])
                    assert check_all_lengths_equal(segments_multiple_envs[idx_r][0])  # all segments need to be equally long
                    acts_all = []
                    for idx_dp in range(num_samples):
                        # Reconstruct the step sequences for all domain parameters
                        ss_dp = StepSequence.concat(
                            [seg_dp[idx_dp] for seg_dp in [segs_ro for segs_ro in segments_multiple_envs[idx_r]]]
                        )
                        acts = ss_dp.get_data_values("actions", truncate_last=False)
                        acts_all.append(acts)
                    acts_all = np.stack(acts_all, axis=0)
                    acts_mean = np.mean(acts_all, axis=0)
                    acts_std = np.std(acts_all, axis=0)

                    for idx_seg in range(len(segments_multiple_envs[idx_r])):
                        m_i = acts_mean[cnt_step[idx_seg]:cnt_step[idx_seg] + len_segs, idx_act]
                        s_i = acts_std[cnt_step[idx_seg]:cnt_step[idx_seg] + len_segs, idx_act]
                        draw_curve(
                            "std",
                            axs[dim_state + idx_act],
                            pd.DataFrame(dict(mean=m_i, std=s_i)),
                            x_grid=np.arange(cnt_step[idx_seg], cnt_step[idx_seg] + len_segs),
                            show_legend=False,
                            area_label="2 std" if idx_seg == 0 else None,
                            plot_kwargs=dict(color=cmap_samples[0]),
                        )

                # Plot the nominal simulation's segments
                for idx_seg, sn in enumerate(segments_nominal[idx_r]):
                    axs[dim_state + idx_act].plot(
                        np.arange(cnt_step[idx_seg], cnt_step[idx_seg] + sn.length),
                        sn.get_data_values("actions", truncate_last=False)[:, idx_act],
                        zorder=2,
                        c="steelblue",
                        ls="-.",
                        label="nom sim" if cnt_step[idx_seg] == 0 else "",  # print once
                    )

                axs[dim_state + idx_act].set_ylabel(act_labels[idx_act])

        # Settings for all subplots
        for idx_axs in range(num_rows):
            if x_limits is not None:
                axs[idx_axs].set_xlim(x_limits[0], x_limits[1])

        # Set the window title and the legend, placing the latter above the plot and expanding it fully
        use_rec_str = ", using rec actions" if use_rec_str else ""
        round_str = f"round {idx_round}, " if idx_round != -1 else ""
        fig.canvas.manager.set_window_title(
            f"Target Domain and Simulated Rollouts (iteration {idx_iter}, {round_str}rollout {idx_r}{use_rec_str})"
        )
        lg = axs[0].legend(
            ncol=2 + num_samples,
            bbox_to_anchor=(0.0, 1.02, 1.0, 0.102),
            loc="lower left",
            mode="expand",
            borderaxespad=0.0,
        )

        # Save if desired
        if save_dir is not None:
            for fmt in file_format:
                os.makedirs(os.path.join(save_dir, "plots"), exist_ok=True)
                len_seg_str = f"seglen_{segments_ground_truth[0][0].length}"
                use_rec_str = "_use_rec" if use_rec_str else ""
                round_str = f"_round_{idx_round}" if idx_round != -1 else ""
                fig.savefig(
                    os.path.join(
                        save_dir,
                        "plots",
                        f"posterior_iter_{idx_iter}{round_str}_rollout_{idx_r}_{len_seg_str}{use_rec_str}.{fmt}",
                    ),
                    bbox_extra_artists=(lg,),
                    dpi=150,
                )

        # Append current figure
        fig_list.append(fig)

    return fig_list
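A minimal call sketch for the function above, since the nesting of the segment lists is easy to get wrong. The variables `segments_gt`, `segments_envs`, and `segments_nom` are placeholders assumed to follow the nesting described in the docstring; this call is not taken from the repository's scripts.

# Hypothetical call sketch of plot_rollouts_segment_wise; all segment lists are placeholders.
figs = plot_rollouts_segment_wise(
    plot_type="confidence",                # or "samples" to draw every domain-parameter sample as its own line
    segments_ground_truth=segments_gt,     # List[List[StepSequence]]: real rollout -> segments
    segments_multiple_envs=segments_envs,  # List[List[List[StepSequence]]]: rollout -> segment -> domain-parameter sample
    segments_nominal=segments_nom,         # List[List[StepSequence]]: rollout -> segments
    use_rec_str=False,
    idx_iter=0,
    idx_round=-1,                          # -1 omits the round from the window title and file names
    plot_act=True,
    save_dir=None,                         # pass a directory to also write pdf/pgf/png files
)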
def update(self, rollouts: Sequence[StepSequence], use_empirical_returns: bool = False):
    """
    Adapt the parameters of the advantage function estimator, minimizing the MSE loss for the given samples.

    :param rollouts: batch of rollouts
    :param use_empirical_returns: use the returns from the rollouts (True) or the ones from the V-fcn (False)
    :return adv: tensor of advantages after the V-function updates
    """
    # Turn the batch of rollouts into a list of steps
    concat_ros = StepSequence.concat(rollouts)
    concat_ros.torch(data_type=to.get_default_dtype())

    if use_empirical_returns:
        # Compute the value targets (empirical discounted returns) for all samples
        v_targ = discounted_values(rollouts, self.gamma).view(-1, 1)
    else:
        # Use the value function to compute the value targets (also called bootstrapping)
        v_targ = self.tdlamda_returns(concat_ros=concat_ros)
    concat_ros.add_data('v_targ', v_targ)

    # Logging
    with to.no_grad():
        v_pred_old = self.values(concat_ros)
        loss_old = self.loss_fcn(v_pred_old, v_targ)
    vfcn_grad_norm = []

    # Iterate over all gathered samples num_epoch times
    for e in range(self.num_epoch):
        for batch in tqdm(
            concat_ros.split_shuffled_batches(
                self.batch_size, complete_rollouts=isinstance(self.vfcn, RecurrentPolicy)
            ),
            total=num_iter_from_rollouts(None, concat_ros, self.batch_size),
            desc=f'Epoch {e}',
            unit='batches',
            file=sys.stdout,
            leave=False,
        ):
            # Reset the gradients
            self.optim.zero_grad()

            # Make predictions for this mini-batch using the value function
            v_pred = self.values(batch)

            # Compute the estimator loss for this mini-batch and backpropagate
            vfcn_loss = self.loss_fcn(v_pred, batch.v_targ)
            vfcn_loss.backward()

            # Clip the gradients if desired
            vfcn_grad_norm.append(Algorithm.clip_grad(self.vfcn, self.max_grad_norm))

            # Call optimizer
            self.optim.step()

        # Update the learning rate if a scheduler has been specified
        if self._lr_scheduler is not None:
            self._lr_scheduler.step()

    # Estimate the advantage after fitting the parameters of the V-fcn
    adv = self.gae(concat_ros)  # is done with to.no_grad()

    with to.no_grad():
        v_pred_new = self.values(concat_ros)
        loss_new = self.loss_fcn(v_pred_new, v_targ)
        vfcn_loss_impr = loss_old - loss_new  # positive values are desired
        explvar = explained_var(v_pred_new, v_targ)  # values close to 1 are desired

    # Log metrics computed from the old value function (before the update)
    self.logger.add_value('explained var critic', explvar, 4)
    self.logger.add_value('loss improv critic', vfcn_loss_impr, 4)
    self.logger.add_value('avg grad norm critic', np.mean(vfcn_grad_norm), 4)
    if self._lr_scheduler is not None:
        self.logger.add_value('lr critic', self._lr_scheduler.get_last_lr(), 6)

    return adv
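A hedged sketch of the two target modes the docstring above distinguishes; `critic` and `rollouts` are placeholders for an instance of this advantage estimator and a sequence of `StepSequence` rollouts.

# Hypothetical usage of the critic update above; `critic` and `rollouts` are placeholders.
adv_boot = critic.update(rollouts, use_empirical_returns=False)  # value targets from TD(lambda) bootstrapping
adv_emp = critic.update(rollouts, use_empirical_returns=True)    # value targets from empirical discounted returns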
def update(self, rollouts: Sequence[StepSequence]):
    # Turn the batch of rollouts into a list of steps
    concat_ros = StepSequence.concat(rollouts)
    concat_ros.torch(data_type=to.get_default_dtype())

    with to.no_grad():
        # Compute the action probabilities using the old (before the update) policy
        act_stats = compute_action_statistics(concat_ros, self._expl_strat)
        log_probs_old = act_stats.log_probs
        act_distr_old = act_stats.act_distr

        # Compute the value predictions using the old (before the update) value function
        v_pred_old = self._critic.values(concat_ros)

    # Attach advantages and old log probs to rollout
    concat_ros.add_data('log_probs_old', log_probs_old)
    concat_ros.add_data('v_pred_old', v_pred_old)

    # For logging the gradient norms
    policy_grad_norm = []
    value_fcn_grad_norm = []

    # Compute the value targets (empirical discounted returns) for all samples before fitting the V-fcn parameters
    adv = self._critic.gae(concat_ros)  # done with to.no_grad()
    v_targ = discounted_values(rollouts, self._critic.gamma).view(-1, 1)  # empirical discounted returns
    concat_ros.add_data('adv', adv)
    concat_ros.add_data('v_targ', v_targ)

    # Iterations over the whole data set
    for e in range(self.num_epoch):
        for batch in tqdm(
            concat_ros.split_shuffled_batches(
                self.batch_size,
                complete_rollouts=self._policy.is_recurrent or isinstance(self._critic.value_fcn, RecurrentPolicy),
            ),
            total=num_iter_from_rollouts(None, concat_ros, self.batch_size),
            desc=f'Epoch {e}',
            unit='batches',
            file=sys.stdout,
            leave=False,
        ):
            # Reset the gradients
            self.optim.zero_grad()

            # Compute log of the action probabilities for the mini-batch
            log_probs = compute_action_statistics(batch, self._expl_strat).log_probs.to(self.policy.device)

            # Compute value predictions for the mini-batch
            v_pred = self._critic.values(batch)

            # Compute combined loss and backpropagate
            loss = self.loss_fcn(log_probs, batch.log_probs_old, batch.adv, v_pred, batch.v_pred_old, batch.v_targ)
            loss.backward()

            # Clip the gradients if desired
            policy_grad_norm.append(self.clip_grad(self._expl_strat.policy, self.max_grad_norm))
            value_fcn_grad_norm.append(self.clip_grad(self._critic.value_fcn, self.max_grad_norm))

            # Call optimizer
            self.optim.step()

            if to.isnan(self._expl_strat.noise.std).any():
                raise RuntimeError(
                    f'At least one exploration parameter became NaN! The exploration parameters are'
                    f'\n{self._expl_strat.std.detach().numpy()}'
                )

        # Update the learning rate if a scheduler has been specified
        if self._lr_scheduler is not None:
            self._lr_scheduler.step()

    # Additional logging
    if self.log_loss:
        with to.no_grad():
            # Compute value predictions using the new (after the updates) value function approximator
            v_pred = self._critic.values(concat_ros).to(self.policy.device)
            v_loss_old = self._critic.loss_fcn(
                v_pred_old.to(self.policy.device), v_targ.to(self.policy.device)
            ).to(self.policy.device)
            v_loss_new = self._critic.loss_fcn(v_pred, v_targ).to(self.policy.device)
            value_fcn_loss_impr = v_loss_old - v_loss_new  # positive values are desired

            # Compute the action probabilities using the new (after the updates) policy
            act_stats = compute_action_statistics(concat_ros, self._expl_strat)
            log_probs_new = act_stats.log_probs
            act_distr_new = act_stats.act_distr
            loss_after = self.loss_fcn(log_probs_new, log_probs_old, adv, v_pred, v_pred_old, v_targ)
            kl_avg = to.mean(kl_divergence(act_distr_old, act_distr_new))  # mean seeking a.k.a. inclusive KL

            # Compute explained variance (after the updates)
            explvar = explained_var(v_pred, v_targ)
            self.logger.add_value('explained var', explvar.detach().numpy())
            self.logger.add_value('V-fcn loss improvement', value_fcn_loss_impr.detach().numpy())
            self.logger.add_value('loss after', loss_after.detach().numpy())
            self.logger.add_value('KL(old_new)', kl_avg.item())

    # Logging
    self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.noise.std.data).detach().numpy())
    self.logger.add_value('expl strat entropy', self._expl_strat.noise.get_entropy().item())
    self.logger.add_value('avg policy grad norm', np.mean(policy_grad_norm))
    self.logger.add_value('avg V-fcn grad norm', np.mean(value_fcn_grad_norm))
    if self._lr_scheduler is not None:
        self.logger.add_value('learning rate', self._lr_scheduler.get_lr())
def update(self, rollouts: Sequence[StepSequence]):
    # Turn the batch of rollouts into a list of steps
    concat_ros = StepSequence.concat(rollouts)
    concat_ros.torch(data_type=to.get_default_dtype())

    # Compute the value targets (empirical discounted returns) for all samples before fitting the V-fcn parameters
    adv = self._critic.gae(concat_ros)  # done with to.no_grad()
    v_targ = discounted_values(rollouts, self._critic.gamma).view(-1, 1).to(self.policy.device)  # empirical discounted returns

    with to.no_grad():
        # Compute value predictions and the GAE using the old (before the updates) value function approximator
        v_pred = self._critic.values(concat_ros)

        # Compute the action probabilities using the old (before update) policy
        act_stats = compute_action_statistics(concat_ros, self._expl_strat)
        log_probs_old = act_stats.log_probs
        act_distr_old = act_stats.act_distr
        loss_before = self.loss_fcn(log_probs_old, adv, v_pred, v_targ)
        self.logger.add_value('loss before', loss_before, 4)

    concat_ros.add_data('adv', adv)
    concat_ros.add_data('v_targ', v_targ)

    # For logging the gradients' norms
    policy_grad_norm = []

    for batch in tqdm(
        concat_ros.split_shuffled_batches(
            self.batch_size,
            complete_rollouts=self._policy.is_recurrent or isinstance(self._critic.vfcn, RecurrentPolicy),
        ),
        total=num_iter_from_rollouts(None, concat_ros, self.batch_size),
        desc='Updating',
        unit='batches',
        file=sys.stdout,
        leave=False,
    ):
        # Reset the gradients
        self.optim.zero_grad()

        # Compute log of the action probabilities for the mini-batch
        log_probs = compute_action_statistics(batch, self._expl_strat).log_probs

        # Compute value predictions for the mini-batch
        v_pred = self._critic.values(batch)

        # Compute combined loss and backpropagate
        loss = self.loss_fcn(log_probs, batch.adv, v_pred, batch.v_targ)
        loss.backward()

        # Clip the gradients if desired
        policy_grad_norm.append(self.clip_grad(self.expl_strat.policy, self.max_grad_norm))

        # Call optimizer
        self.optim.step()

    # Update the learning rate if a scheduler has been specified
    if self._lr_scheduler is not None:
        self._lr_scheduler.step()

    if to.isnan(self.expl_strat.noise.std).any():
        raise RuntimeError(
            f'At least one exploration parameter became NaN! The exploration parameters are'
            f'\n{self.expl_strat.std.item()}'
        )

    # Logging
    with to.no_grad():
        # Compute value predictions and the GAE using the new (after the updates) value function approximator
        v_pred = self._critic.values(concat_ros).to(self.policy.device)
        adv = self._critic.gae(concat_ros)  # done with to.no_grad()

        # Compute the action probabilities using the new (after the updates) policy
        act_stats = compute_action_statistics(concat_ros, self._expl_strat)
        log_probs_new = act_stats.log_probs
        act_distr_new = act_stats.act_distr
        loss_after = self.loss_fcn(log_probs_new, adv, v_pred, v_targ)
        kl_avg = to.mean(kl_divergence(act_distr_old, act_distr_new))  # mean seeking a.k.a. inclusive KL
        explvar = explained_var(v_pred, v_targ)  # values close to 1 are desired
        self.logger.add_value('loss after', loss_after, 4)
        self.logger.add_value('KL(old_new)', kl_avg, 4)
        self.logger.add_value('explained var', explvar, 4)

    ent = self.expl_strat.noise.get_entropy()
    self.logger.add_value('avg expl strat std', to.mean(self.expl_strat.noise.std), 4)
    self.logger.add_value('expl strat entropy', to.mean(ent), 4)
    self.logger.add_value('avg grad norm policy', np.mean(policy_grad_norm), 4)
    if self._lr_scheduler is not None:
        self.logger.add_value('avg lr', np.mean(self._lr_scheduler.get_last_lr()), 6)
def update(self, rollouts: Sequence[StepSequence]):
    # Turn the batch of rollouts into a list of steps
    concat_ros = StepSequence.concat(rollouts)
    concat_ros.torch(data_type=to.get_default_dtype())

    # Update the advantage estimator's parameters and return advantage estimates
    adv = self._critic.update(rollouts, use_empirical_returns=False)

    with to.no_grad():
        # Compute the action probabilities using the old (before update) policy
        act_stats = compute_action_statistics(concat_ros, self._expl_strat)
        log_probs_old = act_stats.log_probs
        act_distr_old = act_stats.act_distr

    # Attach advantages and old log probs to rollout
    concat_ros.add_data('adv', adv)
    concat_ros.add_data('log_probs_old', log_probs_old)

    # For logging the gradient norms
    policy_grad_norm = []

    # Iterations over the whole data set
    for e in range(self.num_epoch):
        for batch in tqdm(
            concat_ros.split_shuffled_batches(self.batch_size, complete_rollouts=self._policy.is_recurrent),
            total=num_iter_from_rollouts(None, concat_ros, self.batch_size),
            desc=f'Epoch {e}',
            unit='batches',
            file=sys.stdout,
            leave=False,
        ):
            # Reset the gradients
            self.optim.zero_grad()

            # Compute log of the action probabilities for the mini-batch
            log_probs = compute_action_statistics(batch, self._expl_strat).log_probs

            # Compute policy loss and backpropagate
            loss = self.loss_fcn(log_probs, batch.log_probs_old, batch.adv)
            loss.backward()

            # Clip the gradients if desired
            policy_grad_norm.append(self.clip_grad(self._expl_strat.policy, self.max_grad_norm))

            # Call optimizer
            self.optim.step()

            if to.isnan(self._expl_strat.noise.std).any():
                raise RuntimeError(
                    f'At least one exploration parameter became NaN! The exploration parameters are'
                    f'\n{self._expl_strat.std.detach().numpy()}'
                )

        # Update the learning rate if a scheduler has been specified
        if self._lr_scheduler is not None:
            self._lr_scheduler.step()

    # Additional logging
    if self.log_loss:
        with to.no_grad():
            act_stats = compute_action_statistics(concat_ros, self._expl_strat)
            log_probs_new = act_stats.log_probs
            act_distr_new = act_stats.act_distr
            loss_after = self.loss_fcn(log_probs_new, log_probs_old, adv)
            kl_avg = to.mean(kl_divergence(act_distr_old, act_distr_new))  # mean seeking a.k.a. inclusive KL
            self.logger.add_value('loss after', loss_after.detach().numpy())
            self.logger.add_value('KL(old_new)', kl_avg.item())

    # Logging
    self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.noise.std.data).detach().numpy())
    self.logger.add_value('expl strat entropy', self._expl_strat.noise.get_entropy().item())
    self.logger.add_value('avg policy grad norm', np.mean(policy_grad_norm))
    if self._lr_scheduler is not None:
        self.logger.add_value('learning rate', self._lr_scheduler.get_lr())
def experiment_w_distruber(env_real: RealEnv, env_sim: SimEnv):
    # Wrap the environment in the same way as done during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt('Running the evaluation policy ...', 'c', bright=True)
    ro1 = rollout(env_real, policy, eval=True, max_steps=args.max_steps // 3,
                  render_mode=RenderMode(), no_reset=True, no_close=True)

    # Run disturber
    env_real = inner_env(env_real)  # since we are reusing it
    print_cbt('Running the 1st disturber ...', 'c', bright=True)
    rollout(env_real, disturber_pos, eval=True, max_steps=steps_disturb,
            render_mode=RenderMode(), no_reset=True, no_close=True)

    # Wrap the environment in the same way as done during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt('Running the evaluation policy ...', 'c', bright=True)
    ro2 = rollout(env_real, policy, eval=True, max_steps=args.max_steps // 3,
                  render_mode=RenderMode(), no_reset=True, no_close=True)

    # Run disturber
    env_real = inner_env(env_real)  # since we are reusing it
    print_cbt('Running the 2nd disturber ...', 'c', bright=True)
    rollout(env_real, disturber_neg, eval=True, max_steps=steps_disturb,
            render_mode=RenderMode(), no_reset=True, no_close=True)

    # Wrap the environment in the same way as done during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt('Running the evaluation policy ...', 'c', bright=True)
    ro3 = rollout(env_real, policy, eval=True, max_steps=args.max_steps // 3,
                  render_mode=RenderMode(), no_reset=True, no_close=True)

    return StepSequence.concat([ro1, ro2, ro3])
def step(self, snapshot_mode: str, meta_info: dict = None, parallel: bool = True):
    rand_trajs = []
    ref_trajs = []
    ros = []
    visited = []
    for i in range(self.svpg.num_particles):
        done = False
        svpg_env = self.svpg_wrapper
        state = svpg_env.reset()
        states = []
        actions = []
        rewards = []
        infos = []
        rand_trajs_now = []
        ref_trajs_now = []
        if parallel:
            with to.no_grad():
                for t in range(10):
                    action = self.svpg.expl_strats[i](
                        to.as_tensor(state, dtype=to.get_default_dtype())).detach().numpy()
                    state = svpg_env.lite_step(action)
                    states.append(state)
                    actions.append(action)
            visited.append(states)
            rewards, rand_trajs_now, ref_trajs_now = svpg_env.eval_states(states)
            rand_trajs += rand_trajs_now
            ref_trajs += ref_trajs_now
            ros.append(StepSequence(observations=states, actions=actions, rewards=rewards))
        else:
            with to.no_grad():
                while not done:
                    action = self.svpg.expl_strats[i](
                        to.as_tensor(state, dtype=to.get_default_dtype())).detach().numpy()
                    state, reward, done, info = svpg_env.step(action)
                    print(self.params.array_to_dict(state), ' => ', reward)
                    states.append(state)
                    rewards.append(reward)
                    actions.append(action)
                    infos.append(info)
                    rand_trajs += info['rand']
                    ref_trajs += info['ref']
            ros.append(StepSequence(observations=states, actions=actions, rewards=rewards))

        self.logger.add_value(f'SVPG_agent_{i}_mean_reward', np.mean(rewards))
        ros[i].torch(data_type=to.DoubleTensor)

        for rt in rand_trajs_now:
            rt.torch(data_type=to.double)
            rt.observations = rt.observations.double().detach()
            rt.actions = rt.actions.double().detach()
        self.subroutine.update(rand_trajs_now)

    # Logging
    rets = [ro.undiscounted_return() for ro in rand_trajs]
    ret_avg = np.mean(rets)
    ret_med = np.median(rets)
    ret_std = np.std(rets)
    self.logger.add_value('num rollouts', len(rand_trajs))
    self.logger.add_value('avg rollout len', np.mean([ro.length for ro in rand_trajs]))
    self.logger.add_value('avg return', ret_avg)
    self.logger.add_value('median return', ret_med)
    self.logger.add_value('std return', ret_std)

    # Flatten and combine all randomized and reference trajectories for discriminator
    flattened_randomized = StepSequence.concat(rand_trajs)
    flattened_randomized.torch(data_type=to.double)
    flattened_reference = StepSequence.concat(ref_trajs)
    flattened_reference.torch(data_type=to.double)
    self.reward_generator.train(flattened_reference, flattened_randomized, self.num_discriminator_epoch)

    if self.curr_time_step > self.warm_up_time:
        # Update the particles
        # List of lists to comply with interface
        self.svpg.update(list(map(lambda x: [x], ros)))

    flattened_randomized.torch(data_type=to.double)
    flattened_randomized.observations = flattened_randomized.observations.double().detach()
    flattened_randomized.actions = flattened_randomized.actions.double().detach()
    # np.save(f'{self.save_dir}actions{self.curr_iter}', flattened_randomized.actions)

    self.make_snapshot(snapshot_mode, float(ret_avg), meta_info)
    self.subroutine.make_snapshot(snapshot_mode='best', curr_avg_ret=float(ret_avg))
    self.curr_time_step += 1
        )
        segment_nom.append(sn)

        if args.use_rec:
            check_act_equal(segment_real, sn, check_applied=hasattr(sn, "actions_applied"))

        # Pad if necessary
        StepSequence.pad(sn, segment_real.length)

    # Append individual segments
    segments_nom.append(segment_nom)

assert len(segments_nom) == len(segments_ml_all)

# Get the states for computing the performance metrics
states_real = np.stack([ro.get_data_values(args.data_type, truncate_last=True) for ro in rollouts_real], axis=0)
states_nom = np.stack(
    [StepSequence.concat(segs_nom).get_data_values(args.data_type) for segs_nom in segments_nom], axis=0
)
states_ml = np.stack(  # index 0 is the most likely
    [
        StepSequence.concat([s[0] for s in [segs_ml for segs_ml in segments_ml]]).get_data_values(args.data_type)
        for segments_ml in segments_ml_all
    ],
    axis=0,
)
assert states_real.shape == states_nom.shape == states_ml.shape
assert states_real.shape[0] == num_rollouts_real

# Compute the DTW and RMSE distances and store them in a table
compute_traj_distance_metrics(states_real, states_ml, states_nom, num_rollouts_real)

# Optionally mask out some states/observations and actions for plotting