def _ps_run_one_reset_kwargs_segment(
    G,
    domain_param: dict,
    init_state: np.ndarray,
    len_segment: int,
    stop_on_done: bool,
    use_rec: bool,
    idx_r: int,
    cnt_step: int,
    eval: bool,
):
    """
    Sample one segment of a rollout with a given init state (which originates from a target domain setup) and domain
    parameters, passed as a tuple for simplicity at the other end.
    """
    if not isinstance(domain_param, dict):
        raise pyrado.TypeErr(given=domain_param, expected_type=dict)
    if not isinstance(init_state, np.ndarray):
        raise pyrado.TypeErr(given=init_state, expected_type=np.ndarray)
    if not isinstance(len_segment, int):
        raise pyrado.TypeErr(given=len_segment, expected_type=int)

    # Set the init space of the simulation environment such that we can later set it to arbitrary states that could
    # have occurred during the rollout. This is necessary since we are running the evaluation in segments.
    G.env.init_space = InfBoxSpace(shape=G.env.init_space.shape)

    if use_rec:
        # Disable the automatic policy reset of the PlaybackPolicy and do it here manually
        assert G.policy.no_reset
        G.policy.curr_rec = idx_r
        G.policy.curr_step = cnt_step

    ro = rollout(
        G.env,
        G.policy,
        eval=eval,
        reset_kwargs=dict(init_state=init_state, domain_param=domain_param),
        max_steps=len_segment,
        stop_on_done=stop_on_done,
    )

    # Pad to the full segment length if the rollout terminated early
    StepSequence.pad(ro, len_segment)

    return ro
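# Illustrative sketch (not part of the original module): segment-wise evaluation only works if every simulated
# segment has the same length as its real counterpart, even when the simulation terminates early. The snippet
# below mimics, in plain numpy, the padding contract that the call to StepSequence.pad above relies on for the
# state trajectory; the helper name `pad_segment_states` is hypothetical and only for illustration.
import numpy as np


def pad_segment_states(states: np.ndarray, len_segment: int, pad_value: float = 0.0) -> np.ndarray:
    """Pad a (num_steps + 1, dim_state) state array so it covers `len_segment` steps plus the final state."""
    num_missing = (len_segment + 1) - states.shape[0]
    if num_missing <= 0:
        return states
    padding = np.full((num_missing, states.shape[1]), pad_value)
    return np.concatenate([states, padding], axis=0)


# Example: a segment of 5 steps that terminated after 3 steps (4 recorded states including the final one)
_states = np.arange(8).reshape(4, 2).astype(float)
assert pad_segment_states(_states, len_segment=5).shape == (6, 2)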
def test_stepsequence_padding(mock_data, data_format: str, pad_value: Union[int, float], pad_len: int):
    # Create a rollout that is shorter than the desired length
    rewards, states, observations, actions, hidden, policy_infos = mock_data
    ro = StepSequence(
        rewards=rewards,
        observations=observations,
        states=states,
        actions=actions,
        hidden=hidden,
        policy_infos=policy_infos,
    )
    len_orig = ro.length

    if data_format == "torch":
        ro.torch()

    # Pad it
    StepSequence.pad(ro, len_to_pad_to=len(ro) + pad_len, pad_value=pad_value)

    # Check
    ro.numpy()  # for simplified checking
    assert np.allclose(ro.states[len_orig + 1 :], pad_value * np.ones_like(ro.states[len_orig + 1 :]))
    assert np.allclose(ro.observations[len_orig + 1 :], pad_value * np.ones_like(ro.observations[len_orig + 1 :]))
    assert np.allclose(ro.actions[len_orig:], pad_value * np.ones_like(ro.actions[len_orig:]))
    assert np.allclose(ro.rewards[len_orig:], pad_value * np.ones_like(ro.rewards[len_orig:]))
    for k, v in ro.policy_infos.items():
        assert np.allclose(v[len_orig:], pad_value * np.ones_like(v[len_orig:]))

    assert ro.length == len_orig + pad_len
    assert all(ro.rollout_bounds == np.array([0, len_orig + pad_len]))
    assert len(ro.states) == len_orig + pad_len + 1  # check for the final step
    assert len(ro.observations) == len_orig + pad_len + 1  # check for the final step
    assert len(ro.actions) == len_orig + pad_len
    assert len(ro.rewards) == len_orig + pad_len
    for h in ro.hidden:
        assert len(h) == len_orig + pad_len
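# Illustrative sketch (assumption, not part of the original test suite): the assertions above encode the length
# bookkeeping of StepSequence.pad. The hypothetical helper below restates that contract for a rollout of length
# `len_orig` padded by `pad_len` steps: states and observations carry one extra entry for the final step.
def _expected_lengths_after_padding(len_orig: int, pad_len: int) -> dict:
    return {
        "length": len_orig + pad_len,
        "states": len_orig + pad_len + 1,  # states and observations include the final step
        "observations": len_orig + pad_len + 1,
        "actions": len_orig + pad_len,
        "rewards": len_orig + pad_len,
    }


assert _expected_lengths_after_padding(len_orig=5, pad_len=7)["states"] == 13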
def __call__(self, dp_values: to.Tensor = None) -> Tuple[to.Tensor, StepSequence]:
    """
    Run one rollout in the target domain, and compute the features of the data used for sbi.

    :param dp_values: ignored, just here for interface compatibility
    :return: features computed from the time series data, and the complete rollout
    """
    ro_real = None
    run_interactive_loop = True
    while run_interactive_loop:
        # Don't set the domain params here since they are set by the DomainRandWrapperBuffer to mimic the randomness
        ro_real = rollout(self._env, self._policy, eval=True, stop_on_done=self.stop_on_done)
        if not isinstance(self._env, RealEnv):
            run_interactive_loop = False
        else:
            # Ask if the current rollout should be discarded and redone
            run_interactive_loop = input("Continue with the next rollout y / n? ").lower() == "n"

    # Pad to the maximum number of steps if the rollout terminated early
    StepSequence.pad(ro_real, self._env.max_steps)

    # Pre-processing
    ro_real.torch()
    self._set_action_field([ro_real])

    # Assemble the data
    data_real = to.cat([ro_real.states[:-1, :], ro_real.get_data_values(self._action_field)], dim=1)
    if self._embedding.requires_target_domain_data:
        data_real = to.cat([data_real, data_real], dim=1)

    # Compute the features
    data_real = data_real.unsqueeze(0)  # only one target domain rollout
    data_real = self._embedding(Embedding.pack(data_real))  # shape [1, dim_feat]

    # Check the shape (here there is no batching and always exactly one rollout)
    if data_real.shape[0] != 1 or data_real.ndim != 2:
        raise pyrado.ShapeErr(given=data_real, expected_match=(1, -1))

    return data_real, ro_real
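# Illustrative sketch (assumption, not part of the original module): the concatenation above pairs every state
# with the action taken in it, dropping the final state so both tensors have one row per step. Minimal torch
# example with made-up dimensions:
import torch as to

_states = to.zeros(6, 3)   # 5 steps -> 6 states including the final one
_actions = to.zeros(5, 2)  # one action per step
_data = to.cat([_states[:-1, :], _actions], dim=1)
assert _data.shape == (5, 5)  # [num_steps, dim_state + dim_act]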
sdp = rollout(
    env_sim,
    policy,
    eval=True,
    reset_kwargs=dict(init_state=segment_real.states[0, :], domain_param=domain_param),
    max_steps=segment_real.length,
    stop_on_done=algo.stop_on_done,
)
segments_dp.append(sdp)
assert np.allclose(sdp.states[0, :], segment_real.states[0, :])
if args.use_rec:
    check_act_equal(segment_real, sdp, check_applied=hasattr(sdp, "actions_applied"))

# Pad if necessary
StepSequence.pad(sdp, segment_real.length)

# Increase the step counter for the next segment, and append all domain parameter segments
cnt_step += segment_real.length
segments_ml.append(segments_dp)

# Append all segments for the current target domain rollout
segments_ml_all.append(segments_ml)

assert len(segments_ml_all) == len(segments_real_all)

# Sample rollouts using the nominal domain parameters
if args.use_rec:
    policy.reset_curr_rec()
env_sim.domain_param = env_sim.get_nominal_domain_param()
segments_nom = []
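# Illustrative sketch (assumption, not part of the original script): segment-wise evaluation resets the
# simulation to the first recorded state of every real segment, which is what the allclose assertion above
# verifies. Plain numpy version of that bookkeeping with a hypothetical helper:
import numpy as np


def _segment_init_states(states_real: np.ndarray, len_segment: int) -> list:
    """Return the first state of every consecutive segment of `len_segment` steps."""
    return [states_real[i] for i in range(0, states_real.shape[0] - 1, len_segment)]


_traj = np.arange(20).reshape(10, 2).astype(float)  # 9 steps -> 10 states
assert len(_segment_init_states(_traj, len_segment=3)) == 3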
def __call__(self, dp_values: to.Tensor) -> to.Tensor:
    """
    Run one rollout for every domain parameter set. The rollouts are done in segments, and after every segment the
    simulation state is reset to the corresponding state of the target domain rollout.

    :param dp_values: tensor containing the domain parameter sets along the 1st dimension
    :return: features computed from the time series data
    """
    dp_values = to.atleast_2d(dp_values).numpy()

    if self.rollouts_real is not None:
        if self.use_rec_act:
            # Create a policy that simply replays the recorded actions
            self._set_action_field(self.rollouts_real)
            policy = PlaybackPolicy(
                self._env.spec,
                [ro.get_data_values(self._action_field) for ro in self.rollouts_real],
                no_reset=True,
            )
        else:
            # Use the current policy to generate the actions
            policy = self._policy

        # The initial states will be set to states which are most likely not in the initial state space of the
        # environment, thus we set the initial state space to an infinite space
        self._env.init_space = BoxSpace(
            -pyrado.inf, pyrado.inf, self._env.state_space.shape, labels=self._env.state_space.labels
        )

        data_sim_all = []  # for all domain parameter sets

        # Iterate over domain parameter sets
        for dp_value in dp_values:
            data_sim_one_dp = []  # for all target domain rollouts of one domain parameter set

            # Iterate over target domain rollouts
            for idx_r, ro_real in enumerate(self.rollouts_real):
                data_one_ro = []
                ro_real.numpy()

                # Split the target domain rollout if desired
                if self.num_segments is not None:
                    segs_real = list(ro_real.split_ordered_batches(num_batches=self.num_segments))
                else:
                    segs_real = list(ro_real.split_ordered_batches(batch_size=self.len_segments))

                # Iterate over segments of one target domain rollout
                cnt_step = 0
                for seg_real in segs_real:
                    if self.use_rec_act:
                        # Disable the automatic policy reset of the PlaybackPolicy and do it here manually
                        assert policy.no_reset
                        policy.curr_rec = idx_r
                        policy.curr_step = cnt_step

                    # Do the rollout for one segment
                    seg_sim = rollout(
                        self._env,
                        policy,
                        eval=True,
                        reset_kwargs=dict(
                            init_state=seg_real.states[0, :], domain_param=dict(zip(self.dp_names, dp_value))
                        ),
                        stop_on_done=self.stop_on_done,
                        max_steps=seg_real.length,
                    )
                    check_domain_params(seg_sim, dp_value, self.dp_names)
                    if self.use_rec_act:
                        check_act_equal(seg_real, seg_sim, check_applied=self._action_field == "actions_applied")

                    # Pad to the segment length if the rollout terminated early
                    StepSequence.pad(seg_sim, seg_real.length)

                    # Increase the step counter for the next segment
                    cnt_step += seg_real.length

                    # Concatenate states and actions of the simulated and real segments
                    data_one_seg = np.concatenate(
                        [
                            seg_sim.states[: len(seg_real), :],
                            seg_sim.get_data_values(self._action_field)[: len(seg_real), :],
                        ],
                        axis=1,
                    )
                    if self._embedding.requires_target_domain_data:
                        # The embedding also uses target domain data (e.g. the case for the DTW distance)
                        data_one_seg_real = np.concatenate(
                            [seg_real.states[: len(seg_real), :], seg_real.get_data_values(self._action_field)],
                            axis=1,
                        )
                        data_one_seg = np.concatenate([data_one_seg, data_one_seg_real], axis=1)
                    data_one_seg = to.from_numpy(data_one_seg).to(dtype=to.get_default_dtype())
                    data_one_ro.append(data_one_seg)

                # Append one simulated rollout
                data_sim_one_dp.append(to.cat(data_one_ro, dim=0))

            # Append the segments of all target domain rollouts for the current domain parameter set
            data_sim_all.append(to.stack(data_sim_one_dp, dim=0))

        # Compute the features from all time series
        data_sim_all = to.stack(data_sim_all, dim=0)  # shape [batch_size, num_rollouts, len_time_series, dim_data]
        data_sim_all = self._embedding(Embedding.pack(data_sim_all))  # shape [batch_size, num_rollouts * dim_feat]

        # Check the shape
        if data_sim_all.shape != (dp_values.shape[0], len(self.rollouts_real) * self._embedding.dim_output):
            raise pyrado.ShapeErr(
                given=data_sim_all,
                expected_match=(dp_values.shape[0], len(self.rollouts_real) * self._embedding.dim_output),
            )

    else:
        # There are no pre-recorded rollouts, e.g. during _setup_sbi().
        # Use the current policy to generate the actions.
        policy = self._policy

        # Do the rollouts
        data_sim_all = []
        for dp_value in dp_values:
            ro_sim = rollout(
                self._env,
                policy,
                eval=True,
                reset_kwargs=dict(domain_param=dict(zip(self.dp_names, dp_value))),
                stop_on_done=self.stop_on_done,
            )
            check_domain_params(ro_sim, dp_value, self.dp_names)

            # Pad to the maximum number of steps if the rollout terminated early
            StepSequence.pad(ro_sim, self._env.max_steps)

            # Concatenate states and actions of the simulated segments
            data_one_seg = np.concatenate(
                [ro_sim.states[:-1, :], ro_sim.get_data_values(self._action_field)], axis=1
            )
            if self._embedding.requires_target_domain_data:
                data_one_seg = np.concatenate([data_one_seg, data_one_seg], axis=1)
            data_one_seg = to.from_numpy(data_one_seg).to(dtype=to.get_default_dtype())
            data_sim_all.append(data_one_seg)

        # Compute the features from all time series
        data_sim_all = to.stack(data_sim_all, dim=0)
        data_sim_all = data_sim_all.unsqueeze(1)  # equivalent to having only one target domain rollout
        data_sim_all = self._embedding(Embedding.pack(data_sim_all))  # shape [batch_size, dim_feat]

        # Check the shape
        if data_sim_all.shape != (dp_values.shape[0], self._embedding.dim_output):
            raise pyrado.ShapeErr(
                given=data_sim_all, expected_match=(dp_values.shape[0], self._embedding.dim_output)
            )

    return data_sim_all  # shape [batch_size, num_rollouts * dim_feat]
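# Illustrative sketch (assumption, not part of the original module): each row of `dp_values` is turned into a
# domain parameter dict via dict(zip(self.dp_names, dp_value)), as done in both branches above. Minimal torch
# example with made-up parameter names:
import torch as to

_dp_names = ["mass", "length"]
_dp_values = to.atleast_2d(to.tensor([[1.0, 0.5], [1.2, 0.45]])).numpy()
_domain_params = [dict(zip(_dp_names, dp_value)) for dp_value in _dp_values]
assert len(_domain_params) == 2 and set(_domain_params[0]) == {"mass", "length"}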