def _ps_run_one_reset_kwargs_segment(
    G,
    domain_param: dict,
    init_state: np.ndarray,
    len_segment: int,
    stop_on_done: bool,
    use_rec: bool,
    idx_r: int,
    cnt_step: int,
    eval: bool,
):
    """
    Sample one segment of a rollout with a given initial state (which originates from a target domain setup) and
    domain parameters, passed as a tuple for simplicity at the other end.
    """
    if not isinstance(domain_param, dict):
        raise pyrado.TypeErr(given=domain_param, expected_type=dict)
    if not isinstance(init_state, np.ndarray):
        raise pyrado.TypeErr(given=init_state, expected_type=np.ndarray)
    if not isinstance(len_segment, int):
        raise pyrado.TypeErr(given=len_segment, expected_type=int)

    # Set the init space of the simulation environment such that we can later set it to arbitrary states that could
    # have occurred during the rollout. This is necessary since we are running the evaluation in segments.
    G.env.init_space = InfBoxSpace(shape=G.env.init_space.shape)

    if use_rec:
        # Disable the policy reset of the PlaybackPolicy in order to do it here manually
        assert G.policy.no_reset
        G.policy.curr_rec = idx_r
        G.policy.curr_step = cnt_step

    ro = rollout(
        G.env,
        G.policy,
        eval=eval,
        reset_kwargs=dict(init_state=init_state, domain_param=domain_param),
        max_steps=len_segment,
        stop_on_done=stop_on_done,
    )

    # Pad if necessary
    StepSequence.pad(ro, len_segment)

    return ro
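
# Hedged usage sketch (not part of the original source): this worker expects a
# namespace `G` carrying the per-worker environment and policy, as typically set
# up by a sampler pool. `make_env` and `make_policy` are hypothetical factories.
#
# from types import SimpleNamespace
# G = SimpleNamespace(env=make_env(), policy=make_policy())
# seg = _ps_run_one_reset_kwargs_segment(
#     G,
#     domain_param={"mass": 1.0},  # hypothetical domain parameter
#     init_state=np.zeros(G.env.init_space.shape),
#     len_segment=50,
#     stop_on_done=False,
#     use_rec=False,
#     idx_r=0,
#     cnt_step=0,
#     eval=True,
# )
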
def test_stepsequence_padding(mock_data, data_format: str,
                              pad_value: Union[int, float], pad_len: int):
    # Create too short rollout
    rewards, states, observations, actions, hidden, policy_infos = mock_data
    ro = StepSequence(
        rewards=rewards,
        observations=observations,
        states=states,
        actions=actions,
        hidden=hidden,
        policy_infos=policy_infos,
    )
    len_orig = ro.length

    if data_format == "torch":
        ro.torch()

    # Pad it
    StepSequence.pad(ro, len_to_pad_to=len(ro) + pad_len, pad_value=pad_value)

    # Check
    ro.numpy()  # for simplified checking
    assert np.allclose(ro.states[len_orig + 1:],
                       pad_value * np.ones_like(ro.states[len_orig + 1:]))
    assert np.allclose(
        ro.observations[len_orig + 1:],
        pad_value * np.ones_like(ro.observations[len_orig + 1:]))
    assert np.allclose(ro.actions[len_orig:],
                       pad_value * np.ones_like(ro.actions[len_orig:]))
    assert np.allclose(ro.rewards[len_orig:],
                       pad_value * np.ones_like(ro.rewards[len_orig:]))
    for k, v in ro.policy_infos.items():
        assert np.allclose(v[len_orig:],
                           pad_value * np.ones_like(v[len_orig:]))

    assert ro.length == len_orig + pad_len
    assert all(ro.rollout_bounds == np.array([0, len_orig + pad_len]))

    assert len(ro.states) == len_orig + pad_len + 1  # check for the final step
    assert len(ro.observations) == len_orig + pad_len + 1  # check for the final step
    assert len(ro.actions) == len_orig + pad_len
    assert len(ro.rewards) == len_orig + pad_len
    for h in ro.hidden:
        assert len(h) == len_orig + pad_len
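
# Hedged, minimal sketch (an assumption based on how the test above constructs
# its rollout, not part of the original source) illustrating the padding
# semantics being tested: actions and rewards grow to `len_to_pad_to`, while
# states and observations keep their one extra entry for the final step.
def _demo_stepsequence_pad():
    ro = StepSequence(
        rewards=np.zeros(5),
        observations=np.zeros((6, 2)),  # one more entry than the number of steps
        states=np.zeros((6, 2)),
        actions=np.zeros((5, 1)),
    )
    StepSequence.pad(ro, len_to_pad_to=8, pad_value=0)
    assert len(ro.actions) == len(ro.rewards) == 8
    assert len(ro.states) == len(ro.observations) == 9  # final step included
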
    def __call__(self, dp_values: to.Tensor = None) -> Tuple[to.Tensor, StepSequence]:
        """
        Run one rollout in the target domain, and compute the features of the data used for sbi.

        :param dp_values: ignored, just here for the interface compatibility
        :return: features computed from the time series data, and the complete rollout
        """
        ro_real = None
        run_interactive_loop = True
        while run_interactive_loop:
            # Don't set the domain params here since they are set by the DomainRandWrapperBuffer to mimic the randomness
            ro_real = rollout(self._env, self._policy, eval=True, stop_on_done=self.stop_on_done)
            if not isinstance(self._env, RealEnv):
                run_interactive_loop = False
            else:
                # Ask if the current rollout should be discarded and redone
                run_interactive_loop = input("Continue with the next rollout y / n? ").lower() == "n"

        # Pad if necessary
        StepSequence.pad(ro_real, self._env.max_steps)

        # Pre-processing
        ro_real.torch()
        self._set_action_field([ro_real])

        # Assemble the data
        data_real = to.cat([ro_real.states[:-1, :], ro_real.get_data_values(self._action_field)], dim=1)
        if self._embedding.requires_target_domain_data:
            data_real = to.cat([data_real, data_real], dim=1)

        # Compute the features
        data_real = data_real.unsqueeze(0)  # only one target domain rollout
        data_real = self._embedding(Embedding.pack(data_real))  # shape [1, dim_feat]

        # Check shape (here no batching and always one rollout)
        if data_real.shape[0] != 1 or data_real.ndim != 2:
            raise pyrado.ShapeErr(given=data_real, expected_match=(1, -1))

        return data_real, ro_real
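
# Hedged usage sketch (not from the original source): `rollout_worker` stands in
# for an instance of the surrounding class. Since `dp_values` is ignored, the
# call takes no argument and yields the features plus the raw rollout.
#
# data_real, ro_real = rollout_worker()
# assert data_real.shape[0] == 1 and data_real.ndim == 2  # one target domain rollout
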
                    sdp = rollout(
                        env_sim,
                        policy,
                        eval=True,
                        reset_kwargs=dict(init_state=segment_real.states[0, :], domain_param=domain_param),
                        max_steps=segment_real.length,
                        stop_on_done=algo.stop_on_done,
                    )
                    segments_dp.append(sdp)

                    assert np.allclose(sdp.states[0, :], segment_real.states[0, :])
                    if args.use_rec:
                        check_act_equal(segment_real, sdp, check_applied=hasattr(sdp, "actions_applied"))

                    # Pad if necessary
                    StepSequence.pad(sdp, segment_real.length)

                # Increase step counter for next segment, and append all domain parameter segments
                cnt_step += segment_real.length
                segments_ml.append(segments_dp)

            # Append all segments for the current target domain rollout
            segments_ml_all.append(segments_ml)

    assert len(segments_ml_all) == len(segments_real_all)

    # Sample rollouts using the nominal domain parameters
    if args.use_rec:
        policy.reset_curr_rec()
    env_sim.domain_param = env_sim.get_nominal_domain_param()
    segments_nom = []
    def __call__(self, dp_values: to.Tensor) -> to.Tensor:
        """
        Run one rollout for every domain parameter set. The rollouts are done in segments, and after every segment the
        simulation state is set to the current state in the target domain rollout.

        :param dp_values: tensor containing domain parameters along the 1st dimension
        :return: features computed from the time series data
        """
        dp_values = to.atleast_2d(dp_values).numpy()

        if self.rollouts_real is not None:
            if self.use_rec_act:
                # Create a policy that simply replays the recorded actions
                self._set_action_field(self.rollouts_real)
                policy = PlaybackPolicy(
                    self._env.spec,
                    [ro.get_data_values(self._action_field) for ro in self.rollouts_real],
                    no_reset=True,
                )
            else:
                # Use the current policy to generate the actions
                policy = self._policy

            # The initial states will be set to states which will most likely not be in the initial state space of
            # the environment, thus we set the initial state space to an infinite space
            self._env.init_space = BoxSpace(
                -pyrado.inf, pyrado.inf, self._env.state_space.shape, labels=self._env.state_space.labels
            )

            data_sim_all = []  # for all target domain rollouts

            # Iterate over domain parameter sets
            for dp_value in dp_values:
                data_sim_one_dp = []  # for all target domain rollouts of one domain parameter set

                # Iterate over target domain rollouts
                for idx_r, ro_real in enumerate(self.rollouts_real):
                    data_one_ro = []
                    ro_real.numpy()

                    # Split the target domain rollout if desired
                    if self.num_segments is not None:
                        segs_real = list(ro_real.split_ordered_batches(num_batches=self.num_segments))
                    else:
                        segs_real = list(ro_real.split_ordered_batches(batch_size=self.len_segments))

                    # Iterate over segments of one target domain rollout
                    cnt_step = 0
                    for seg_real in segs_real:
                        if self.use_rec_act:
                            # Disable the policy reset of the PlaybackPolicy in order to do it here manually
                            assert policy.no_reset
                            policy.curr_rec = idx_r
                            policy.curr_step = cnt_step

                        # Do the rollout for a segment
                        seg_sim = rollout(
                            self._env,
                            policy,
                            eval=True,
                            reset_kwargs=dict(
                                init_state=seg_real.states[0, :], domain_param=dict(zip(self.dp_names, dp_value))
                            ),
                            stop_on_done=self.stop_on_done,
                            max_steps=seg_real.length,
                        )
                        check_domain_params(seg_sim, dp_value, self.dp_names)
                        if self.use_rec_act:
                            check_act_equal(seg_real, seg_sim, check_applied=self._action_field == "actions_applied")

                        # Pad if necessary
                        StepSequence.pad(seg_sim, seg_real.length)

                        # Increase step counter for next segment
                        cnt_step += seg_real.length

                        # Concatenate states and actions of the simulated and real segments
                        data_one_seg = np.concatenate(
                            [
                                seg_sim.states[: len(seg_real), :],
                                seg_sim.get_data_values(self._action_field)[: len(seg_real), :],
                            ],
                            axis=1,
                        )
                        if self._embedding.requires_target_domain_data:
                            # The embedding is also using target domain data (the case for DTW distance)
                            data_one_seg_real = np.concatenate(
                                [seg_real.states[: len(seg_real), :], seg_real.get_data_values(self._action_field)],
                                axis=1,
                            )
                            data_one_seg = np.concatenate([data_one_seg, data_one_seg_real], axis=1)
                        data_one_seg = to.from_numpy(data_one_seg).to(dtype=to.get_default_dtype())
                        data_one_ro.append(data_one_seg)

                    # Append one simulated rollout
                    data_sim_one_dp.append(to.cat(data_one_ro, dim=0))

                # Append the segments of all target domain rollouts for the current domain parameter
                data_sim_all.append(to.stack(data_sim_one_dp, dim=0))

            # Compute the features from all time series
            data_sim_all = to.stack(data_sim_all, dim=0)  # shape [batch_size, num_rollouts, len_time_series, dim_data]
            data_sim_all = self._embedding(Embedding.pack(data_sim_all))  # shape [batch_size, num_rollouts * dim_data]

            # Check shape
            if data_sim_all.shape != (dp_values.shape[0], len(self.rollouts_real) * self._embedding.dim_output):
                raise pyrado.ShapeErr(
                    given=data_sim_all,
                    expected_match=(dp_values.shape[0], len(self.rollouts_real) * self._embedding.dim_output),
                )

        else:
            # There are no pre-recorded rollouts, e.g. during _setup_sbi().
            # Use the current policy to generate the actions.
            policy = self._policy

            # Do the rollouts
            data_sim_all = []
            for dp_value in dp_values:
                ro_sim = rollout(
                    self._env,
                    policy,
                    eval=True,
                    reset_kwargs=dict(domain_param=dict(zip(self.dp_names, dp_value))),
                    stop_on_done=self.stop_on_done,
                )
                check_domain_params(ro_sim, dp_value, self.dp_names)

                # Pad if necessary
                StepSequence.pad(ro_sim, self._env.max_steps)

                # Concatenate states and actions of the simulated segments
                data_one_seg = np.concatenate(
                    [ro_sim.states[:-1, :], ro_sim.get_data_values(self._action_field)], axis=1
                )
                if self._embedding.requires_target_domain_data:
                    data_one_seg = np.concatenate([data_one_seg, data_one_seg], axis=1)
                data_one_seg = to.from_numpy(data_one_seg).to(dtype=to.get_default_dtype())
                data_sim_all.append(data_one_seg)

            # Compute the features from all time series
            data_sim_all = to.stack(data_sim_all, dim=0)
            data_sim_all = data_sim_all.unsqueeze(1)  # equivalent to only one target domain rollout
            data_sim_all = self._embedding(Embedding.pack(data_sim_all))  # shape [batch_size, dim_feat]

            # Check shape
            if data_sim_all.shape != (dp_values.shape[0], self._embedding.dim_output):
                raise pyrado.ShapeErr(
                    given=data_sim_all, expected_match=(dp_values.shape[0], self._embedding.dim_output)
                )

        return data_sim_all  # shape [batch_size, num_rollouts * dim_feat]
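
# Hedged usage sketch (not from the original source): `sim_worker` stands in for
# an instance of the surrounding class. Each row of `dp_values` is one domain
# parameter set, so the feature batch dimension matches the number of rows.
#
# dp_values = to.tensor([[0.8], [1.0], [1.2]])  # hypothetical values of one domain parameter
# features = sim_worker(dp_values)
# assert features.shape[0] == dp_values.shape[0]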