Example #1
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=256,
        reward_multiplier=1,
        lr=5e-3,
    )
    policy = FNNPolicy(reference_env.spec,
                       hidden_sizes=[16, 16],
                       hidden_nonlin=to.tanh)
    dr = create_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(fmt="dict", dtype="numpy")
    reference_sampler = ParallelRolloutSampler(reference_env,
                                               policy,
                                               num_workers=1,
                                               min_steps=1000)
    random_sampler = ParallelRolloutSampler(random_env,
                                            policy,
                                            num_workers=1,
                                            min_steps=1000)

    losses = []
    for i in range(200):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
Example #2
    def push(self, ros: Union[list, StepSequence], truncate_last: bool = True):
        """
        Save a sequence of steps and drop old steps if the capacity is exceeded.

        :param ros: list of rollouts or one concatenated rollout
        :param truncate_last: remove the last step from each rollout, forwarded to `StepSequence.concat`
        """
        if isinstance(ros, list):
            # Concatenate given rollouts if necessary
            ros = StepSequence.concat(ros)
        elif isinstance(ros, StepSequence):
            pass
        else:
            raise pyrado.TypeErr(given=ros, expected_type=[list, StepSequence])

        # Add new steps
        if self.isempty:
            self._memory = deepcopy(ros)  # on the very first call
        else:
            self._memory = StepSequence.concat([self._memory, ros], truncate_last=truncate_last)

        num_surplus = self._memory.length - self.capacity
        if num_surplus > 0:
            # Drop surplus of old steps
            self._memory = self._memory[num_surplus:]
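
A minimal standalone sketch of the capacity logic above, using only calls that appear in these examples (`StepSequence.concat`, `.length`, slicing); the import path, the placeholder rollouts, and the capacity value are assumptions:

from copy import deepcopy

import numpy as np
from pyrado.sampling.step_sequence import StepSequence  # import path assumed

capacity = 8  # hypothetical capacity
ro1 = StepSequence(rewards=np.random.randn(5), observations=np.random.randn(6), actions=np.random.randn(5))
ro2 = StepSequence(rewards=np.random.randn(5), observations=np.random.randn(6), actions=np.random.randn(5))

memory = deepcopy(ro1)  # first push: simply copy the rollout
memory = StepSequence.concat([memory, ro2], truncate_last=True)  # second push: concatenate
num_surplus = memory.length - capacity
if num_surplus > 0:
    memory = memory[num_surplus:]  # drop the oldest surplus steps
assert memory.length <= capacity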
Example #3
def test_concat(data_format):
    # Create some rollouts with random rewards
    ros = [
        StepSequence(rewards=np.random.randn(5),
                     observations=np.random.randn(6),
                     actions=np.random.randn(5),
                     policy_infos={'mean': np.random.randn(5)},
                     hidden=(np.random.randn(5), np.random.randn(5)),
                     data_format=data_format),
        StepSequence(rewards=np.random.randn(5),
                     observations=np.random.randn(6),
                     actions=np.random.randn(5),
                     policy_infos={'mean': np.random.randn(5)},
                     hidden=(np.random.randn(5), np.random.randn(5)),
                     data_format=data_format)
    ]

    # Perform concatenation
    cat = StepSequence.concat(ros)

    assert cat.continuous
    assert cat.rollout_count == 2

    # Check steps
    for step_ro, step_cat in zip(itertools.chain.from_iterable(ros), cat):
        assert step_ro.reward == step_cat.reward
        assert step_ro.observation == step_cat.observation
        assert step_ro.done == step_cat.done
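
The `data_format` argument is provided by pytest in the original test module and is not defined in the snippet itself. A hedged reconstruction of such a fixture (the two format strings are an assumption based on `StepSequence`'s numpy/torch storage formats):

import pytest

@pytest.fixture(params=["numpy", "torch"])
def data_format(request):
    # Each test that uses this fixture runs once per storage format
    return request.param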
Example #4
def test_replay_memory(capacity):
    rm = ReplayMemory(capacity)

    # Create fake rollouts (of length 5)
    ro1 = StepSequence(rewards=rewards,
                       observations=observations,
                       actions=actions,
                       hidden=hidden)
    ro2 = StepSequence(rewards=rewards,
                       observations=observations,
                       actions=actions,
                       hidden=hidden)
    # Concatenate them for testing only
    ros = StepSequence.concat(
        [ro1, ro2],
        truncate_last=True)  # same truncate_last behavior as push function

    # Check the lengths
    rm.push(ro1)
    assert len(rm) == len(ro1) or len(rm) == capacity
    rm.push(ro2)
    assert len(rm) == len(ro1) + len(ro2) or len(rm) == capacity

    # Check the elements
    shift = len(ros) - capacity
    if shift < len(ro1):
        assert all(rm.memory.observations[0] == ros.observations[shift])
    assert all(rm.memory.observations[-1] ==
               ro2.observations[-2])  # -2 since one was truncated
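
Similarly, `capacity` and the arrays `rewards`, `observations`, `actions`, and `hidden` come from the original test module. Hypothetical stand-ins that match the "length 5" comment above and the shapes used in Example #3:

import numpy as np

capacity = 8  # pytest parameter in the original test; the value here is arbitrary
rewards = np.random.randn(5)
observations = np.random.randn(6)  # one more entry than steps (includes the final observation)
actions = np.random.randn(5)
hidden = (np.random.randn(5), np.random.randn(5))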
Example #5
    def update(self, *args: Any, **kwargs: Any):
        """Update the policy's (and value functions') parameters based on the collected rollout data."""
        obss = []
        losses = []
        for t in range(self.num_teachers):
            concat_ros = StepSequence.concat(kwargs["rollouts"][t])
            concat_ros.torch(data_type=to.get_default_dtype())
            obss.append(concat_ros.get_data_values("observations")[: self.min_steps])

        # Train student
        for epoch in range(self.num_epochs):
            self.optimizer.zero_grad()

            loss = 0
            for t_idx, teacher in enumerate(self.teacher_policies):
                s_dist = self.expl_strat.action_dist_at(self.policy(obss[t_idx]))
                s_act = s_dist.sample()
                t_dist = self.teacher_expl_strats[t_idx].action_dist_at(teacher(obss[t_idx]))

                l = self.teacher_weights[t_idx] * self.criterion(t_dist.log_prob(s_act), s_dist.log_prob(s_act))
                loss += l
                losses.append([t_idx, l.item()])
            print(f"Epoch {epoch} Loss: {loss.item()}")
            loss.backward()
            self.optimizer.step()
Example #6
def test_basic_policy_evaluate_packed_padded_sequences(
        env: Env, policy: RecurrentPolicy):
    # Test packed padded sequence implementation against old implementation
    def old_evaluate(rollout: StepSequence,
                     hidden_states_name: str = "hidden_states") -> to.Tensor:
        # Set policy, i.e. PyTorch nn.Module, to evaluation mode
        policy.eval()

        # The passed sample collection might contain multiple rollouts.
        act_list = []
        for ro in rollout.iterate_rollouts():
            if hidden_states_name in rollout.data_names:
                # Get initial hidden state from first step
                hidden = policy._unpack_hidden(ro[0][hidden_states_name])
            else:
                # Let the network pick the default hidden state
                hidden = None

            # Reshape observations to match PyTorch's RNN sequence protocol
            obs = ro.get_data_values("observations", True).unsqueeze(1)
            obs = obs.to(device=policy.device, dtype=to.get_default_dtype())

            # Pass the input through hidden RNN layers
            out, _ = policy.rnn_layers(obs, hidden)

            # And through the output layer
            act = policy.output_layer(out.squeeze(1))
            if policy.output_nonlin is not None:
                act = policy.output_nonlin(act)

            # Collect the actions
            act_list.append(act)

        # Set policy, i.e. PyTorch nn.Module, back to training mode
        policy.train()

        return to.cat(act_list)

    # Get some rollouts
    ros = []
    for i in range(5):
        ro = rollout(env, policy, eval=True, render_mode=RenderMode())
        ro.torch(to.get_default_dtype())
        ros.append(ro)

    # Perform concatenation
    cat = StepSequence.concat(ros)

    # Evaluate old and new approaches
    act_old = old_evaluate(cat)
    act_new = policy.evaluate(cat)

    to.testing.assert_allclose(act_old, act_new)
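
The test above compares pyrado's packed padded sequence evaluation against a step-by-step reference. As background, a generic PyTorch sketch of the packing idea (not pyrado's implementation; all names are local to this sketch): several variable-length rollouts are padded, packed, and pushed through an RNN in one batched call.

import torch as to
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

rnn = to.nn.GRU(input_size=3, hidden_size=8, batch_first=True)
lengths = [5, 3, 4]  # lengths of three hypothetical rollouts
obs = [to.randn(l, 3) for l in lengths]  # per-rollout observation sequences

padded = pad_sequence(obs, batch_first=True)  # shape (3, 5, 3), zero-padded
packed = pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False)
out_packed, _ = rnn(packed)
out, out_lengths = pad_packed_sequence(out_packed, batch_first=True)
# out[i, :lengths[i]] matches feeding obs[i] through the RNN on its own, step by step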
Example #7
def test_adr_reward_generator(env):
    reference_env = env
    random_env = deepcopy(env)
    reward_generator = RewardGenerator(
        env_spec=random_env.spec,
        batch_size=100,
        reward_multiplier=1,
        logger=None
    )
    policy = FNNPolicy(reference_env.spec, hidden_sizes=[32], hidden_nonlin=to.tanh)
    dr = get_default_randomizer_omo()
    dr.randomize(num_samples=1)
    random_env.domain_param = dr.get_params(format='dict', dtype='numpy')
    reference_sampler = ParallelSampler(reference_env, policy, num_envs=4, min_steps=10000)
    random_sampler = ParallelSampler(random_env, policy, num_envs=4, min_steps=10000)

    losses = []
    for i in range(50):
        reference_traj = StepSequence.concat(reference_sampler.sample())
        random_traj = StepSequence.concat(random_sampler.sample())
        losses.append(reward_generator.train(reference_traj, random_traj, 10))
    assert losses[-1] < losses[0]
Example #8
    def update(self, rollouts: Sequence[StepSequence]):
        r"""
        Train the particles $\mu$.

        :param rollouts: rollouts collected during sampling, one batch per particle
        """
        policy_grads = []
        parameters = []

        for i in range(self.num_particles):
            # Get the rollouts associated to the i-th particle
            concat_ros = StepSequence.concat(rollouts[i])
            concat_ros.torch()

            act_stats = compute_action_statistics(concat_ros,
                                                  self.expl_strats[i])
            act_stats_fixed = compute_action_statistics(
                concat_ros, self.fixed_expl_strats[i])

            klds = to.distributions.kl_divergence(act_stats.act_distr,
                                                  act_stats_fixed.act_distr)
            entropy = act_stats.act_distr.entropy()
            log_prob = act_stats.log_probs

            concat_ros.rewards = concat_ros.rewards - (
                0.1 * klds.mean(1)).view(-1) - 0.1 * entropy.mean(1).view(-1)

            # Update the advantage estimator's parameters and return advantage estimates
            adv = self.particles[i].critic.update(rollouts[i],
                                                  use_empirical_returns=True)

            # Estimate policy gradients
            self.optimizers[i].zero_grad()
            policy_grad = -to.mean(log_prob * adv.detach())
            policy_grad.backward()  # step comes later than usual

            # Collect flattened parameter and gradient vectors
            policy_grads.append(self.expl_strats[i].param_grad)
            parameters.append(self.expl_strats[i].param_values)

        parameters = to.stack(parameters)
        policy_grads = to.stack(policy_grads)
        Kxx, dx_Kxx = self.kernel(parameters)
        grad_theta = (to.mm(Kxx, policy_grads / self.temperature) +
                      dx_Kxx) / self.num_particles

        for i in range(self.num_particles):
            self.expl_strats[i].param_grad = grad_theta[i]
            self.optimizers[i].step()
        self.updatecount += 1
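
For orientation, the kernel-weighted gradient assembled above mirrors the Stein variational policy gradient update, sketched here in LaTeX (the prior term is omitted, as in the code, and the sign convention follows from the optimizers minimizing the stored loss gradients):

$$\phi(\theta_i) = \frac{1}{n} \sum_{j=1}^{n} \Big[ k(\theta_j, \theta_i) \, \frac{\nabla_{\theta_j} J(\theta_j)}{T} + \nabla_{\theta_j} k(\theta_j, \theta_i) \Big]$$

with $n$ particles (`num_particles`), kernel $k$ (`Kxx`, `dx_Kxx`), and temperature $T$.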
Example #9
def test_potential_policy_evaluate_packed_padded_sequences(
        env: Env, policy: RecurrentPolicy):
    # Test packed padded sequence implementation for custom recurrent neural networks

    # Get some rollouts
    ros = []
    for i in range(5):
        ro = rollout(env, policy, eval=True, render_mode=RenderMode())
        ro.torch(to.get_default_dtype())
        ros.append(ro)

    # Perform concatenation
    cat = StepSequence.concat(ros)

    # Evaluate old and new approaches
    act_new = policy.evaluate(cat)
    assert act_new is not None
Example #10
def test_twoheaded_policy_evaluate_packed_padded_sequences(
        env: Env, policy: RecurrentPolicy):
    # Test packed padded sequence implementation for custom recurrent neural networks
    def old_evaluate(rollout: StepSequence,
                     hidden_states_name: str = "hidden_states") -> to.Tensor:
        # Set policy, i.e. PyTorch nn.Module, to evaluation mode
        policy.eval()

        act_list = []
        head2_list = []
        for ro in rollout.iterate_rollouts():
            if hidden_states_name in rollout.data_names:
                # Get initial hidden state from first step
                hidden = ro[0][hidden_states_name]
            else:
                # Let the network pick the default hidden state
                hidden = None
            # Run steps consecutively reusing the hidden state
            for step in ro:
                act, head2, hidden = policy(step.observation, hidden)
                act_list.append(act)
                head2_list.append(head2)

        # Set policy, i.e. PyTorch nn.Module, back to training mode
        policy.train()

        return to.stack(act_list), to.stack(head2_list)

    # Get some rollouts
    ros = []
    for i in range(5):
        ro = rollout(env, policy, eval=True, render_mode=RenderMode())
        ro.torch(to.get_default_dtype())
        ros.append(ro)

    # Perform concatenation
    cat = StepSequence.concat(ros)

    # Evaluate old and new approaches
    output_1_old, output_2_old = old_evaluate(cat)
    output_1_new, output_2_new = policy.evaluate(cat)

    to.testing.assert_allclose(output_1_old, output_1_new)
    to.testing.assert_allclose(output_2_old, output_2_new)
Example #11
def test_concat_rollouts(env, expl_strat):
    ro1 = rollout(env, expl_strat)
    ro2 = rollout(env, expl_strat)
    ro_cat = StepSequence.concat([ro1, ro2])
    assert isinstance(ro_cat, StepSequence)
    assert ro_cat.length == ro1.length + ro2.length
Example #12
def plot_rollouts_segment_wise(
        plot_type: str,
        segments_ground_truth: List[List[StepSequence]],
        segments_multiple_envs: List[List[List[StepSequence]]],
        segments_nominal: List[List[StepSequence]],
        use_rec_str: bool,
        idx_iter: int,
        idx_round: int,
        state_labels: Optional[Iterable[str]] = None,
        act_labels: Optional[Iterable[str]] = None,
        x_limits: Optional[Tuple[int, int]] = None,
        plot_act: bool = False,
        data_field: str = "states",
        cmap_samples: Optional[colors.Colormap] = None,
        save_dir: Optional[pyrado.PathLike] = None,
        file_format: Iterable[str] = ("pdf", "pgf", "png"),
) -> List[plt.Figure]:
    r"""
    Plot the different rollouts in separate figures and the different state dimensions along the rows.

    :param plot_type: type of plot, pass "samples" to plot the rollouts of the most likely domain parameters as
                      individual lines, or pass "confidence" to plot the most likely one, and the mean $\pm$ 1 std
    :param segments_ground_truth: list of lists containing rollout segments from the ground truth environment
    :param segments_multiple_envs: list of lists of lists containing rollout segments from different environment
                                   instances, e.g. samples from a posterior coming from `NDPR`
    :param segments_nominal: list of lists containing rollout segments from the nominal environment
    :param use_rec_str: `True` if pre-recorded actions have been used to generate the rollouts
    :param idx_iter: selected iteration
    :param idx_round: selected round
    :param state_labels: y-axis labels for the state trajectories, no labels by default
    :param act_labels: y-axis labels for the action trajectories, no labels by default
    :param x_limits: tuple containing the lower and upper limits for the x-axis
    :param plot_act: if `True`, also plot the actions
    :param data_field: data field of the rollout, e.g. "states" or "observations"
    :param cmap_samples: color map for the trajectories resulting from different domain parameter samples
    :param save_dir: if not `None`, create a subfolder `plots` in `save_dir` and save the plots there
    :param file_format: select the file format to store the plots
    :return: list of handles to the created figures
    """
    if plot_type not in ["samples", "confidence"]:
        raise pyrado.ValueErr(given=plot_type,
                              eq_constraint="samples or confidence")
    if data_field not in ["states", "observations"]:
        raise pyrado.ValueErr(given=data_field,
                              eq_constraint="states or observations")

    # Extract the state dimension, and the number of most likely samples from the data
    dim_state = segments_ground_truth[0][0].get_data_values(data_field)[
        0, :].size
    dim_act = segments_ground_truth[0][0].get_data_values("actions")[0, :].size
    num_samples = len(segments_multiple_envs[0][0])

    # Validate the labels
    if state_labels is None:
        state_labels = [""] * dim_state
    else:
        if len(state_labels) != dim_state:
            raise pyrado.ShapeErr(given=state_labels,
                                  expected_match=(dim_state, ))
    if act_labels is None:
        act_labels = [""] * dim_act
    else:
        if len(act_labels) != dim_act:
            raise pyrado.ShapeErr(given=act_labels, expected_match=(dim_act, ))

    if cmap_samples is None:
        cmap_samples = plt.get_cmap("Reds")(np.linspace(0.6, 0.8, num_samples))
    fig_list = []
    label_samples = "ml" if plot_type == "confidence" else "samples"

    # Plot
    for idx_r in range(len(segments_ground_truth)):
        num_rows = dim_state + dim_act if plot_act else dim_state
        fig, axs = plt.subplots(nrows=num_rows,
                                figsize=(16, 9),
                                tight_layout=True,
                                sharex="col")
        axs = np.atleast_1d(axs)

        # Plot the states
        for idx_state in range(dim_state):
            # Plot the real segments
            cnt_step = [0]
            for segment_gt in segments_ground_truth[idx_r]:
                axs[idx_state].plot(
                    np.arange(cnt_step[-1], cnt_step[-1] + segment_gt.length),
                    segment_gt.get_data_values(data_field,
                                               truncate_last=True)[:,
                                                                   idx_state],
                    zorder=0,
                    c="black",
                    lw=1.0,
                    label="target" if cnt_step[-1] == 0 else "",  # print once
                )
                cnt_step.append(cnt_step[-1] + segment_gt.length)

            # Plot the most likely simulated segments
            for idx_seg, sml in enumerate(segments_multiple_envs[idx_r]):
                for idx_dp, sdp in enumerate(sml):
                    axs[idx_state].plot(
                        np.arange(cnt_step[idx_seg],
                                  cnt_step[idx_seg] + sdp.length),
                        sdp.get_data_values(data_field,
                                            truncate_last=True)[:, idx_state],
                        zorder=2 if idx_dp == 0 else 0,  # most likely on top
                        c=cmap_samples[idx_dp],
                        ls="-",
                        lw=1.5
                        if plot_type == "confidence" or idx_dp == 0 else 0.5,
                        alpha=1.0
                        if plot_type == "confidence" or idx_dp == 0 else 0.1,
                        label=label_samples
                        if cnt_step[idx_seg] == idx_seg == idx_dp == 0 else
                        "",  # print once
                    )
                    if plot_type != "samples":
                        # Only plot the most likely rollout, unless all domain parameter samples should be plotted
                        break

            if plot_type == "confidence":
                assert check_all_lengths_equal(
                    segments_multiple_envs[idx_r]
                    [0])  # all segments need to be equally long

                states_all = []
                for idx_dp in range(num_samples):
                    # Reconstruct the step sequences for all domain parameters
                    ss_dp = StepSequence.concat([
                        seg_dp[idx_dp] for seg_dp in
                        [segs_ro for segs_ro in segments_multiple_envs[idx_r]]
                    ])
                    states = ss_dp.get_data_values(data_field,
                                                   truncate_last=True)
                    states_all.append(states)
                states_all = np.stack(states_all, axis=0)
                states_mean = np.mean(states_all, axis=0)
                states_std = np.std(states_all, axis=0)

                for idx_seg in range(len(segments_multiple_envs[idx_r])):
                    len_segs = min([
                        len(seg)
                        for seg in segments_multiple_envs[idx_r][idx_seg]
                    ])  # use shortest
                    m_i = states_mean[cnt_step[idx_seg]:cnt_step[idx_seg] +
                                      len_segs, idx_state]
                    s_i = states_std[cnt_step[idx_seg]:cnt_step[idx_seg] +
                                     len_segs, idx_state]
                    draw_curve(
                        "std",
                        axs[idx_state],
                        pd.DataFrame(dict(mean=m_i, std=s_i)),
                        x_grid=np.arange(cnt_step[idx_seg],
                                         cnt_step[idx_seg] + len_segs),
                        show_legend=False,
                        area_label="2 std" if idx_seg == 0 else None,
                        plot_kwargs=dict(color=cmap_samples[0]),
                    )

            # Plot the nominal simulation's segments
            for idx_seg, sn in enumerate(segments_nominal[idx_r]):
                axs[idx_state].plot(
                    np.arange(cnt_step[idx_seg],
                              cnt_step[idx_seg] + sn.length),
                    sn.get_data_values(data_field,
                                       truncate_last=True)[:, idx_state],
                    zorder=2,
                    c="green",  # former: steelblue"
                    ls="--",
                    lw=1.0,
                    label="nom sim"
                    if cnt_step[idx_seg] == 0 else "",  # print once
                )

            axs[idx_state].set_ylabel(state_labels[idx_state])

        if plot_act:
            # Plot the actions
            for idx_act in range(dim_act):
                # Plot the real segments
                cnt_step = [0]
                for segment_gt in segments_ground_truth[idx_r]:
                    axs[dim_state + idx_act].plot(
                        np.arange(cnt_step[-1],
                                  cnt_step[-1] + segment_gt.length),
                        segment_gt.get_data_values(
                            "actions", truncate_last=False)[:, idx_act],
                        zorder=0,
                        c="black",
                        label="target"
                        if cnt_step[-1] == 0 else "",  # print once
                    )
                    cnt_step.append(cnt_step[-1] + segment_gt.length)

                # Plot the most likely simulated segments
                for idx_seg, sml in enumerate(segments_multiple_envs[idx_r]):
                    for idx_dp, sdp in enumerate(sml):
                        axs[dim_state + idx_act].plot(
                            np.arange(cnt_step[idx_seg],
                                      cnt_step[idx_seg] + sdp.length),
                            sdp.get_data_values("actions",
                                                truncate_last=False)[:,
                                                                     idx_act],
                            zorder=2
                            if idx_dp == 0 else 0,  # most likely on top
                            c=cmap_samples[idx_dp],
                            ls="--",
                            lw=1.5 if plot_type == "confidence" or idx_dp == 0
                            else 0.5,
                            alpha=1.0 if plot_type == "confidence"
                            or idx_dp == 0 else 0.4,
                            label=label_samples
                            if cnt_step[idx_seg] == idx_seg == idx_dp == 0 else
                            "",  # print once
                        )
                        if plot_type != "samples":
                            # Only plot the most likely rollout, unless all domain parameter samples should be plotted
                            break

                if plot_type == "confidence":
                    len_segs = len(segments_multiple_envs[idx_r][0][0])
                    assert check_all_lengths_equal(
                        segments_multiple_envs[idx_r]
                        [0])  # all segments need to be equally long

                    acts_all = []
                    for idx_dp in range(num_samples):
                        # Reconstruct the step sequences for all domain parameters
                        ss_dp = StepSequence.concat([
                            seg_dp[idx_dp] for seg_dp in [
                                segs_ro
                                for segs_ro in segments_multiple_envs[idx_r]
                            ]
                        ])
                        acts = ss_dp.get_data_values("actions",
                                                     truncate_last=False)
                        acts_all.append(acts)
                    acts_all = np.stack(acts_all, axis=0)
                    acts_mean = np.mean(acts_all, axis=0)
                    acts_std = np.std(acts_all, axis=0)

                    for idx_seg in range(len(segments_multiple_envs[idx_r])):
                        m_i = acts_mean[cnt_step[idx_seg]:cnt_step[idx_seg] +
                                        len_segs, idx_act]
                        s_i = acts_std[cnt_step[idx_seg]:cnt_step[idx_seg] +
                                       len_segs, idx_act]
                        draw_curve(
                            "std",
                            axs[dim_state + idx_act],
                            pd.DataFrame(dict(mean=m_i, std=s_i)),
                            x_grid=np.arange(cnt_step[idx_seg],
                                             cnt_step[idx_seg] + len_segs),
                            show_legend=False,
                            area_label="2 std" if idx_seg == 0 else None,
                            plot_kwargs=dict(color=cmap_samples[0]),
                        )

                # Plot the nominal simulation's segments
                for idx_seg, sn in enumerate(segments_nominal[idx_r]):
                    axs[dim_state + idx_act].plot(
                        np.arange(cnt_step[idx_seg],
                                  cnt_step[idx_seg] + sn.length),
                        sn.get_data_values("actions",
                                           truncate_last=False)[:, idx_act],
                        zorder=2,
                        c="steelblue",
                        ls="-.",
                        label="nom sim"
                        if cnt_step[idx_seg] == 0 else "",  # print once
                    )

                axs[dim_state + idx_act].set_ylabel(act_labels[idx_act])

        # Settings for all subplots
        for idx_axs in range(num_rows):
            if x_limits is not None:
                axs[idx_axs].set_xlim(x_limits[0], x_limits[1])

        # Set the window title and the legend, placing the latter above the plot and expanding it to full width
        use_rec_str = ", using rec actions" if use_rec_str else ""
        round_str = f"round {idx_round}, " if idx_round != -1 else ""
        fig.canvas.manager.set_window_title(
            f"Target Domain and Simulated Rollouts (iteration {idx_iter}, {round_str}rollout {idx_r}{use_rec_str})"
        )
        lg = axs[0].legend(
            ncol=2 + num_samples,
            bbox_to_anchor=(0.0, 1.02, 1.0, 0.102),
            loc="lower left",
            mode="expand",
            borderaxespad=0.0,
        )

        # Save if desired
        if save_dir is not None:
            for fmt in file_format:
                os.makedirs(os.path.join(save_dir, "plots"), exist_ok=True)
                len_seg_str = f"seglen_{segments_ground_truth[0][0].length}"
                use_rec_str = "_use_rec" if use_rec_str else ""
                round_str = f"_round_{idx_round}" if idx_round != -1 else ""
                fig.savefig(
                    os.path.join(
                        save_dir,
                        "plots",
                        f"posterior_iter_{idx_iter}{round_str}_rollout_{idx_r}_{len_seg_str}{use_rec_str}.{fmt}",
                    ),
                    bbox_extra_artists=(lg, ),
                    dpi=150,
                )

        # Append current figure
        fig_list.append(fig)

    return fig_list
Example #13
    def update(self, rollouts: Sequence[StepSequence], use_empirical_returns: bool = False):
        """
        Adapt the parameters of the advantage function estimator, minimizing the MSE loss for the given samples.

        :param rollouts: batch of rollouts
        :param use_empirical_returns: use the returns from the rollouts (True) or the ones from the V-fcn (False)
        :return adv: tensor of advantages after V-function updates
        """
        # Turn the batch of rollouts into a list of steps
        concat_ros = StepSequence.concat(rollouts)
        concat_ros.torch(data_type=to.get_default_dtype())

        if use_empirical_returns:
            # Compute the value targets (empirical discounted returns) for all samples
            v_targ = discounted_values(rollouts, self.gamma).view(-1, 1)
        else:
            # Use the value function to compute the value targets (also called bootstrapping)
            v_targ = self.tdlamda_returns(concat_ros=concat_ros)
        concat_ros.add_data('v_targ', v_targ)

        # Logging
        with to.no_grad():
            v_pred_old = self.values(concat_ros)
            loss_old = self.loss_fcn(v_pred_old, v_targ)
        vfcn_grad_norm = []

        # Iterate over all gathered samples num_epoch times
        for e in range(self.num_epoch):

            for batch in tqdm(concat_ros.split_shuffled_batches(
                self.batch_size, complete_rollouts=isinstance(self.vfcn, RecurrentPolicy)),
                total=num_iter_from_rollouts(None, concat_ros, self.batch_size),
                desc=f'Epoch {e}', unit='batches', file=sys.stdout, leave=False):
                # Reset the gradients
                self.optim.zero_grad()

                # Make predictions for this mini-batch using values function
                v_pred = self.values(batch)

                # Compute estimator loss for this mini-batch and backpropagate
                vfcn_loss = self.loss_fcn(v_pred, batch.v_targ)
                vfcn_loss.backward()

                # Clip the gradients if desired
                vfcn_grad_norm.append(Algorithm.clip_grad(self.vfcn, self.max_grad_norm))

                # Call optimizer
                self.optim.step()

            # Update the learning rate if a scheduler has been specified
            if self._lr_scheduler is not None:
                self._lr_scheduler.step()

        # Estimate the advantage after fitting the parameters of the V-fcn
        adv = self.gae(concat_ros)  # is done with to.no_grad()

        with to.no_grad():
            v_pred_new = self.values(concat_ros)
            loss_new = self.loss_fcn(v_pred_new, v_targ)
            vfcn_loss_impr = loss_old - loss_new  # positive values are desired
            explvar = explained_var(v_pred_new, v_targ)  # values close to 1 are desired

        # Log the metrics computed above (loss improvement and explained variance after the update)
        self.logger.add_value('explained var critic', explvar, 4)
        self.logger.add_value('loss improv critic', vfcn_loss_impr, 4)
        self.logger.add_value('avg grad norm critic', np.mean(vfcn_grad_norm), 4)
        if self._lr_scheduler is not None:
            self.logger.add_value('lr critic', self._lr_scheduler.get_last_lr(), 6)

        return adv
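
For reference, `self.gae` and `self.tdlamda_returns` refer to generalized advantage estimation; a sketch of the standard formulas (Schulman et al., 2016), with discount factor $\gamma$ and GAE decay $\lambda$:

$$\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad \hat{A}_t = \sum_{l \geq 0} (\gamma\lambda)^l \, \delta_{t+l}$$

The bootstrapped value targets used above then correspond to $V(s_t) + \hat{A}_t$ (the usual TD($\lambda$) relation; not necessarily the exact implementation).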
Example #14
    def update(self, rollouts: Sequence[StepSequence]):
        # Turn the batch of rollouts into a list of steps
        concat_ros = StepSequence.concat(rollouts)
        concat_ros.torch(data_type=to.get_default_dtype())

        with to.no_grad():
            # Compute the action probabilities using the old (before update) policy
            act_stats = compute_action_statistics(concat_ros, self._expl_strat)
            log_probs_old = act_stats.log_probs
            act_distr_old = act_stats.act_distr

            # Compute value predictions using the old (before update) value function
            v_pred_old = self._critic.values(concat_ros)

        # Attach advantages and old log probs to rollout
        concat_ros.add_data('log_probs_old', log_probs_old)
        concat_ros.add_data('v_pred_old', v_pred_old)

        # For logging the gradient norms
        policy_grad_norm = []
        value_fcn_grad_norm = []

        # Compute the value targets (empirical discounted returns) for all samples before fitting the V-fcn parameters
        adv = self._critic.gae(concat_ros)  # done with to.no_grad()
        v_targ = discounted_values(rollouts, self._critic.gamma).view(
            -1, 1)  # empirical discounted returns
        concat_ros.add_data('adv', adv)
        concat_ros.add_data('v_targ', v_targ)

        # Iterations over the whole data set
        for e in range(self.num_epoch):

            for batch in tqdm(concat_ros.split_shuffled_batches(
                    self.batch_size,
                    complete_rollouts=self._policy.is_recurrent
                    or isinstance(self._critic.value_fcn, RecurrentPolicy)),
                              total=num_iter_from_rollouts(
                                  None, concat_ros, self.batch_size),
                              desc=f'Epoch {e}',
                              unit='batches',
                              file=sys.stdout,
                              leave=False):
                # Reset the gradients
                self.optim.zero_grad()

                # Compute log of the action probabilities for the mini-batch
                log_probs = compute_action_statistics(
                    batch, self._expl_strat).log_probs.to(self.policy.device)

                # Compute value predictions for the mini-batch
                v_pred = self._critic.values(batch)

                # Compute combined loss and backpropagate
                loss = self.loss_fcn(log_probs, batch.log_probs_old, batch.adv,
                                     v_pred, batch.v_pred_old, batch.v_targ)
                loss.backward()

                # Clip the gradients if desired
                policy_grad_norm.append(
                    self.clip_grad(self._expl_strat.policy,
                                   self.max_grad_norm))
                value_fcn_grad_norm.append(
                    self.clip_grad(self._critic.value_fcn, self.max_grad_norm))

                # Call optimizer
                self.optim.step()

                if to.isnan(self._expl_strat.noise.std).any():
                    raise RuntimeError(
                        f'At least one exploration parameter became NaN! The exploration parameters are'
                        f'\n{self._expl_strat.std.detach().numpy()}')

            # Update the learning rate if a scheduler has been specified
            if self._lr_scheduler is not None:
                self._lr_scheduler.step()

        # Additional logging
        if self.log_loss:
            with to.no_grad():
                # Compute value predictions using the new (after the updates) value function approximator
                v_pred = self._critic.values(concat_ros).to(self.policy.device)
                v_loss_old = self._critic.loss_fcn(
                    v_pred_old.to(self.policy.device),
                    v_targ.to(self.policy.device)).to(self.policy.device)
                v_loss_new = self._critic.loss_fcn(v_pred, v_targ).to(
                    self.policy.device)
                value_fcn_loss_impr = v_loss_old - v_loss_new  # positive values are desired

                # Compute the action probabilities using the new (after the updates) policy
                act_stats = compute_action_statistics(concat_ros,
                                                      self._expl_strat)
                log_probs_new = act_stats.log_probs
                act_distr_new = act_stats.act_distr
                loss_after = self.loss_fcn(log_probs_new, log_probs_old, adv,
                                           v_pred, v_pred_old, v_targ)
                kl_avg = to.mean(kl_divergence(
                    act_distr_old,
                    act_distr_new))  # mean seeking a.k.a. inclusive KL

                # Compute explained variance (after the updates)
                explvar = explained_var(v_pred, v_targ)
                self.logger.add_value('explained var',
                                      explvar.detach().numpy())
                self.logger.add_value('V-fcn loss improvement',
                                      value_fcn_loss_impr.detach().numpy())
                self.logger.add_value('loss after',
                                      loss_after.detach().numpy())
                self.logger.add_value('KL(old_new)', kl_avg.item())

        # Logging
        self.logger.add_value(
            'avg expl strat std',
            to.mean(self._expl_strat.noise.std.data).detach().numpy())
        self.logger.add_value('expl strat entropy',
                              self._expl_strat.noise.get_entropy().item())
        self.logger.add_value('avg policy grad norm',
                              np.mean(policy_grad_norm))
        self.logger.add_value('avg V-fcn grad norm',
                              np.mean(value_fcn_grad_norm))
        if self._lr_scheduler is not None:
            self.logger.add_value('learning rate', self._lr_scheduler.get_lr())
Example #15
    def update(self, rollouts: Sequence[StepSequence]):
        # Turn the batch of rollouts into a list of steps
        concat_ros = StepSequence.concat(rollouts)
        concat_ros.torch(data_type=to.get_default_dtype())

        # Compute the value targets (empirical discounted returns) for all samples before fitting the V-fcn parameters
        adv = self._critic.gae(concat_ros)  # done with to.no_grad()
        v_targ = discounted_values(rollouts, self._critic.gamma).view(-1, 1).to(self.policy.device)  # empirical discounted returns

        with to.no_grad():
            # Compute value predictions and the GAE using the old (before the updates) value function approximator
            v_pred = self._critic.values(concat_ros)

            # Compute the action probabilities using the old (before update) policy
            act_stats = compute_action_statistics(concat_ros, self._expl_strat)
            log_probs_old = act_stats.log_probs
            act_distr_old = act_stats.act_distr
            loss_before = self.loss_fcn(log_probs_old, adv, v_pred, v_targ)
            self.logger.add_value('loss before', loss_before, 4)

        concat_ros.add_data('adv', adv)
        concat_ros.add_data('v_targ', v_targ)

        # For logging the gradients' norms
        policy_grad_norm = []

        for batch in tqdm(concat_ros.split_shuffled_batches(
            self.batch_size,
            complete_rollouts=self._policy.is_recurrent or isinstance(self._critic.vfcn, RecurrentPolicy)),
            total=num_iter_from_rollouts(None, concat_ros, self.batch_size),
            desc='Updating', unit='batches', file=sys.stdout, leave=False):
            # Reset the gradients
            self.optim.zero_grad()

            # Compute log of the action probabilities for the mini-batch
            log_probs = compute_action_statistics(batch, self._expl_strat).log_probs

            # Compute value predictions for the mini-batch
            v_pred = self._critic.values(batch)

            # Compute combined loss and backpropagate
            loss = self.loss_fcn(log_probs, batch.adv, v_pred, batch.v_targ)
            loss.backward()

            # Clip the gradients if desired
            policy_grad_norm.append(self.clip_grad(self.expl_strat.policy, self.max_grad_norm))

            # Call optimizer
            self.optim.step()

        # Update the learning rate if a scheduler has been specified
        if self._lr_scheduler is not None:
            self._lr_scheduler.step()

        if to.isnan(self.expl_strat.noise.std).any():
            raise RuntimeError(f'At least one exploration parameter became NaN! The exploration parameters are'
                               f'\n{self.expl_strat.std.item()}')

        # Logging
        with to.no_grad():
            # Compute value predictions and the GAE using the new (after the updates) value function approximator
            v_pred = self._critic.values(concat_ros).to(self.policy.device)
            adv = self._critic.gae(concat_ros)  # done with to.no_grad()

            # Compute the action probabilities using the new (after the updates) policy
            act_stats = compute_action_statistics(concat_ros, self._expl_strat)
            log_probs_new = act_stats.log_probs
            act_distr_new = act_stats.act_distr
            loss_after = self.loss_fcn(log_probs_new, adv, v_pred, v_targ)
            kl_avg = to.mean(
                kl_divergence(act_distr_old, act_distr_new))  # mean seeking a.k.a. inclusive KL
            explvar = explained_var(v_pred, v_targ)  # values close to 1 are desired
            self.logger.add_value('loss after', loss_after, 4)
            self.logger.add_value('KL(old_new)', kl_avg, 4)
            self.logger.add_value('explained var', explvar, 4)

        ent = self.expl_strat.noise.get_entropy()
        self.logger.add_value('avg expl strat std', to.mean(self.expl_strat.noise.std), 4)
        self.logger.add_value('expl strat entropy', to.mean(ent), 4)
        self.logger.add_value('avg grad norm policy', np.mean(policy_grad_norm), 4)
        if self._lr_scheduler is not None:
            self.logger.add_value('avg lr', np.mean(self._lr_scheduler.get_last_lr()), 6)
Example #16
    def update(self, rollouts: Sequence[StepSequence]):
        # Turn the batch of rollouts into a list of steps
        concat_ros = StepSequence.concat(rollouts)
        concat_ros.torch(data_type=to.get_default_dtype())

        # Update the advantage estimator's parameters and return advantage estimates
        adv = self._critic.update(rollouts, use_empirical_returns=False)

        with to.no_grad():
            # Compute the action probabilities using the old (before update) policy
            act_stats = compute_action_statistics(concat_ros, self._expl_strat)
            log_probs_old = act_stats.log_probs
            act_distr_old = act_stats.act_distr

        # Attach advantages and old log probs to rollout
        concat_ros.add_data('adv', adv)
        concat_ros.add_data('log_probs_old', log_probs_old)

        # For logging the gradient norms
        policy_grad_norm = []

        # Iterations over the whole data set
        for e in range(self.num_epoch):

            for batch in tqdm(concat_ros.split_shuffled_batches(
                    self.batch_size,
                    complete_rollouts=self._policy.is_recurrent),
                              total=num_iter_from_rollouts(
                                  None, concat_ros, self.batch_size),
                              desc=f'Epoch {e}',
                              unit='batches',
                              file=sys.stdout,
                              leave=False):
                # Reset the gradients
                self.optim.zero_grad()

                # Compute log of the action probabilities for the mini-batch
                log_probs = compute_action_statistics(
                    batch, self._expl_strat).log_probs

                # Compute policy loss and backpropagate
                loss = self.loss_fcn(log_probs, batch.log_probs_old, batch.adv)
                loss.backward()

                # Clip the gradients if desired
                policy_grad_norm.append(
                    self.clip_grad(self._expl_strat.policy,
                                   self.max_grad_norm))

                # Call optimizer
                self.optim.step()

                if to.isnan(self._expl_strat.noise.std).any():
                    raise RuntimeError(
                        f'At least one exploration parameter became NaN! The exploration parameters are'
                        f'\n{self._expl_strat.std.detach().numpy()}')

            # Update the learning rate if a scheduler has been specified
            if self._lr_scheduler is not None:
                self._lr_scheduler.step()

        # Additional logging
        if self.log_loss:
            with to.no_grad():
                act_stats = compute_action_statistics(concat_ros,
                                                      self._expl_strat)
                log_probs_new = act_stats.log_probs
                act_distr_new = act_stats.act_distr
                loss_after = self.loss_fcn(log_probs_new, log_probs_old, adv)
                kl_avg = to.mean(kl_divergence(
                    act_distr_old,
                    act_distr_new))  # mean seeking a.k.a. inclusive KL
                self.logger.add_value('loss after',
                                      loss_after.detach().numpy())
                self.logger.add_value('KL(old_new)', kl_avg.item())

        # Logging
        self.logger.add_value(
            'avg expl strat std',
            to.mean(self._expl_strat.noise.std.data).detach().numpy())
        self.logger.add_value('expl strat entropy',
                              self._expl_strat.noise.get_entropy().item())
        self.logger.add_value('avg policy grad norm',
                              np.mean(policy_grad_norm))
        if self._lr_scheduler is not None:
            self.logger.add_value('learning rate', self._lr_scheduler.get_lr())
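
The three-argument loss above (new log-probabilities, old log-probabilities, advantages) is consistent with the clipped PPO surrogate objective; as a sketch, not necessarily the exact implementation:

$$r_t(\theta) = \exp\big(\log \pi_\theta(a_t \mid s_t) - \log \pi_{\theta_\mathrm{old}}(a_t \mid s_t)\big), \qquad L^{\mathrm{clip}}(\theta) = -\mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon)\,\hat{A}_t\big)\Big]$$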
Example #17
def experiment_w_distruber(env_real: RealEnv, env_sim: SimEnv):
    # Wrap the environment in the same way as during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt('Running the evaluation policy ...', 'c', bright=True)
    ro1 = rollout(env_real,
                  policy,
                  eval=True,
                  max_steps=args.max_steps // 3,
                  render_mode=RenderMode(),
                  no_reset=True,
                  no_close=True)

    # Run disturber
    env_real = inner_env(env_real)  # since we are reusing it
    print_cbt('Running the 1st disturber ...', 'c', bright=True)
    rollout(env_real,
            disturber_pos,
            eval=True,
            max_steps=steps_disturb,
            render_mode=RenderMode(),
            no_reset=True,
            no_close=True)

    # Wrap the environment in the same way as during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt('Running the evaluation policy ...', 'c', bright=True)
    ro2 = rollout(env_real,
                  policy,
                  eval=True,
                  max_steps=args.max_steps // 3,
                  render_mode=RenderMode(),
                  no_reset=True,
                  no_close=True)

    # Run disturber
    env_real = inner_env(env_real)  # since we are reusing it
    print_cbt('Running the 2nd disturber ...', 'c', bright=True)
    rollout(env_real,
            disturber_neg,
            eval=True,
            max_steps=steps_disturb,
            render_mode=RenderMode(),
            no_reset=True,
            no_close=True)

    # Wrap the environment in the same way as during training
    env_real = wrap_like_other_env(env_real, env_sim)

    # Run learned policy on the device
    print_cbt('Running the evaluation policy ...', 'c', bright=True)
    ro3 = rollout(env_real,
                  policy,
                  eval=True,
                  max_steps=args.max_steps // 3,
                  render_mode=RenderMode(),
                  no_reset=True,
                  no_close=True)

    return StepSequence.concat([ro1, ro2, ro3])
Example #18
    def step(self, snapshot_mode: str, meta_info: dict = None, parallel: bool = True):
        rand_trajs = []
        ref_trajs = []
        ros = []
        visited = []
        for i in range(self.svpg.num_particles):
            done = False
            svpg_env = self.svpg_wrapper
            state = svpg_env.reset()
            states = []
            actions = []
            rewards = []
            infos = []
            rand_trajs_now = []
            ref_trajs_now = []
            if parallel:
                with to.no_grad():
                    for t in range(10):
                        action = self.svpg.expl_strats[i](
                            to.as_tensor(state, dtype=to.get_default_dtype())).detach().numpy()
                        state = svpg_env.lite_step(action)
                        states.append(state)
                        actions.append(action)
                    visited.append(states)
                    rewards, rand_trajs_now, ref_trajs_now = svpg_env.eval_states(states)
                    rand_trajs += rand_trajs_now
                    ref_trajs += ref_trajs_now
                    ros.append(StepSequence(observations=states, actions=actions, rewards=rewards))
            else:
                with to.no_grad():
                    while not done:
                        action = self.svpg.expl_strats[i](
                            to.as_tensor(state, dtype=to.get_default_dtype())).detach().numpy()
                        state, reward, done, info = svpg_env.step(action)
                        print(self.params.array_to_dict(state), ' => ', reward)
                        states.append(state)
                        rewards.append(reward)
                        actions.append(action)
                        infos.append(info)
                        rand_trajs += info['rand']
                        ref_trajs += info['ref']
                    ros.append(StepSequence(observations=states, actions=actions, rewards=rewards))
            self.logger.add_value(f'SVPG_agent_{i}_mean_reward', np.mean(rewards))
            ros[i].torch(data_type=to.DoubleTensor)
            for rt in rand_trajs_now:
                rt.torch(data_type=to.double)
                rt.observations = rt.observations.double().detach()
                rt.actions = rt.actions.double().detach()
            self.subroutine.update(rand_trajs_now)

        # Logging
        rets = [ro.undiscounted_return() for ro in rand_trajs]
        ret_avg = np.mean(rets)
        ret_med = np.median(rets)
        ret_std = np.std(rets)
        self.logger.add_value('num rollouts', len(rand_trajs))
        self.logger.add_value('avg rollout len', np.mean([ro.length for ro in rand_trajs]))
        self.logger.add_value('avg return', ret_avg)
        self.logger.add_value('median return', ret_med)
        self.logger.add_value('std return', ret_std)

        # Flatten and combine all randomized and reference trajectories for discriminator
        flattened_randomized = StepSequence.concat(rand_trajs)
        flattened_randomized.torch(data_type=to.double)
        flattened_reference = StepSequence.concat(ref_trajs)
        flattened_reference.torch(data_type=to.double)
        self.reward_generator.train(flattened_reference, flattened_randomized, self.num_discriminator_epoch)

        if self.curr_time_step > self.warm_up_time:
            # Update the particles
            # List of lists to comply with interface
            self.svpg.update(list(map(lambda x: [x], ros)))
        flattened_randomized.torch(data_type=to.double)
        flattened_randomized.observations = flattened_randomized.observations.double().detach()
        flattened_randomized.actions = flattened_randomized.actions.double().detach()

        # np.save(f'{self.save_dir}actions{self.curr_iter}', flattened_randomized.actions)
        self.make_snapshot(snapshot_mode, float(ret_avg), meta_info)
        self.subroutine.make_snapshot(snapshot_mode='best', curr_avg_ret=float(ret_avg))
        self.curr_time_step += 1
Example #19
            )
            segment_nom.append(sn)
            if args.use_rec:
                check_act_equal(segment_real, sn, check_applied=hasattr(sn, "actions_applied"))

            # Pad if necessary
            StepSequence.pad(sn, segment_real.length)

        # Append individual segments
        segments_nom.append(segment_nom)
    assert len(segments_nom) == len(segments_ml_all)

    # Get the states for computing the performance metrics
    states_real = np.stack([ro.get_data_values(args.data_type, truncate_last=True) for ro in rollouts_real], axis=0)
    states_nom = np.stack(
        [StepSequence.concat(segs_nom).get_data_values(args.data_type) for segs_nom in segments_nom], axis=0
    )
    states_ml = np.stack(  # index 0 is the most likely
        [
            StepSequence.concat([s[0] for s in [segs_ml for segs_ml in segments_ml]]).get_data_values(args.data_type)
            for segments_ml in segments_ml_all
        ],
        axis=0,
    )
    assert states_real.shape == states_nom.shape == states_ml.shape
    assert states_real.shape[0] == num_rollouts_real

    # Compute the DTW and RMSE distance and store it in a table
    compute_traj_distance_metrics(states_real, states_ml, states_nom, num_rollouts_real)

    # Optionally masks out some states/observations and actions for plotting