Beispiel #1
0
    def define_forward_pass(self):
        """Build the graph from (observation, action) to predicted next observation.

        Reads the placeholders ``obs_pl``, ``acs_pl`` and the matching
        ``*_mean_pl`` / ``*_std_pl`` statistics placeholders, and stores three
        results on ``self``:

        * ``delta_pred_normalized``   -- raw output of the MLP
        * ``delta_pred_unnormalized`` -- that output passed through ``unnormalize``
        * ``next_obs_pred``           -- current observation plus the unnormalized delta
        """
        raw_obs = self.obs_pl
        raw_acs = self.acs_pl

        # Rescale each input with its own mean/std placeholders.
        scaled_obs = normalize(raw_obs, self.obs_mean_pl, self.obs_std_pl)
        scaled_acs = normalize(raw_acs, self.acs_mean_pl, self.acs_std_pl)

        # The network sees observation and action joined along axis 1.
        network_input = tf.concat([scaled_obs, scaled_acs], axis=1)

        # "delta" denotes the change in state (s' - s). The MLP output is kept
        # as the *normalized* delta; it is mapped back with the delta statistics.
        # NOTE(review): build_mlp is defined elsewhere; arguments are passed
        # positionally in the same order as before -- confirm against its signature.
        self.delta_pred_normalized = build_mlp(
            network_input,
            self.ob_dim,
            self.scope,
            self.n_layers,
            self.size,
        )
        self.delta_pred_unnormalized = unnormalize(
            self.delta_pred_normalized, self.delta_mean_pl, self.delta_std_pl
        )

        # The next-observation prediction is unnormalized.
        self.next_obs_pred = raw_obs + self.delta_pred_unnormalized
Beispiel #2
0
    def estimate_advantage(self, obs, q_values):
        """Turn estimated Q values into advantage estimates.

        With ``self.nn_baseline`` set, the actor's baseline prediction is
        rescaled to the mean/std of the current ``q_values`` batch and
        subtracted from ``q_values``. Otherwise a copy of ``q_values`` is used
        unchanged. With ``self.standardize_advantages`` set, the result is
        then passed through ``utils.normalize`` using its own mean and
        standard deviation.

        :param obs: Observations forwarded to ``self.actor.run_baseline_prediction``.
        :param q_values: Estimated Q values for the batch.
        :return: The (possibly baseline-subtracted, possibly standardized) advantages.
        """
        if not self.nn_baseline:
            # No learned baseline: the advantage is simply Q.
            advantages = q_values.copy()
        else:
            predicted = self.actor.run_baseline_prediction(obs)

            # Same rank on both sides guards against silent broadcasting.
            assert predicted.ndim == q_values.ndim

            # Per the original note, the baseline was trained on standardized
            # Q values, so its output is mapped onto this batch's Q statistics.
            q_mean = np.mean(q_values)
            q_std = np.std(q_values)
            baselines = utils.unnormalize(predicted, q_mean, q_std)

            advantages = q_values - baselines

        if self.standardize_advantages:
            adv_mean, adv_std = np.mean(advantages), np.std(advantages)
            advantages = utils.normalize(advantages, adv_mean, adv_std)

        return advantages
Beispiel #3
0
    def _get_next_obs_prediction(self, observations, actions, data_statistics):
        """Predict the next observations for the given observation/action pairs.

        The normalized delta returned by ``self._forward_delta_pred_normalized``
        is unnormalized with ``data_statistics['delta_mean']`` and
        ``data_statistics['delta_std']`` and then added to ``observations``.

        :return: ``observations`` plus the unnormalized predicted delta.
        """
        normalized_delta = self._forward_delta_pred_normalized(
            observations, actions, data_statistics
        )
        delta = unnormalize(
            normalized_delta,
            data_statistics['delta_mean'],
            data_statistics['delta_std'],
        )
        # The prediction is in unnormalized observation space.
        return observations + delta
    def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
    ):
        """Predict the next observation from unnormalized observations and actions.

        :param obs_unnormalized: Unnormalized observations (converted with ``ptu.from_numpy``)
        :param acs_unnormalized: Unnormalized actions (converted with ``ptu.from_numpy``)
        :param obs_mean: Mean of observations
        :param obs_std: Standard deviation of observations
        :param acs_mean: Mean of actions
        :param acs_std: Standard deviation of actions
        :param delta_mean: Mean of state difference `s_t+1 - s_t`.
        :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
        :return: tuple `(next_obs_pred, delta_pred_normalized)` where
            `next_obs_pred` is the predicted `s_t+1` and
            `delta_pred_normalized` is the raw (still normalized) output of
            the delta network.
        """
        obs = ptu.from_numpy(obs_unnormalized)
        acs = ptu.from_numpy(acs_unnormalized)

        # NOTE(review): presumably update_statistics stores the statistics on
        # self; the self.* attributes below are read right after this call.
        self.update_statistics(
            obs_mean, obs_std, acs_mean, acs_std, delta_mean, delta_std
        )

        # Network input: normalized observation and action joined along dim 1.
        network_input = torch.cat(
            [
                normalize(obs, self.obs_mean, self.obs_std),
                normalize(acs, self.acs_mean, self.acs_std),
            ],
            dim=1,
        )

        # The network outputs the *normalized* change in state,
        # i.e. normalized(s_t+1 - s_t).
        delta_pred_normalized = self.delta_network(network_input)

        # Undo the normalization, then add the delta to the current observation.
        next_obs_pred = obs + unnormalize(
            delta_pred_normalized, self.delta_mean, self.delta_std
        )
        return next_obs_pred, delta_pred_normalized
Beispiel #5
0
    def define_forward_pass(self):
        """Wire up the dynamics-model graph and store its outputs on ``self``.

        Sets ``delta_pred_normalized`` (MLP output), ``delta_pred_unnormalized``
        (that output passed through ``unnormalize`` with the delta statistics
        placeholders) and ``next_obs_pred`` (``obs_pl`` plus the unnormalized
        delta).
        """
        observations = self.obs_pl
        actions = self.acs_pl

        # Both inputs are normalized with their statistics placeholders and
        # joined along axis 1 to form the network input.
        mlp_input = tf.concat(
            [
                normalize(observations, self.obs_mean_pl, self.obs_std_pl),
                normalize(actions, self.acs_mean_pl, self.acs_std_pl),
            ],
            axis=1,
        )

        # The "delta" prefix marks a change in state, i.e. (s' - s).
        # NOTE(review): build_mlp lives elsewhere; positional argument order is
        # preserved from the original call -- confirm against its signature.
        normalized_delta = build_mlp(
            mlp_input, self.ob_dim, self.scope, self.n_layers, self.size
        )
        self.delta_pred_normalized = normalized_delta

        unnormalized_delta = unnormalize(
            normalized_delta, self.delta_mean_pl, self.delta_std_pl
        )
        self.delta_pred_unnormalized = unnormalized_delta

        self.next_obs_pred = observations + unnormalized_delta
    def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
    ):
        """Predict the next observation from unnormalized observations and actions.

        :param obs_unnormalized: Unnormalized observations (converted with ``ptu.from_numpy``)
        :param acs_unnormalized: Unnormalized actions (converted with ``ptu.from_numpy``)
        :param obs_mean: Mean of observations
        :param obs_std: Standard deviation of observations
        :param acs_mean: Mean of actions
        :param acs_std: Standard deviation of actions
        :param delta_mean: Mean of state difference `s_t+1 - s_t`.
        :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
        :return: tuple `(next_obs_pred, delta_pred_normalized)` where
            `next_obs_pred` is the predicted `s_t+1` and
            `delta_pred_normalized` is the raw (still normalized) output of
            the delta network.
        """
        # update_statistics hands back the six statistics in the same order;
        # per the original comment this step converts them to tensors.
        stats = self.update_statistics(
            obs_mean, obs_std, acs_mean, acs_std, delta_mean, delta_std
        )
        mu_obs, sigma_obs, mu_acs, sigma_acs, mu_delta, sigma_delta = stats

        obs = ptu.from_numpy(obs_unnormalized)
        acs = ptu.from_numpy(acs_unnormalized)

        # Normalize the inputs and join them along dim 1 for the network.
        obs_scaled = normalize(obs, mu_obs, sigma_obs)
        acs_scaled = normalize(acs, mu_acs, sigma_acs)
        network_input = torch.cat([obs_scaled, acs_scaled], dim=1)

        # The network outputs the *normalized* change in state,
        # i.e. normalized(s_t+1 - s_t).
        delta_pred_normalized = self.delta_network(network_input)

        delta_pred = unnormalize(delta_pred_normalized, mu_delta, sigma_delta)
        next_obs_pred = obs + delta_pred
        return next_obs_pred, delta_pred_normalized
Beispiel #7
0
    def get_prediction(self, obs, acs, data_statistics):
        """Predict the next observation(s) as a NumPy array.

        :param obs: Observation(s); if either ``obs`` or ``acs`` has a 1-D
            shape, both are squeezed and given a leading axis.
        :param acs: Action(s), handled the same way as ``obs``.
        :param data_statistics: Mapping with the keys ``obs_mean``, ``obs_std``,
            ``acs_mean``, ``acs_std``, ``delta_mean`` and ``delta_std``.
        :return: ``obs`` (after the optional reshaping) plus the unnormalized
            delta predicted by ``self.delta_func``.
        """
        needs_leading_axis = len(obs.shape) == 1 or len(acs.shape) == 1
        if needs_leading_axis:
            obs = np.squeeze(obs)[np.newaxis]
            acs = np.squeeze(acs)[np.newaxis]

        stats = data_statistics
        obs_scaled = normalize(obs, stats['obs_mean'], stats['obs_std'])
        acs_scaled = normalize(acs, stats['acs_mean'], stats['acs_std'])

        # Join along axis 1 and move the network input to the model's device.
        stacked = np.concatenate((obs_scaled, acs_scaled), axis=1)
        network_input = torch.Tensor(stacked).to(self.device)

        # Bring the network output back to a NumPy array on the CPU.
        network_output = self.delta_func(network_input)
        delta_scaled = network_output.cpu().detach().numpy()

        delta = unnormalize(
            delta_scaled, stats['delta_mean'], stats['delta_std']
        )
        return obs + delta
Beispiel #8
0
    def forward(  # input and output are both tensors
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
    ):
        """Predict the next observation and expose the normalized delta.

        :param obs_unnormalized: Unnormalized observations
        :param acs_unnormalized: Unnormalized actions
        :param obs_mean: Mean of observations
        :param obs_std: Standard deviation of observations
        :param acs_mean: Mean of actions
        :param acs_std: Standard deviation of actions
        :param delta_mean: Mean of state difference `s_t+1 - s_t`.
        :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
        :return: tuple `(next_obs_pred, delta_pred_normalized)` where
            `next_obs_pred` is the predicted `s_t+1` and
            `delta_pred_normalized` is the raw (still normalized) output of
            the delta network.
        """
        # Network input: normalized observation and action joined along dim 1.
        network_input = torch.cat(
            [
                normalize(obs_unnormalized, obs_mean, obs_std),
                normalize(acs_unnormalized, acs_mean, acs_std),
            ],
            dim=1,
        )

        # The network outputs the *normalized* change in state,
        # i.e. normalized(s_t+1 - s_t).
        delta_pred_normalized = self.delta_network(network_input)

        # Map the delta back to unnormalized space and add the current state.
        delta_pred = unnormalize(delta_pred_normalized, delta_mean, delta_std)
        next_obs_pred = delta_pred + obs_unnormalized

        return next_obs_pred, delta_pred_normalized
    def get_prediction(self, obs, acs, data_statistics):
        """Return the predicted next observation(s) as a NumPy array.

        If either ``obs`` or ``acs`` has a 1-D shape, both are squeezed and
        given a leading axis first. The inputs are normalized with the
        ``obs_*`` / ``acs_*`` entries of ``data_statistics``, run through
        ``self.delta_func``, and the result is unnormalized with the
        ``delta_*`` entries before being added to ``obs``.
        """
        if len(obs.shape) == 1 or len(acs.shape) == 1:
            obs, acs = np.squeeze(obs)[np.newaxis], np.squeeze(acs)[np.newaxis]

        # Normalize observations and actions with the dataset statistics.
        scaled_obs = normalize(
            obs, data_statistics['obs_mean'], data_statistics['obs_std']
        )
        scaled_acs = normalize(
            acs, data_statistics['acs_mean'], data_statistics['acs_std']
        )

        # Build the network input on the model's device.
        joined = np.concatenate((scaled_obs, scaled_acs), axis=1)
        model_input = torch.Tensor(joined).to(self.device)

        # Run the delta network and pull the result back to NumPy on the CPU.
        scaled_delta = self.delta_func(model_input).cpu().detach().numpy()

        # Undo the delta normalization; next observation = obs + delta.
        predicted_delta = unnormalize(
            scaled_delta,
            data_statistics['delta_mean'],
            data_statistics['delta_std'],
        )
        return obs + predicted_delta
Beispiel #10
0
    def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
    ):
        """Run the delta network and predict the next observation.

        :param obs_unnormalized: Unnormalized observations
        :param acs_unnormalized: Unnormalized actions
        :param obs_mean: Mean of observations
        :param obs_std: Standard deviation of observations
        :param acs_mean: Mean of actions
        :param acs_std: Standard deviation of actions
        :param delta_mean: Mean of state difference `s_t+1 - s_t`.
        :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
        :return: tuple `(next_obs_pred, delta_pred_normalized)` where
            `next_obs_pred` is the predicted `s_t+1` and
            `delta_pred_normalized` is the raw (still normalized) output of
            the delta network.
        """
        # Normalize both inputs with their statistics.
        scaled_obs = normalize(obs_unnormalized, obs_mean, obs_std)
        scaled_acs = normalize(acs_unnormalized, acs_mean, acs_std)

        # The network consumes [observation, action] joined along dim 1 and
        # produces the normalized change in state.
        delta_pred_normalized = self.delta_network(
            torch.cat([scaled_obs, scaled_acs], dim=1)
        )

        # Unnormalize the delta and add it to the current observation.
        unscaled_delta = unnormalize(
            delta_pred_normalized, delta_mean, delta_std
        )
        return unscaled_delta + obs_unnormalized, delta_pred_normalized