Example #1
    def update(self, observations, actions, next_observations,
               data_statistics):
        # DoneTODO(Q1) normalize the obs and acs using the normalize function and data_statistics
        norm_obs = normalize(np.squeeze(observations),
                             data_statistics['obs_mean'],
                             data_statistics['obs_std'])
        norm_acs = normalize(np.squeeze(actions), data_statistics['acs_mean'],
                             data_statistics['acs_std'])

        pred_delta = self.delta_func(
            torch.Tensor(np.concatenate((norm_obs, norm_acs),
                                        axis=1)).to(self.device))
        # DoneTODO(Q1) Define a normalized true_delta using observations, next_observations and the delta stats from data_statistics
        true_delta = torch.Tensor(
            normalize(next_observations - observations,
                      data_statistics['delta_mean'],
                      data_statistics['delta_std'])).to(self.device)

        # DoneTODO(Q1) Define a loss function that takes as input normalized versions of predicted change in state and true change in state
        loss = nn.functional.mse_loss(true_delta, pred_delta)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()
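
Every snippet in this file leans on the `normalize` / `unnormalize` helpers from `cs285.infrastructure.utils`. For reference, here is a minimal sketch of what such helpers look like, assuming plain element-wise standardization (the epsilon guard is an assumption, not necessarily what the course code does):

import numpy as np

def normalize(data, mean, std, eps=1e-8):
    # standardize to zero mean and unit standard deviation, element-wise
    return (data - mean) / (std + eps)

def unnormalize(data, mean, std):
    # invert the standardization above
    return data * std + mean

Both helpers broadcast, so they work unchanged on numpy arrays and torch tensors.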
Example #2
    def define_forward_pass(self):
        # normalize input data to mean 0, std 1
        obs_unnormalized = self.obs_pl
        acs_unnormalized = self.acs_pl
        # Hint: Consider using the normalize function defined in infrastructure.utils for the following two lines
        obs_normalized = normalize(
            obs_unnormalized, self.obs_mean_pl, self.obs_std_pl
        )  # TODO(Q1) Define obs_normalized using obs_unnormalized,and self.obs_mean_pl and self.obs_std_pl
        acs_normalized = normalize(
            acs_unnormalized, self.acs_mean_pl, self.acs_std_pl
        )  # TODO(Q2) Define acs_normalized using acs_unnormalized and self.acs_mean_pl and self.acs_std_pl

        # predicted change in obs
        concatenated_input = tf.concat([obs_normalized, acs_normalized],
                                       axis=1)
        # Hint: Note that the prefix delta is used in the variable below to denote changes in state, i.e. (s'-s)
        self.delta_pred_normalized = build_mlp(
            concatenated_input, self.ob_dim, self.scope, self.n_layers,
            self.size
        )  # TODO(Q1) Use the build_mlp function and the concatenated_input above to define a neural network that predicts the normalized delta states (i.e. change in state)
        self.delta_pred_unnormalized = unnormalize(
            self.delta_pred_normalized, self.delta_mean_pl, self.delta_std_pl
        )  # TODO(Q1) Unnormalize the delta_pred above using the unnormalize function, and self.delta_mean_pl and self.delta_std_pl
        self.next_obs_pred = obs_unnormalized + self.delta_pred_unnormalized  # TODO(Q1) Predict next observation using current observation and delta prediction (note that next_obs here is unnormalized)

    def update(self, observations, actions, advantages, q_values=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # TODO: compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
        # is the expectation over collected trajectories of:
        # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
        # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

        action_dist = self.forward(observations)
        log_pi = action_dist.log_prob(actions)
        loss = -torch.sum(log_pi * advantages)

        # TODO: optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.nn_baseline:
            ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one
            ## HINT: there is a `normalize` function in `infrastructure.utils`
            # q_values must be provided when fitting the baseline
            targets = utils.normalize(q_values, np.mean(q_values),
                                      np.std(q_values))
            targets = ptu.from_numpy(targets)

            ## TODO: use the `forward` method of `self.baseline` to get baseline predictions
            baseline_predictions = self.baseline.forward(observations).squeeze(
                1)

            ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
            ## [ N ] versus shape [ N x 1 ]
            ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
            assert baseline_predictions.shape == targets.shape, f"shapes do not match, pred_shape: " \
                                                                f" {baseline_predictions.shape} \t target shape {targets.shape}"

            # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
            # HINT: use `F.mse_loss`
            baseline_loss = F.mse_loss(baseline_predictions, targets)

            # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
            # HINT: remember to `zero_grad` first
            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()

        train_log = {
            'Training Loss': ptu.to_numpy(loss),
        }
        return train_log
    def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
    ):
        """
        :param obs_unnormalized: Unnormalized observations
        :param acs_unnormalized: Unnormalized actions
        :param obs_mean: Mean of observations
        :param obs_std: Standard deviation of observations
        :param acs_mean: Mean of actions
        :param acs_std: Standard deviation of actions
        :param delta_mean: Mean of state difference `s_t+1 - s_t`.
        :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
        :return: tuple `(next_obs_pred, delta_pred_normalized)`
        This forward function should return a tuple of two items
            1. `next_obs_pred` which is the predicted `s_t+1`
            2. `delta_pred_normalized` which is the normalized (i.e. not
                unnormalized) output of the delta network. This is needed
        """
        obs_unnormalized = ptu.from_numpy(obs_unnormalized)
        acs_unnormalized = ptu.from_numpy(acs_unnormalized)
        self.update_statistics(obs_mean, obs_std, acs_mean, acs_std,
                               delta_mean, delta_std)

        # normalize input data to mean 0, std 1
        obs_normalized = normalize(obs_unnormalized, self.obs_mean,
                                   self.obs_std)
        acs_normalized = normalize(acs_unnormalized, self.acs_mean,
                                   self.acs_std)

        # predicted change in obs
        concatenated_input = torch.cat([obs_normalized, acs_normalized], dim=1)

        # TODO(Q1) compute delta_pred_normalized and next_obs_pred
        # Hint: as described in the PDF, the output of the network is the
        # *normalized change* in state, i.e. normalized(s_t+1 - s_t).
        delta_pred_normalized = self.delta_network(
            concatenated_input)  # TODO(Q1)
        delta_pred = unnormalize(delta_pred_normalized, self.delta_mean,
                                 self.delta_std)
        next_obs_pred = obs_unnormalized + delta_pred  # TODO(Q1)
        return next_obs_pred, delta_pred_normalized
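
All of the dynamics-model snippets in this file share one parameterization, restated here once for reference (this summarizes the code above, it adds nothing to it). With per-dimension statistics $(\mu_s,\sigma_s)$, $(\mu_a,\sigma_a)$, $(\mu_\Delta,\sigma_\Delta)$ and network $f_\theta$:

\[
\hat{\Delta}_{\mathrm{norm}} = f_\theta\!\left(\tfrac{s_t-\mu_s}{\sigma_s},\, \tfrac{a_t-\mu_a}{\sigma_a}\right),
\qquad
\hat{s}_{t+1} = s_t + \sigma_\Delta \odot \hat{\Delta}_{\mathrm{norm}} + \mu_\Delta,
\]

and training minimizes the MSE between $\hat{\Delta}_{\mathrm{norm}}$ and the normalized target $(s_{t+1}-s_t-\mu_\Delta)/\sigma_\Delta$.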
Example #5
    def _forward_delta_pred_normalized(self, observations, actions,
                                       data_statistics):
        obs_normalized = normalize(
            observations, data_statistics['obs_mean'],
            data_statistics['obs_std']
        )  # TODO(Q1) normalize observations using data_statistics
        acs_normalized = normalize(
            actions, data_statistics['acs_mean'], data_statistics['acs_std']
        )  # TODO(Q1) normalize actions using data_statistics

        mlp_input = torch.cat([obs_normalized, acs_normalized], dim=1)
        return self.delta_pred_normalized(mlp_input)
Example #6
    def define_forward_pass(self):
        # normalize input data to mean 0, std 1
        obs_unnormalized = self.obs_pl
        acs_unnormalized = self.acs_pl
        # Hint: Consider using the normalize function defined in infrastructure.utils for the following two lines
        obs_normalized = normalize(obs_unnormalized, self.obs_mean_pl, self.obs_std_pl)
        acs_normalized = normalize(acs_unnormalized, self.acs_mean_pl, self.acs_std_pl)

        # predicted change in obs
        concatenated_input = tf.concat([obs_normalized, acs_normalized], axis=1)
        # Hint: Note that the prefix delta is used in the variable below to denote changes in state, i.e. (s'-s)
        self.delta_pred_normalized = build_mlp(concatenated_input, self.ob_dim, self.scope, self.n_layers, self.size)
        self.delta_pred_unnormalized = unnormalize(self.delta_pred_normalized, self.delta_mean_pl, self.delta_std_pl)
        self.next_obs_pred = self.obs_pl + self.delta_pred_unnormalized
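
Examples #2 and #6 call a `build_mlp` helper supplied by the course scaffolding, and the PyTorch examples rely on an equivalent that constructs `self.delta_network`. A minimal PyTorch sketch of such a builder (the argument names mirror how the helper is called above; the Tanh activation is an assumption):

import torch.nn as nn

def build_mlp(input_size, output_size, n_layers, size, activation=nn.Tanh):
    # n_layers hidden layers of width `size`, followed by a linear output layer
    layers = []
    in_dim = input_size
    for _ in range(n_layers):
        layers += [nn.Linear(in_dim, size), activation()]
        in_dim = size
    layers.append(nn.Linear(in_dim, output_size))
    return nn.Sequential(*layers)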
Example #7
    def forward(
            self,
            obs_unnormalized,
            acs_unnormalized,
            obs_mean,
            obs_std,
            acs_mean,
            acs_std,
            delta_mean,
            delta_std,
    ):
        """
        :param obs_unnormalized: Unnormalized observations
        :param acs_unnormalized: Unnormalized actions
        :param obs_mean: Mean of observations
        :param obs_std: Standard deviation of observations
        :param acs_mean: Mean of actions
        :param acs_std: Standard deviation of actions
        :param delta_mean: Mean of state difference `s_t+1 - s_t`.
        :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
        :return: tuple `(next_obs_pred, delta_pred_normalized)`
        This forward function should return a tuple of two items
            1. `next_obs_pred` which is the predicted `s_t+1`
            2. `delta_pred_normalized` which is the normalized (i.e. not
                unnormalized) output of the delta network. This is needed
        """
        # convert to tensors
        obs_mean, obs_std, acs_mean, acs_std, delta_mean, delta_std = \
            self.update_statistics(obs_mean, obs_std, acs_mean, acs_std,
                                   delta_mean, delta_std)

        obs_unnormalized = ptu.from_numpy(obs_unnormalized)
        acs_unnormalized = ptu.from_numpy(acs_unnormalized)

        # normalize input data to mean 0, std 1
        obs_normalized = normalize(obs_unnormalized, obs_mean, obs_std)  # TODO(Q1)
        acs_normalized = normalize(acs_unnormalized, acs_mean, acs_std)  # TODO(Q1)

        # predicted change in obs
        concatenated_input = torch.cat([obs_normalized, acs_normalized], dim=1)

        # TODO(Q1) compute delta_pred_normalized and next_obs_pred
        # Hint: as described in the PDF, the output of the network is the
        # *normalized change* in state, i.e. normalized(s_t+1 - s_t).
        delta_pred_normalized = self.delta_network(concatenated_input)  # TODO(Q1)
        next_obs_pred = obs_unnormalized + unnormalize(delta_pred_normalized,
                                                       delta_mean, delta_std)  # TODO(Q1)
        return next_obs_pred, delta_pred_normalized
Example #8
    def update(self, observations, actions, next_observations, data_statistics):
        """
        :param observations: numpy array of observations
        :param actions: numpy array of actions
        :param next_observations: numpy array of next observations
        :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
             - 'obs_mean'
             - 'obs_std'
             - 'acs_mean'
             - 'acs_std'
             - 'delta_mean'
             - 'delta_std'
        :return:
        """
        # TODO(Q1) compute the normalized target for the model.
        target = normalize(next_observations-observations, data_statistics['delta_mean'], data_statistics['delta_std'])
        # Hint: you should use `data_statistics['delta_mean']` and
        # `data_statistics['delta_std']`, which keep track of the mean
        # and standard deviation of the model.

        pred, pred_normalized = self(observations, actions, **data_statistics)
        loss = self.loss(pred_normalized, ptu.from_numpy(target)) # TODO(Q1) compute the loss
        # Hint: `self(...)` returns a tuple, but you only need to use one of the
        # outputs.

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            'Training Loss': ptu.to_numpy(loss),
        }
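
Many of the PyTorch examples round-trip data through `ptu.from_numpy` / `ptu.to_numpy`. A rough sketch of what those helpers typically do (the global `device` handling here is an assumption; the course's `pytorch_util` module manages it through an init function):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def from_numpy(data):
    # float32 tensor on the active device
    return torch.from_numpy(data).float().to(device)

def to_numpy(tensor):
    # detach from the autograd graph and move back to CPU memory
    return tensor.detach().cpu().numpy()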
Example #9
    def update(self, observations, actions, next_observations,
               data_statistics):

        observations, actions, next_observations = observations.to(
            self.device), actions.to(self.device), next_observations.to(
                self.device)
        # normalize the labels
        delta_labels = next_observations - observations
        delta_labels_normalized = normalize(
            delta_labels, data_statistics['delta_mean'],
            data_statistics['delta_std']
        )  # TODO(Q1) Define a normalized version of delta_labels using self.delta_labels (which are unnormalized), and self.delta_mean_pl and self.delta_std_pl

        delta_pred_normalized = self._forward_delta_pred_normalized(
            observations, actions, data_statistics)

        # compared predicted deltas to labels (both should be normalized)
        loss = self.mse_criterion(
            delta_labels_normalized, delta_pred_normalized
        )  # TODO(Q1) Define a loss function that takes as input normalized versions of predicted change in state and ground truth change in state

        # train the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.detach().cpu()
Example #10
    def estimate_advantage(self, obs, q_values):
        """
            Computes advantages by (possibly) subtracting a baseline from the estimated Q values
        """

        # Estimate the advantage when nn_baseline is True,
        # by querying the neural network that you're using to learn the baseline
        if self.nn_baseline:
            baselines_unnormalized = self.actor.run_baseline_prediction(obs)
            ## ensure that the baseline and q_values have the same dimensionality
            ## to prevent silent broadcasting errors
            assert baselines_unnormalized.ndim == q_values.ndim
            ## baseline was trained with standardized q_values, so ensure that the predictions
            ## have the same mean and standard deviation as the current batch of q_values
            baselines = baselines_unnormalized * np.std(q_values) + np.mean(
                q_values)
            advantages = q_values - baselines

        # Else, just set the advantage to [Q]
        else:
            advantages = q_values.copy()

        # Normalize the resulting advantages
        if self.standardize_advantages:
            ## and a standard deviation of one
            ## HINT: there is a `normalize` function in `infrastructure.utils`
            advantages = normalize(advantages, advantages.mean(),
                                   advantages.std())

        return advantages
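
Example #10's rescaling step simply undoes the standardization the baseline was trained with. With $\mu_q$, $\sigma_q$ the mean and standard deviation of the current batch of Q-values:

\[
b_t = \sigma_q\,\hat{b}_t + \mu_q, \qquad
A_t = Q_t - b_t, \qquad
A_t^{\mathrm{std}} = \frac{A_t - \bar{A}}{\sigma_A + \varepsilon},
\]

where the last step is the optional `standardize_advantages` normalization (the $\varepsilon$ is only there to guard against a zero standard deviation).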
Example #11
    def define_train_op(self):

        # normalize the labels
        self.delta_labels_normalized =  normalize(self.delta_labels, self.delta_mean_pl, self.delta_std_pl)

        # compared predicted deltas to labels (both should be normalized)
        self.loss = tf.losses.mean_squared_error(self.delta_labels_normalized, self.delta_pred_normalized)
        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
Example #12
    def get_prediction(self, obs, acs, data_statistics):
        if len(obs.shape) == 1 or len(acs.shape) == 1:
            obs = np.squeeze(obs)[None]
            acs = np.squeeze(acs)[None]

        norm_obs = normalize(obs, data_statistics['obs_mean'],
                             data_statistics['obs_std'])
        norm_acs = normalize(acs, data_statistics['acs_mean'],
                             data_statistics['acs_std'])

        norm_input = torch.Tensor(np.concatenate((norm_obs, norm_acs),
                                                 axis=1)).to(self.device)
        norm_delta = self.delta_func(norm_input).cpu().detach().numpy()

        delta = unnormalize(norm_delta, data_statistics['delta_mean'],
                            data_statistics['delta_std'])
        return obs + delta
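
`get_prediction` is what an MPC controller calls when rolling the learned model forward. A hedged sketch of random-shooting evaluation built on top of it (the `reward_fn` signature and the candidate-sequence shapes are assumptions, not the course API):

import numpy as np

def evaluate_candidate_sequences(model, obs, candidate_action_sequences,
                                 reward_fn, data_statistics):
    # candidate_action_sequences: (N, H, ac_dim); obs: (ob_dim,)
    N, H, _ = candidate_action_sequences.shape
    predicted_obs = np.tile(obs, (N, 1))  # start every rollout from the current state
    total_rewards = np.zeros(N)
    for t in range(H):
        acs = candidate_action_sequences[:, t, :]
        rewards, _ = reward_fn(predicted_obs, acs)
        total_rewards += rewards
        predicted_obs = model.get_prediction(predicted_obs, acs, data_statistics)
    return total_rewards  # the controller executes the first action of the argmax sequence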
Example #13
    def define_train_op(self):

        # normalize the labels
        self.delta_labels_normalized =  normalize(self.delta_labels, self.delta_mean_pl, self.delta_std_pl)# TODO(Q1) Define a normalized version of delta_labels using self.delta_labels (which are unnormalized), and self.delta_mean_pl and self.delta_std_pl

        # compared predicted deltas to labels (both should be normalized)
        self.loss = tf.losses.mean_squared_error(self.delta_labels_normalized, self.delta_pred_normalized)# TODO(Q1) Define a loss function that takes as input normalized versions of predicted change in state and ground truth change in state
        self.train_op = tf.train.AdamOptimizer(learning_rate= self.learning_rate).minimize(self.loss) # TODO(Q1) Define a train_op to minimize the loss defined above. Adam optimizer will work well.
Example #14
    def update(self, observations, actions, advantages, q_values=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # TODO: compute the loss that should be optimized when training with policy gradient √
        # HINT1: Recall that the expression that we want to MAXIMIZE
            # is the expectation over collected trajectories of:
            # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
            # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
        
        # print('observations: ', observations)
        distribution = self.forward(observations)
        log_distribution: torch.Tensor = distribution.log_prob(actions)
        if not self.discrete:
            log_distribution = log_distribution.sum(1)  # sum log-probs over action dimensions
        assert log_distribution.size() == advantages.size()
        loss = - (log_distribution * advantages).sum()
       
        # TODO: optimize `loss` using `self.optimizer` √
        # HINT: remember to `zero_grad` first
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.nn_baseline and q_values is not None:
            ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one √
            ## HINT: there is a `normalize` function in `infrastructure.utils`
            targets = utils.normalize(q_values, q_values.mean(), q_values.std())
            targets = ptu.from_numpy(targets)

            ## TODO: use the `forward` method of `self.baseline` to get baseline predictions √
            baseline_predictions: torch.Tensor = self.baseline(observations).squeeze()
            
            ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
            ## [ N ] versus shape [ N x 1 ]
            ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
            assert baseline_predictions.shape == targets.shape
            
            # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
            # HINT: use `F.mse_loss`
            baseline_loss = F.mse_loss(baseline_predictions, targets)

            # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
            # HINT: remember to `zero_grad` first
            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()

        train_log = {
            'Training Loss': ptu.to_numpy(loss),
        }
        if self.nn_baseline and q_values is not None:
            train_log['Baseline Loss'] = ptu.to_numpy(baseline_loss)
        return train_log
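
Every policy `update` above implements the same pseudo-loss: the negated Monte-Carlo estimate of the policy-gradient objective. With advantages $A_t = Q_t - b_t$,

\[
\mathcal{L}(\theta) = -\sum_t \log \pi_\theta(a_t \mid s_t)\, A_t,
\qquad
\nabla_\theta \mathcal{L}(\theta) = -\widehat{\nabla_\theta J(\theta)},
\]

so a gradient-descent step on $\mathcal{L}$ is a gradient-ascent step on the expected return. Some examples use a mean instead of a sum, which only rescales the gradient.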
Example #15
    def update(self, observations, actions, advantages, q_values=None):
        # Not strictly necessary to convert to tensors here
        observations = tf.constant(observations, dtype=tf.float32)
        actions = tf.constant(actions, dtype=tf.float32)
        advantages = tf.constant(advantages, dtype=tf.float32)

        # TODO: compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
        # is the expectation over collected trajectories of:
        # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
        # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(self.policy_params)
            pi = self.forward(observations)
            logp = pi.log_prob(actions)
            loss = -tf.reduce_mean(logp * advantages)
        gradients = tape.gradient(loss, self.policy_params)
        self.optimizer.apply_gradients(zip(gradients, self.policy_params))

        if self.nn_baseline:
            with tf.GradientTape() as tape:
                ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one
                ## HINT: there is a `normalize` function in `infrastructure.utils`
                targets = normalize(q_values, np.mean(q_values),
                                    np.std(q_values))

                ## TODO: use the `forward` method of `self.baseline` to get baseline predictions
                baseline_predictions = self.baseline(observations)
                baseline_predictions = tf.squeeze(
                    baseline_predictions)  # Remove dimensions of size 1

                ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
                ## [ N ] versus shape [ N x 1 ]
                ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
                assert baseline_predictions.shape == targets.shape

                # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
                # HINT: use `F.mse_loss`
                baseline_loss = 0.5 * tf.keras.losses.mean_squared_error(
                    baseline_predictions, targets)

                # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
                # HINT: remember to `zero_grad` first
            gradients = tape.gradient(baseline_loss,
                                      self.baseline.trainable_variables)
            self.baseline_optimizer.apply_gradients(
                zip(gradients, self.baseline.trainable_variables))

        train_log = {
            'Training Loss': -loss.numpy(),
        }
        return train_log
Example #16
    def update(self, observations, actions, next_observations,
               data_statistics):
        """
    :param observations: numpy array of observations
    :param actions: numpy array of actions
    :param next_observations: numpy array of next observations
    :param data_statistics: A dictionary with the following keys (each with
    a numpy array as the value):
         - 'obs_mean'
         - 'obs_std'
         - 'acs_mean'
         - 'acs_std'
         - 'delta_mean'
         - 'delta_std'
    :return:
    """
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        next_observations = ptu.from_numpy(next_observations)
        # Hint: you should use `data_statistics['delta_mean']` and
        # `data_statistics['delta_std']`, which keep track of the mean
        # and standard deviation of the model.

        self.update_statistics(*list(data_statistics.values()))
        # Note: this call is redundant when the statistics have already been
        # updated by the MB agent / MPC policy (training curves are identical
        # with or without it), but it is harmless.
        data_statistics = {
            k: ptu.from_numpy(v)
            for k, v in data_statistics.items()
        }

        next_obs_pred, delta_pred_normalized = \
          self.forward(observations, actions, data_statistics['obs_mean'],
                       data_statistics['obs_std'], data_statistics['acs_mean'],
                       data_statistics['acs_std'], data_statistics['delta_mean'],
                       data_statistics['delta_std'])

        # TODO(Q1) done compute the normalized target for the model.
        target = normalize(next_observations - observations,
                           data_statistics['delta_mean'],
                           data_statistics['delta_std'])

        loss = self.loss(target, delta_pred_normalized)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            'Training Loss': ptu.to_numpy(loss),
        }
Example #17
    def forward(  # input and output are both tensors
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
    ):
        """
    :param obs_unnormalized: Unnormalized observations
    :param acs_unnormalized: Unnormalized actions
    :param obs_mean: Mean of observations
    :param obs_std: Standard deviation of observations
    :param acs_mean: Mean of actions
    :param acs_std: Standard deviation of actions
    :param delta_mean: Mean of state difference `s_t+1 - s_t`.
    :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
    :return: tuple `(next_obs_pred, delta_pred_normalized)`
    This forward function should return a tuple of two items
        1. `next_obs_pred` which is the predicted `s_t+1`
        2. `delta_pred_normalized` which is the normalized (i.e. not
            unnormalized) output of the delta network. This is needed
    """

        obs_normalized = normalize(obs_unnormalized, obs_mean, obs_std)
        acs_normalized = normalize(acs_unnormalized, acs_mean, acs_std)

        # predicted change in obs
        concatenated_input = torch.cat([obs_normalized, acs_normalized], dim=1)

        # TODO(Q1) done compute delta_pred_normalized and next_obs_pred
        # Hint: as described in the PDF, the output of the network is the
        # *normalized change* in state, i.e. normalized(s_t+1 - s_t).
        delta_pred_normalized = self.delta_network(concatenated_input)

        next_obs_pred = unnormalize(delta_pred_normalized, delta_mean,
                                    delta_std) + obs_unnormalized

        return next_obs_pred, delta_pred_normalized
Example #18
    def update(self, observations, actions, next_observations,
               data_statistics):
        """
        :param observations: numpy array of observations
        :param actions: numpy array of actions
        :param next_observations: numpy array of next observations
        :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
             - 'obs_mean'
             - 'obs_std'
             - 'acs_mean'
             - 'acs_std'
             - 'delta_mean'
             - 'delta_std'
        :return:
        """
        obs = ptu.from_numpy(observations)
        acs = ptu.from_numpy(actions)
        next_obs = ptu.from_numpy(next_observations)
        obs_mean = ptu.from_numpy(data_statistics["obs_mean"])
        obs_std = ptu.from_numpy(data_statistics["obs_std"])
        acs_mean = ptu.from_numpy(data_statistics["acs_mean"])
        acs_std = ptu.from_numpy(data_statistics["acs_std"])
        delta_mean = ptu.from_numpy(data_statistics["delta_mean"])
        delta_std = ptu.from_numpy(data_statistics["delta_std"])

        # compute the normalized target for the model.
        delta_target_unnormalized = next_obs - obs
        delta_target_normalized = normalize(delta_target_unnormalized,
                                            delta_mean, delta_std)
        # Hint: you should use `data_statistics['delta_mean']` and
        # `data_statistics['delta_std']`, which keep track of the mean
        # and standard deviation of the model.

        # compute the loss
        _, delta_pred_normalized = self(
            obs,
            acs,
            obs_mean,
            obs_std,
            acs_mean,
            acs_std,
            delta_mean,
            delta_std,
        )
        loss = self.loss(delta_target_normalized, delta_pred_normalized)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            "Training Loss": ptu.to_numpy(loss),
        }
Example #19
    def get_prediction(self, obs, acs, data_statistics):
        if len(obs.shape) == 1 or len(acs.shape) == 1:
            obs = np.squeeze(obs)[None]
            acs = np.squeeze(acs)[None]

        # DoneTODO(Q1) normalize the obs and acs above using the normalize function and data_statistics
        norm_obs = normalize(obs, data_statistics['obs_mean'],
                             data_statistics['obs_std'])
        norm_acs = normalize(acs, data_statistics['acs_mean'],
                             data_statistics['acs_std'])

        norm_input = torch.Tensor(np.concatenate((norm_obs, norm_acs),
                                                 axis=1)).to(self.device)
        norm_delta = self.delta_func(norm_input).cpu().detach().numpy()

        # DoneTODO(Q1) Unnormalize the norm_delta above using the unnormalize function and data_statistics
        delta = unnormalize(norm_delta, data_statistics['delta_mean'],
                            data_statistics['delta_std'])
        # DoneTODO(Q1) Return the predicted next observation (You will use obs and delta)
        return obs + delta
Example #20
    def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
    ):
        """
        :param obs_unnormalized: Unnormalized observations
        :param acs_unnormalized: Unnormalized actions
        :param obs_mean: Mean of observations
        :param obs_std: Standard deviation of observations
        :param acs_mean: Mean of actions
        :param acs_std: Standard deviation of actions
        :param delta_mean: Mean of state difference `s_t+1 - s_t`.
        :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
        :return: tuple `(next_obs_pred, delta_pred_normalized)`
        This forward function should return a tuple of two items
            1. `next_obs_pred` which is the predicted `s_t+1`
            2. `delta_pred_normalized` which is the normalized (i.e. not
                unnormalized) output of the delta network. This is needed
        """
        # normalize input data to mean 0, std 1
        obs_normalized = normalize(obs_unnormalized, obs_mean, obs_std)
        acs_normalized = normalize(acs_unnormalized, acs_mean, acs_std)

        # predicted change in obs
        concatenated_input = torch.cat([obs_normalized, acs_normalized], dim=1)

        delta_pred_normalized = self.delta_network(concatenated_input)
        next_obs_pred = unnormalize(delta_pred_normalized, delta_mean,
                                    delta_std) + obs_unnormalized
        return next_obs_pred, delta_pred_normalized
Example #21
    def update(self, observations, actions, advantages, n_rollouts=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # TODO: compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
        # is the expectation over collected trajectories of:
        # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
        # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

        if self.discrete:
            actions = actions.to(torch.int64)
            # logits: (batch_size, seq_len, action_dim)
            logits = self.forward(observations)
            # log_pi: (batch_size, seq_len)
            log_pi = logits.gather(dim=-1, index=actions.unsqueeze(
                dim=-1)).squeeze(dim=-1) - logits.logsumexp(dim=-1,
                                                            keepdim=False)
        else:
            acs_mean = self.forward(observations)
            # log_pi: (batch_size, seq_len, action_dim)
            log_pi = self.normal_dist.log_prob(
                normalize(data=actions,
                          mean=acs_mean,
                          std=torch.exp(self.logstd)))
            # log_pi: (batch_size, seq_len)
            log_pi = torch.sum(log_pi, dim=-1)

        assert log_pi.shape == advantages.shape
        loss = -torch.mean(torch.sum(log_pi * advantages, dim=-1), dim=0)
        if n_rollouts is not None and advantages.dim() == 1:
            # rollouts are concatenated into one flat batch, so divide by
            # n_rollouts to average over trajectories
            loss = loss / n_rollouts

        # TODO: optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        train_log = {
            'Training Loss': ptu.to_numpy(loss),
        }
        return train_log
Example #22
    def update(self, observations, actions, next_observations, data_statistics):
        """
        :param observations: numpy array of observations
        :param actions: numpy array of actions
        :param next_observations: numpy array of next observations
        :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
             - 'obs_mean'
             - 'obs_std'
             - 'acs_mean'
             - 'acs_std'
             - 'delta_mean'
             - 'delta_std'
        :return:
        """
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        next_observations = ptu.from_numpy(next_observations)
        obs_mean = ptu.from_numpy(data_statistics['obs_mean'])
        obs_std = ptu.from_numpy(data_statistics['obs_std'])
        acs_mean = ptu.from_numpy(data_statistics['acs_mean'])
        acs_std = ptu.from_numpy(data_statistics['acs_std'])
        delta_mean = ptu.from_numpy(data_statistics['delta_mean'])
        delta_std = ptu.from_numpy(data_statistics['delta_std'])

        target = normalize(next_observations - observations, delta_mean, delta_std)
        # Hint: you should use `data_statistics['delta_mean']` and
        # `data_statistics['delta_std']`, which keep track of the mean
        # and standard deviation of the model.

        prediction_delta = self.forward(observations, actions, obs_mean, obs_std, acs_mean, acs_std, delta_mean, delta_std)[1]
        loss = self.loss(prediction_delta, target)
        # Hint: `self(...)` returns a tuple, but you only need to use one of the
        # outputs.

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            'Training Loss': ptu.to_numpy(loss),
        }
Example #23
    def update(self, observations, actions, advantages, q_values=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # Maximize expectation over collected trajectories of:
        # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        loss = -torch.sum(
            self.forward(observations).log_prob(actions) * advantages)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.nn_baseline:
            ## normalize the q_values to have a mean of zero and a standard deviation of one
            targets = utils.normalize(q_values, np.mean(q_values),
                                      np.std(q_values))
            targets = ptu.from_numpy(targets)

            ## use the `forward` method of `self.baseline` to get baseline predictions
            baseline_predictions = self.baseline.forward(observations)

            ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
            ## [ N ] versus shape [ N x 1 ]
            ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
            baseline_predictions = torch.squeeze(baseline_predictions)
            assert baseline_predictions.shape == targets.shape

            # compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
            baseline_loss = F.mse_loss(baseline_predictions, targets)

            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()

        train_log = {
            'Training Loss': ptu.to_numpy(loss),
        }
        return train_log
Example #24
    def update(self, observations, actions, **kwargs):
        pi = self.forward(observations)
        advantages = kwargs['advantages'] if 'advantages' in kwargs else 1.0
        if self.discrete:
            log_prob = pi.log_prob(actions)
        else:
            log_prob = pi.log_prob(actions).sum(axis=-1)

        loss = torch.sum(-log_prob * advantages)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        train_log = {
            'Training pi Loss': ptu.to_numpy(loss),
        }

        if self.nn_baseline:
            q_values = kwargs['q_values']
            targets = normalize(q_values, q_values.mean(), q_values.std())
            targets = ptu.from_numpy(targets)

            baseline_predictions = self.baseline(observations)
            baseline_predictions = baseline_predictions.squeeze()

            ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
            ## [ N ] versus shape [ N x 1 ]
            ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
            assert baseline_predictions.shape == targets.shape

            baseline_loss = F.mse_loss(baseline_predictions, targets)

            # HINT: remember to `zero_grad` first
            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()

            train_log['Training v Loss'] = ptu.to_numpy(baseline_loss)
        return train_log
Example #25
    def update(self, observations, actions, next_observations,
               data_statistics):
        """
        :param observations: numpy array of observations
        :param actions: numpy array of actions
        :param next_observations: numpy array of next observations
        :param data_statistics: A dictionary with the following keys (each with
        a numpy array as the value):
             - 'obs_mean'
             - 'obs_std'
             - 'acs_mean'
             - 'acs_std'
             - 'delta_mean'
             - 'delta_std'
        :return:
        """
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        next_observations = ptu.from_numpy(next_observations)
        data_statistics = {
            k: ptu.from_numpy(v)
            for k, v in data_statistics.items()
        }

        target = normalize(next_observations - observations,
                           data_statistics['delta_mean'],
                           data_statistics['delta_std'])
        _, pred_delta = self.forward(observations, actions, **data_statistics)
        loss = self.loss(pred_delta, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            'Training Loss': ptu.to_numpy(loss),
        }
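
These model `update` methods are driven by an outer training loop that samples minibatches from a replay buffer, one per gradient step. A hedged sketch of that loop (the `sample_random_data` call and its return order are assumptions about the surrounding agent code):

def train_dynamics_model(model, replay_buffer, data_statistics,
                         num_grad_steps, batch_size):
    losses = []
    for _ in range(num_grad_steps):
        obs, acs, rews, next_obs, dones = replay_buffer.sample_random_data(batch_size)
        log = model.update(obs, acs, next_obs, data_statistics)
        losses.append(log['Training Loss'])
    return losses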
Example #26
    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        log = {}

        if self.t > self.num_exploration_steps:
            # TODO: After exploration is over, set the actor to optimize the extrinsic critic
            #HINT: Look at method ArgMaxPolicy.set_critic
            self.actor.set_critic(self.exploitation_critic)

        if (self.t > self.learning_starts and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # Get Reward Weights
            # TODO: Get the current explore reward weight and exploit reward weight
            #       using the schedules passed in (see __init__)
            # COMMENT: Until part 3, explore_weight = 1, and exploit_weight = 0
            explore_weight = self.explore_weight_schedule.value(self.t)
            exploit_weight = self.exploit_weight_schedule.value(self.t)

            # Run Exploration Model #
            # TODO: Evaluate the exploration model on s' to get the exploration bonus
            # HINT: Normalize the exploration bonus, as RND values vary highly in magnitude
            prediction_error = self.exploration_model.forward_np(next_ob_no)
            expl_bonus = utils.normalize(prediction_error,
                                         prediction_error.mean(),
                                         prediction_error.std())

            # Reward Calculations #
            # TODO: Calculate mixed rewards, which will be passed into the exploration critic
            # HINT: See doc for definition of mixed_reward
            mixed_reward = explore_weight * expl_bonus + exploit_weight * re_n

            # TODO: Calculate the environment reward
            # HINT: For part 1, env_reward is just 're_n'
            #       After this, env_reward is 're_n' shifted by self.exploit_rew_shift,
            #       and scaled by self.exploit_rew_scale
            env_reward = (re_n +
                          self.exploit_rew_shift) * self.exploit_rew_scale

            # Update Critics And Exploration Model #

            # TODO 1): Update the exploration model (based off s')
            # TODO 2): Update the exploration critic (based off mixed_reward)
            # TODO 3): Update the exploitation critic (based off env_reward)
            expl_model_loss = self.exploration_model.update(next_ob_no)
            exploration_critic_loss = self.exploration_critic.update(
                ob_no, ac_na, next_ob_no, mixed_reward, terminal_n)
            exploitation_critic_loss = self.exploitation_critic.update(
                ob_no, ac_na, next_ob_no, env_reward, terminal_n)

            # Target Networks #
            if self.num_param_updates % self.target_update_freq == 0:
                # TODO: Update the exploitation and exploration target networks
                self.exploration_critic.update_target_network()
                self.exploitation_critic.update_target_network()

            # Logging #
            log['Exploration Critic Loss'] = exploration_critic_loss[
                'Training Loss']
            log['Exploitation Critic Loss'] = exploitation_critic_loss[
                'Training Loss']
            log['Exploration Model Loss'] = expl_model_loss

            # TODO: Uncomment these lines after completing cql_critic.py
            log['Exploitation Data q-values'] = exploitation_critic_loss[
                'Data q-values']
            log['Exploitation OOD q-values'] = exploitation_critic_loss[
                'OOD q-values']
            log['Exploitation CQL Loss'] = exploitation_critic_loss['CQL Loss']

            self.num_param_updates += 1

        self.t += 1
        return log
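
Example #26 treats `self.exploration_model` as a black box with `forward_np` (per-state prediction error) and `update` methods. For context, a Random Network Distillation model with that interface might look roughly like the sketch below; the class name, architecture, and optimizer settings are assumptions, and only the two method shapes mirror how the example calls them:

import torch
import torch.nn as nn
import torch.optim as optim

class RNDModel(nn.Module):
    def __init__(self, ob_dim, hidden=128, out_dim=32, lr=1e-3, device='cpu'):
        super().__init__()
        self.device = torch.device(device)
        # fixed, randomly initialized target network
        self.target = nn.Sequential(nn.Linear(ob_dim, hidden), nn.ReLU(),
                                    nn.Linear(hidden, out_dim)).to(self.device)
        for p in self.target.parameters():
            p.requires_grad = False
        # trained predictor network; its error is the exploration bonus
        self.predictor = nn.Sequential(nn.Linear(ob_dim, hidden), nn.ReLU(),
                                       nn.Linear(hidden, out_dim)).to(self.device)
        self.optimizer = optim.Adam(self.predictor.parameters(), lr=lr)

    def forward(self, obs):
        # per-sample prediction error; larger for rarely visited states
        return ((self.predictor(obs) - self.target(obs)) ** 2).mean(dim=1)

    def forward_np(self, obs):
        obs = torch.from_numpy(obs).float().to(self.device)
        with torch.no_grad():
            return self.forward(obs).cpu().numpy()

    def update(self, obs):
        obs = torch.from_numpy(obs).float().to(self.device)
        loss = self.forward(obs).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()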
Example #27
  def update(self, observations, actions, advantages, q_values=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)

    # TODO done: compute the loss that should be optimized when training with policy gradient
    # HINT1: Recall that the expression that we want to MAXIMIZE
    # is the expectation over collected trajectories of:
    # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
    # HINT2: you will want to use the `log_prob` method on the distribution returned
    # by the `forward` method
    # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

    action_dist = self.forward(observations)
    if self.discrete:
      log_pi = action_dist.log_prob(actions)
    else:
      # distributions.Independent:
      # Reinterprets some of the batch dims of a distribution as event dims.
      # This is mainly useful for changing the shape of the result of log_prob.

      """from the experience from debugging,
      for lunarLander, action_dist.batch_shape = [5004]
        -> use it directly
      for invertedPendulum, action_dist.batch_shape = torch.Size([40070, 2]) 
        -> use action_dist_new, whose batch_shape = 40070
      """
      if len(action_dist.batch_shape) == 1:
        log_pi = action_dist.log_prob(actions)
      else:
        action_dist_new = distributions.Independent(action_dist, 1)
        log_pi = action_dist_new.log_prob(actions)

    # make sure log_pi and advantages have matching shapes before multiplying
    # (log_pi can be per-dimension or +/-inf when using a multivariate normal)
    assert advantages.ndim == log_pi.ndim
    sums = advantages * log_pi
    loss = -torch.sum(sums)  # `optimizer.step()` MINIMIZES a loss but we want to MAXIMIZE the expectation

    # TODO done: optimize `loss` using `self.optimizer`
    # HINT: remember to `zero_grad` first
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    if self.nn_baseline:
      ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one
      ## HINT: there is a `normalize` function in `infrastructure.utils`
      targets = utils.normalize(q_values, np.mean(q_values), np.std(q_values))
      targets = ptu.from_numpy(targets)

      ## TODO: use the `forward` method of `self.baseline` to get baseline predictions
      baseline_predictions = self.baseline.forward(observations)

      ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
      ## [ N ] versus shape [ N x 1 ]
      ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
      # TODO ? move squeeze into model.forward
      baseline_predictions = baseline_predictions.squeeze()
      assert baseline_predictions.shape == targets.shape, "{} vs {}".format(baseline_predictions.shape,
                                                                            targets.shape)

      # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
      # HINT: use `F.mse_loss`
      baseline_loss = F.mse_loss(baseline_predictions, targets)

      # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
      # HINT: remember to `zero_grad` first
      self.baseline_optimizer.zero_grad()
      baseline_loss.backward()
      self.baseline_optimizer.step()

    train_log = {
      'Training Loss': ptu.to_numpy(loss),
    }
    return train_log
    def update(self, observations, actions, advantages=None, q_values=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)  #advantages=(Q_t - b_t)

        # TODO: compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression we want to MAXIMIZE is the expectation over
        #        collected trajectories of: sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        #        We follow the gradient of the return J (the steepest-ascent direction)
        #        because maximizing J directly is intractable.
        # HINT2: you will want to use the `log_prob` method on the distribution returned
        #        by the `self.forward` method above
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

        #compute log pi(a_t|s_t)
        log_pi = self.forward(observations).log_prob(actions)

        # Backpropagation computes the policy gradient for us: define a pseudo-loss
        # whose gradient equals the policy gradient. It is a weighted negative
        # log-likelihood, where the weights are the advantages (reward-to-go minus
        # the baseline). The sign is negated because gradient DESCENT on this loss
        # is gradient ASCENT on the expected return.
        # Using a mean instead of a sum only rescales the gradient; the optimizer adapts.
        loss = torch.neg(torch.mean(torch.mul(log_pi, advantages)))

        # TODO: optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.nn_baseline:
            # The most common choice of baseline is the on-policy value function
            # V^pi(s_t), i.e. the average return the agent gets starting from s_t,
            # approximated here by regressing onto the reward-to-go q_values.

            # TODO: normalize the q_values to have a mean of zero and a standard
            # deviation of one; the baseline is trained on standardized targets.
            ## HINT: there is a `normalize` function in `infrastructure.utils`
            targets = utils.normalize(q_values, np.mean(q_values),
                                      np.std(q_values))
            targets = ptu.from_numpy(targets)

            # TODO: use the `forward` method of `self.baseline` to get baseline predictions

            #self.baseline is approximated by a neural network, which is updated concurrently with the policy
            baseline_predictions = self.baseline.forward(
                observations).squeeze()

            ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
            ## [ N ] versus shape [ N x 1 ]
            ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
            assert baseline_predictions.shape == targets.shape

            # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
            # HINT: use `F.mse_loss`

            #simplest method for learning baseline is minimize MSE.
            baseline_loss = self.baseline_loss(baseline_predictions, targets)

            # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
            # HINT: remember to `zero_grad` first
            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()

        train_log = {
            'Training Loss': ptu.to_numpy(loss),
        }
        return train_log
    def update(self, observations, actions, adv_n=None, q_values=None):
        # TODO_: update the policy and return the loss
        # loss = TODO_
        # return loss.item()

        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(adv_n)

        # TODO_: compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
            # is the expectation over collected trajectories of:
            # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
            # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss

        if self.discrete :
            log_prob = self.forward(observations).log_prob(actions)
        else:
            log_prob = utils.multivariate_normal_diag(loc = self.forward(observations), scale_diag=torch.exp(self.logstd)).log_prob(actions)

        if self.nn_baseline:
            # advantage = q_values - baseline; detach the baseline so the policy
            # loss does not backpropagate into the value network
            baseline = torch.squeeze(self.baseline(observations)).detach()
            loss = -torch.mean(log_prob * (ptu.from_numpy(q_values) - baseline))
        else:
            loss = -1.0 * torch.mean(log_prob * advantages)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.nn_baseline:
            ## TODO_: normalize the q_values to have a mean of zero and a standard deviation of one
            ## HINT: there is a `normalize` function in `infrastructure.utils`
            targets = utils.normalize(q_values, q_values.mean(), q_values.std())
            targets = ptu.from_numpy(targets)

            ## TODO_: use the `forward` method of `self.baseline` to get baseline predictions
            baseline_predictions = torch.squeeze(self.baseline(observations))
            
            ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
            ## [ N ] versus shape [ N x 1 ]
            ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
            assert baseline_predictions.shape == targets.shape
            
            # TODO_: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
            # HINT: use `F.mse_loss`
            baseline_loss = self.baseline_loss(targets, baseline_predictions)

            # TODO_: optimize `baseline_loss` using `self.baseline_optimizer`
            # HINT: remember to `zero_grad` first
            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()

        return loss.item()
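
The `q_values` consumed by these policy updates are typically discounted reward-to-go estimates computed per trajectory before the baseline is subtracted. A minimal sketch, assuming a flat numpy array of rewards from a single trajectory and a discount factor `gamma`:

import numpy as np

def discounted_reward_to_go(rewards, gamma):
    # q[t] = sum_{t' >= t} gamma**(t' - t) * r[t']
    q_values = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q_values[t] = running
    return q_values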
Example #30
    def define_train_op(self):

        # normalize the labels
        # TODO(Q1) Define a normalized version of delta_labels using self.delta_labels (which are unnormalized), and self.delta_mean_pl and self.delta_std_pl
        # DONE
        self.delta_labels_normalized = normalize(self.delta_labels, self.delta_mean_pl, self.delta_std_pl)