def define_forward_pass(self):
    """Wire up the ops that predict the next observation.

    The observation and action inputs (``self.obs_pl`` / ``self.acs_pl``) are
    normalized with their respective mean/std inputs, concatenated along
    axis 1 and passed to ``build_mlp``.  The network output is treated as the
    *normalized* change in state (s' - s); it is unnormalized with the delta
    statistics and added to the raw observation.

    Sets the following attributes:
        * ``self.delta_pred_normalized``   -- output of ``build_mlp``
        * ``self.delta_pred_unnormalized`` -- that output after ``unnormalize``
        * ``self.next_obs_pred``           -- raw observation + unnormalized delta
    """
    raw_obs = self.obs_pl
    raw_acs = self.acs_pl

    # Inputs are normalized independently before being joined feature-wise.
    network_input = tf.concat(
        [
            normalize(raw_obs, self.obs_mean_pl, self.obs_std_pl),
            normalize(raw_acs, self.acs_mean_pl, self.acs_std_pl),
        ],
        axis=1,
    )

    # "delta" denotes a change in state, i.e. (s' - s).
    normalized_delta = build_mlp(
        network_input,
        self.ob_dim,
        self.scope,
        self.n_layers,
        self.size,
    )
    self.delta_pred_normalized = normalized_delta

    self.delta_pred_unnormalized = unnormalize(
        normalized_delta,
        self.delta_mean_pl,
        self.delta_std_pl,
    )

    # The prediction is expressed in unnormalized observation space.
    self.next_obs_pred = raw_obs + self.delta_pred_unnormalized
def estimate_advantage(self, obs, q_values):
    """Convert Q-value estimates into advantages.

    :param obs: observations handed to ``self.actor.run_baseline_prediction``
        when ``self.nn_baseline`` is set; unused otherwise.
    :param q_values: array of estimated Q values.
    :return: the advantages; a copy of ``q_values`` when no baseline is used,
        otherwise ``q_values`` minus the rescaled baseline prediction.  The
        result is additionally standardized with its own mean/std when
        ``self.standardize_advantages`` is set.
    """
    if not self.nn_baseline:
        # No learned baseline: the advantage is simply Q (copied, so the
        # caller's array is never aliased).
        advantages = q_values.copy()
    else:
        predicted = self.actor.run_baseline_prediction(obs)

        # Guard against silent broadcasting between mismatched ranks.
        assert predicted.ndim == q_values.ndim

        # The baseline was trained on standardized Q values, so rescale its
        # output to the statistics of the current batch before subtracting.
        batch_mean = np.mean(q_values)
        batch_std = np.std(q_values)
        baselines = utils.unnormalize(predicted, batch_mean, batch_std)
        advantages = q_values - baselines

    if self.standardize_advantages:
        # Standardize using the advantages' own mean and standard deviation.
        adv_mean = np.mean(advantages)
        adv_std = np.std(advantages)
        advantages = utils.normalize(advantages, adv_mean, adv_std)

    return advantages
def _get_next_obs_prediction(self, observations, actions, data_statistics):
    """Predict the next observation for the given observations and actions.

    :param observations: current observations (unnormalized).
    :param actions: actions forwarded to ``_forward_delta_pred_normalized``.
    :param data_statistics: mapping that provides at least ``'delta_mean'``
        and ``'delta_std'``; it is also forwarded to the delta predictor.
    :return: ``observations`` plus the unnormalized predicted delta.
    """
    # Normalized delta as produced by the model's forward helper.
    normalized_delta = self._forward_delta_pred_normalized(
        observations, actions, data_statistics
    )

    # Map the delta back to unnormalized observation space.
    delta = unnormalize(
        normalized_delta,
        data_statistics['delta_mean'],
        data_statistics['delta_std'],
    )

    return observations + delta
def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
):
    """
    Predict the next observation from unnormalized observations and actions.

    :param obs_unnormalized: Unnormalized observations (passed through
        `ptu.from_numpy` before use)
    :param acs_unnormalized: Unnormalized actions (passed through
        `ptu.from_numpy` before use)
    :param obs_mean: Mean of observations
    :param obs_std: Standard deviation of observations
    :param acs_mean: Mean of actions
    :param acs_std: Standard deviation of actions
    :param delta_mean: Mean of state difference `s_t+1 - s_t`.
    :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
    :return: tuple `(next_obs_pred, delta_pred_normalized)`
        1. `next_obs_pred` is the predicted `s_t+1`
        2. `delta_pred_normalized` is the raw (still normalized) output of
           the delta network

    The six statistics are handed to `self.update_statistics`; the values
    actually used afterwards are read back from `self.obs_mean`,
    `self.obs_std`, `self.acs_mean`, `self.acs_std`, `self.delta_mean` and
    `self.delta_std` (presumably set by that call -- confirm against
    `update_statistics`).
    """
    obs = ptu.from_numpy(obs_unnormalized)
    acs = ptu.from_numpy(acs_unnormalized)
    self.update_statistics(
        obs_mean, obs_std, acs_mean, acs_std, delta_mean, delta_std
    )

    # Normalize both inputs and join them feature-wise for the network.
    network_input = torch.cat(
        [
            normalize(obs, self.obs_mean, self.obs_std),
            normalize(acs, self.acs_mean, self.acs_std),
        ],
        dim=1,
    )

    # The network outputs the *normalized change* in state,
    # i.e. normalized(s_t+1 - s_t).
    delta_pred_normalized = self.delta_network(network_input)

    # Undo the normalization and add the change to the current observation.
    delta = unnormalize(delta_pred_normalized, self.delta_mean, self.delta_std)
    next_obs_pred = obs + delta

    return next_obs_pred, delta_pred_normalized
def define_forward_pass(self):
    """Define the forward pass that yields ``self.next_obs_pred``.

    Steps, in order:
      1. normalize ``self.obs_pl`` and ``self.acs_pl`` with their mean/std
         inputs;
      2. concatenate the two along axis 1 and pass the result to
         ``build_mlp`` to obtain ``self.delta_pred_normalized``;
      3. ``unnormalize`` that output into ``self.delta_pred_unnormalized``;
      4. add it to ``self.obs_pl`` to obtain ``self.next_obs_pred``.
    """
    observations = self.obs_pl
    actions = self.acs_pl

    # Step 1: inputs scaled by their own statistics.
    normed_observations = normalize(
        observations, self.obs_mean_pl, self.obs_std_pl
    )
    normed_actions = normalize(actions, self.acs_mean_pl, self.acs_std_pl)

    # Step 2: the network output is the normalized state change (s' - s).
    mlp_input = tf.concat([normed_observations, normed_actions], axis=1)
    self.delta_pred_normalized = build_mlp(
        mlp_input, self.ob_dim, self.scope, self.n_layers, self.size
    )

    # Step 3: back to unnormalized delta space.
    self.delta_pred_unnormalized = unnormalize(
        self.delta_pred_normalized, self.delta_mean_pl, self.delta_std_pl
    )

    # Step 4: next observation = current observation + predicted change.
    self.next_obs_pred = observations + self.delta_pred_unnormalized
def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
):
    """
    Predict the next observation and return it with the normalized delta.

    :param obs_unnormalized: Unnormalized observations (passed through
        `ptu.from_numpy` before use)
    :param acs_unnormalized: Unnormalized actions (passed through
        `ptu.from_numpy` before use)
    :param obs_mean: Mean of observations
    :param obs_std: Standard deviation of observations
    :param acs_mean: Mean of actions
    :param acs_std: Standard deviation of actions
    :param delta_mean: Mean of state difference `s_t+1 - s_t`.
    :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
    :return: tuple `(next_obs_pred, delta_pred_normalized)`
        1. `next_obs_pred` is the predicted `s_t+1`
        2. `delta_pred_normalized` is the raw (still normalized) output of
           the delta network

    The statistics used for (un)normalization are the six values returned by
    `self.update_statistics`, not the raw arguments (presumably their tensor
    versions -- confirm against `update_statistics`).
    """
    stats = self.update_statistics(
        obs_mean, obs_std, acs_mean, acs_std, delta_mean, delta_std
    )
    obs_mean, obs_std, acs_mean, acs_std, delta_mean, delta_std = stats

    obs = ptu.from_numpy(obs_unnormalized)
    acs = ptu.from_numpy(acs_unnormalized)

    # Normalize each input with its own statistics.
    obs_normed = normalize(obs, obs_mean, obs_std)
    acs_normed = normalize(acs, acs_mean, acs_std)

    # The network consumes [obs, acs] joined along dim 1 and outputs the
    # *normalized change* in state, i.e. normalized(s_t+1 - s_t).
    delta_pred_normalized = self.delta_network(
        torch.cat([obs_normed, acs_normed], dim=1)
    )

    delta = unnormalize(delta_pred_normalized, delta_mean, delta_std)
    next_obs_pred = obs + delta

    return next_obs_pred, delta_pred_normalized
def get_prediction(self, obs, acs, data_statistics):
    """Predict the next observation(s) for the given observations/actions.

    :param obs: observation array; if either ``obs`` or ``acs`` is
        1-dimensional, both are squeezed and given a leading axis.
    :param acs: action array, handled like ``obs``.
    :param data_statistics: mapping with the keys ``'obs_mean'``,
        ``'obs_std'``, ``'acs_mean'``, ``'acs_std'``, ``'delta_mean'`` and
        ``'delta_std'``.
    :return: ``obs`` (after the optional reshaping) plus the unnormalized
        delta predicted by ``self.delta_func``, as a numpy array.
    """
    single_sample = len(obs.shape) == 1 or len(acs.shape) == 1
    if single_sample:
        # Drop size-1 axes, then add a leading axis to both inputs.
        obs = np.squeeze(obs)[np.newaxis]
        acs = np.squeeze(acs)[np.newaxis]

    # Normalize each input with its own statistics and join along axis 1.
    normed_obs = normalize(
        obs, data_statistics['obs_mean'], data_statistics['obs_std']
    )
    normed_acs = normalize(
        acs, data_statistics['acs_mean'], data_statistics['acs_std']
    )
    stacked = np.concatenate((normed_obs, normed_acs), axis=1)
    model_input = torch.Tensor(stacked).to(self.device)

    # Run the model and bring its output back as a numpy array.
    model_output = self.delta_func(model_input)
    normed_delta = model_output.cpu().detach().numpy()

    delta = unnormalize(
        normed_delta,
        data_statistics['delta_mean'],
        data_statistics['delta_std'],
    )
    return obs + delta
def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
):
    """
    Predict the next observation; inputs and outputs are both tensors.

    :param obs_unnormalized: Unnormalized observations
    :param acs_unnormalized: Unnormalized actions
    :param obs_mean: Mean of observations
    :param obs_std: Standard deviation of observations
    :param acs_mean: Mean of actions
    :param acs_std: Standard deviation of actions
    :param delta_mean: Mean of state difference `s_t+1 - s_t`.
    :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
    :return: tuple `(next_obs_pred, delta_pred_normalized)`
        1. `next_obs_pred` is the predicted `s_t+1`
        2. `delta_pred_normalized` is the raw (still normalized) output of
           the delta network
    """
    # Normalized observations and actions, joined along dim 1.
    network_input = torch.cat(
        [
            normalize(obs_unnormalized, obs_mean, obs_std),
            normalize(acs_unnormalized, acs_mean, acs_std),
        ],
        dim=1,
    )

    # The network outputs the *normalized change* in state,
    # i.e. normalized(s_t+1 - s_t).
    delta_pred_normalized = self.delta_network(network_input)

    # Undo the normalization, then add the change to the current state.
    delta = unnormalize(delta_pred_normalized, delta_mean, delta_std)
    next_obs_pred = delta + obs_unnormalized

    return next_obs_pred, delta_pred_normalized
def get_prediction(self, obs, acs, data_statistics):
    """Return the predicted next observation as ``obs + delta``.

    :param obs: observation array.
    :param acs: action array.
    :param data_statistics: mapping providing ``'obs_mean'``, ``'obs_std'``,
        ``'acs_mean'``, ``'acs_std'``, ``'delta_mean'`` and ``'delta_std'``.
    :return: numpy array holding ``obs`` plus the unnormalized output of
        ``self.delta_func``.

    When either input is 1-dimensional, both inputs are squeezed and given
    a leading axis before anything else happens.
    """
    if len(obs.shape) == 1 or len(acs.shape) == 1:
        obs, acs = np.squeeze(obs)[None], np.squeeze(acs)[None]

    stats = data_statistics

    # Normalized [obs, acs] features, concatenated along axis 1.
    features = np.concatenate(
        (
            normalize(obs, stats['obs_mean'], stats['obs_std']),
            normalize(acs, stats['acs_mean'], stats['acs_std']),
        ),
        axis=1,
    )
    features = torch.Tensor(features).to(self.device)

    # Model output converted back to a numpy array.
    prediction = self.delta_func(features).cpu().detach().numpy()

    # Undo the normalization and add the delta to the observation.
    return obs + unnormalize(
        prediction, stats['delta_mean'], stats['delta_std']
    )
def forward(
        self,
        obs_unnormalized,
        acs_unnormalized,
        obs_mean,
        obs_std,
        acs_mean,
        acs_std,
        delta_mean,
        delta_std,
):
    """
    Run the delta network and derive the next-observation prediction.

    :param obs_unnormalized: Unnormalized observations
    :param acs_unnormalized: Unnormalized actions
    :param obs_mean: Mean of observations
    :param obs_std: Standard deviation of observations
    :param acs_mean: Mean of actions
    :param acs_std: Standard deviation of actions
    :param delta_mean: Mean of state difference `s_t+1 - s_t`.
    :param delta_std: Standard deviation of state difference `s_t+1 - s_t`.
    :return: tuple `(next_obs_pred, delta_pred_normalized)`
        1. `next_obs_pred` is the predicted `s_t+1`
        2. `delta_pred_normalized` is the raw (still normalized) output of
           the delta network
    """
    # Each input is normalized with its own mean/std.
    normed_obs = normalize(obs_unnormalized, obs_mean, obs_std)
    normed_acs = normalize(acs_unnormalized, acs_mean, acs_std)

    # The network sees the two normalized inputs joined along dim 1 and
    # produces the normalized change in state.
    joined = torch.cat([normed_obs, normed_acs], dim=1)
    delta_pred_normalized = self.delta_network(joined)

    # Predicted next state = unnormalized change + current state.
    predicted_change = unnormalize(delta_pred_normalized, delta_mean, delta_std)
    return predicted_change + obs_unnormalized, delta_pred_normalized