def __init__(self, sess, env, agent_params):
    super(MBAgent, self).__init__()

    self.env = env.unwrapped
    self.sess = sess
    self.agent_params = agent_params
    self.ensemble_size = self.agent_params['ensemble_size']

    self.dyn_models = []
    for i in range(self.ensemble_size):
        model = FFModel(sess,
                        self.agent_params['ac_dim'],
                        self.agent_params['ob_dim'],
                        self.agent_params['n_layers'],
                        self.agent_params['size'],
                        self.agent_params['learning_rate'],
                        scope='dyn_model_{}'.format(i))
        self.dyn_models.append(model)

    self.actor = MPCPolicy(
        sess,
        self.env,
        ac_dim=self.agent_params['ac_dim'],
        dyn_models=self.dyn_models,
        horizon=self.agent_params['mpc_horizon'],
        N=self.agent_params['mpc_num_action_sequences'],
    )

    self.replay_buffer = ReplayBuffer()
def __init__(self, env, agent_params):
    super(MBAgent, self).__init__()

    self.env = env.unwrapped
    self.agent_params = agent_params
    self.ensemble_size = self.agent_params['ensemble_size']

    self.dyn_models = []
    for i in range(self.ensemble_size):
        model = FFModel(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['learning_rate'],
        )
        self.dyn_models.append(model)

    self.actor = MPCPolicy(
        self.env,
        ac_dim=self.agent_params['ac_dim'],
        dyn_models=self.dyn_models,
        horizon=self.agent_params['mpc_horizon'],
        N=self.agent_params['mpc_num_action_sequences'],
        sample_strategy=self.agent_params['mpc_action_sampling_strategy'],
        cem_iterations=self.agent_params['cem_iterations'],
        cem_num_elites=self.agent_params['cem_num_elites'],
        cem_alpha=self.agent_params['cem_alpha'],
    )

    self.replay_buffer = ReplayBuffer()
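# For context, the ensemble built above is typically trained by giving each
# member a different slice of the sampled batch, so the models learn slightly
# different dynamics. The following is only a minimal sketch of such a training
# step, not the original code: it assumes FFModel exposes an
# update(observations, actions, next_observations, data_statistics) method that
# returns a scalar loss, and that self.data_statistics has already been fit on
# the replay buffer.
import numpy as np

def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
    losses = []
    # Each ensemble member sees a disjoint chunk of the batch (a cheap bootstrap).
    num_data_per_model = ob_no.shape[0] // self.ensemble_size
    for i, model in enumerate(self.dyn_models):
        start, end = i * num_data_per_model, (i + 1) * num_data_per_model
        # Assumed FFModel API; swap in the real update signature of your model.
        loss = model.update(ob_no[start:end],
                            ac_na[start:end],
                            next_ob_no[start:end],
                            self.data_statistics)
        losses.append(loss)
    return {'Training Loss': np.mean(losses)}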
def calculate_sum_of_rewards(self, obs, candidate_action_sequences, model: FFModel):
    """
    :param obs: numpy array with the *current observation*. Shape [D_obs]
    :param candidate_action_sequences: numpy array with the candidate action
        sequences. Shape [N, H, D_action] where
        - N is the number of action sequences considered
        - H is the horizon
        - D_action is the dimension of the action space
    :param model: The current dynamics model.
    :return: numpy array with the sum of rewards for each action sequence.
        The array should have shape [N].

    # You should sum across `self.horizon` time steps.
    # Hint: you should use model.get_prediction and you shouldn't need
    #       to import pytorch in this file.
    # Hint: Remember that the model can process observations and actions
    #       in batch, which can be much faster than looping through each
    #       action sequence.
    """
    N = candidate_action_sequences.shape[0]
    H = candidate_action_sequences.shape[1]

    # For each candidate action sequence, predict a sequence of
    # states for each dynamics model in your ensemble.
    predicted_obs = np.zeros(shape=(N, H, obs.shape[0]))
    predicted_obs_after_step_i = None
    for step_i in range(H):  # iterate over the H steps of the horizon
        # e.g. x = np.array([1, 2]); np.repeat(x[np.newaxis, :], 3, axis=0)
        #      -> array([[1, 2], [1, 2], [1, 2]])
        if predicted_obs_after_step_i is None:
            obs_batch = np.repeat(obs[np.newaxis, :], N, axis=0)
        else:
            obs_batch = predicted_obs_after_step_i
        action_batch = candidate_action_sequences[:, step_i, :]
        assert action_batch.shape[0] == obs_batch.shape[0]
        predicted_obs_after_step_i = model.get_prediction(
            obs_batch, action_batch, self.data_statistics)
        assert predicted_obs[:, step_i, :].shape == predicted_obs_after_step_i.shape
        predicted_obs[:, step_i, :] = predicted_obs_after_step_i

    # Once you have a sequence of predicted states from each model in
    # your ensemble, calculate the sum of rewards for each sequence
    # using `self.env.get_reward(predicted_obs)`.
    sum_of_rewards = np.zeros(shape=N)
    for action_sequence_i in range(N):
        observations = predicted_obs[action_sequence_i]
        actions = candidate_action_sequences[action_sequence_i]
        # TODO: check whether `done` needs to be used here.
        # Note: `actions` is not actually used inside `get_reward`.
        r_total_list, _ = self.env.get_reward(observations, actions)
        sum_of_rewards[action_sequence_i] = sum(r_total_list)
    return sum_of_rewards
def calculate_sum_of_rewards(
    self,
    obs: np.ndarray,
    candidate_action_sequences: np.ndarray,
    model: FFModel,
):
    """
    :param obs: numpy array with the current observation. Shape [D_obs]
    :param candidate_action_sequences: numpy array with the candidate action
        sequences. Shape [N, H, D_action] where
        - N is the number of action sequences considered
        - H is the horizon
        - D_action is the dimension of the action space
    :param model: The current dynamics model.
    :return: numpy array with the sum of rewards for each action sequence.
        The array should have shape [N].
    """
    # For each candidate action sequence, predict a sequence of
    # states for each dynamics model in your ensemble.
    # Once you have a sequence of predicted states from each model in
    # your ensemble, calculate the sum of rewards for each sequence
    # using `self.env.get_reward(predicted_obs)`.
    # You should sum across `self.horizon` time steps.
    # Hint: you should use model.get_prediction and you shouldn't need
    #       to import pytorch in this file.
    # Hint: Remember that the model can process observations and actions
    #       in batch, which can be much faster than looping through each
    #       action sequence.
    N, H, _ = candidate_action_sequences.shape
    pred_obs = np.zeros((N, H, self.ob_dim))
    pred_obs[:, 0] = np.tile(obs[None, :], (N, 1))
    rewards = np.zeros((N, H))

    for t in range(H):
        rewards[:, t], _ = self.env.get_reward(
            pred_obs[:, t], candidate_action_sequences[:, t])
        if t < H - 1:
            pred_obs[:, t + 1] = model.get_prediction(
                pred_obs[:, t],
                candidate_action_sequences[:, t],
                self.data_statistics,
            )

    sum_of_rewards = rewards.sum(axis=1)
    assert sum_of_rewards.shape == (N,)
    return sum_of_rewards
def calculate_sum_of_rewards_new(self, obs, candidate_action_sequences, model: FFModel):
    """
    :param obs: numpy array with the *current observation*. Shape [D_obs]
    :param candidate_action_sequences: numpy array with the candidate action
        sequences. Shape [N, H, D_action] where
        - N is the number of action sequences considered
        - H is the horizon
        - D_action is the dimension of the action space
    :param model: The current dynamics model.
    :return: numpy array with the sum of rewards for each action sequence.
        The array should have shape [N].

    # You should sum across `self.horizon` time steps.
    # Hint: you should use model.get_prediction and you shouldn't need
    #       to import pytorch in this file.
    # Hint: Remember that the model can process observations and actions
    #       in batch, which can be much faster than looping through each
    #       action sequence.
    """
    N = candidate_action_sequences.shape[0]
    assert self.N == N
    H = candidate_action_sequences.shape[1]
    sum_of_rewards = np.zeros(shape=N)

    # For each candidate action sequence, predict a sequence of states with the
    # given dynamics model, then accumulate the reward of each predicted state
    # using `self.env.get_reward(predicted_obs)`.
    obs = np.repeat(obs[np.newaxis, :], N, axis=0)
    for t in range(H):
        obs_predicted = model.get_prediction(
            obs, candidate_action_sequences[:, t, :], self.data_statistics)
        # Reward for reaching the predicted next state. `get_reward` only uses
        # the observations (it works on a batch and ignores the actions), so a
        # dummy action is passed in.
        rewards, dones = self.env.get_reward(obs_predicted, np.zeros(shape=1))
        sum_of_rewards += rewards
        # Roll the predicted state forward so the next step starts from it.
        obs = obs_predicted
    return sum_of_rewards
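# To show where calculate_sum_of_rewards plugs in, here is a minimal
# random-shooting get_action sketch (an illustration, not the original
# MPCPolicy code). It assumes the policy stores the action-space bounds as
# self.low / self.high and re-plans from scratch at every environment step.
import numpy as np

def get_action(self, obs):
    # Sample N candidate action sequences uniformly within the action bounds.
    candidate_action_sequences = np.random.uniform(
        self.low, self.high, size=(self.N, self.horizon, self.ac_dim))

    # Score each sequence under every model and average across the ensemble,
    # so no single model's optimism dominates the ranking.
    predicted_returns = np.mean(
        [self.calculate_sum_of_rewards(obs, candidate_action_sequences, model)
         for model in self.dyn_models],
        axis=0)

    # Execute only the first action of the best-scoring sequence (MPC).
    best_sequence = candidate_action_sequences[np.argmax(predicted_returns)]
    return best_sequence[0][None]  # shape [1, D_action]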