Example no. 1
    def __init__(self, sess, env, agent_params):
        super(MBAgent, self).__init__()

        self.env = env.unwrapped
        self.sess = sess
        self.agent_params = agent_params
        self.ensemble_size = self.agent_params['ensemble_size']

        self.dyn_models = []
        for i in range(self.ensemble_size):
            model = FFModel(sess,
                            self.agent_params['ac_dim'],
                            self.agent_params['ob_dim'],
                            self.agent_params['n_layers'],
                            self.agent_params['size'],
                            self.agent_params['learning_rate'],
                            scope='dyn_model_{}'.format(i))
            self.dyn_models.append(model)

        self.actor = MPCPolicy(
            sess,
            self.env,
            ac_dim=self.agent_params['ac_dim'],
            dyn_models=self.dyn_models,
            horizon=self.agent_params['mpc_horizon'],
            N=self.agent_params['mpc_num_action_sequences'],
        )

        self.replay_buffer = ReplayBuffer()
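
The ensemble built in this constructor only pays off if each FFModel ends up fitting slightly different data. Below is a minimal sketch of a matching train step (assuming `numpy` is imported as `np`); the `update` signature on FFModel, the `self.data_statistics` attribute, and the incoming batch arguments are assumptions made for illustration, not confirmed by the snippet above.

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # Sketch: fit each ensemble member on its own random slice of the batch,
        # so the models disagree slightly and the ensemble captures uncertainty.
        num_data = ob_no.shape[0]
        num_data_per_model = num_data // self.ensemble_size
        losses = []
        for model in self.dyn_models:
            idx = np.random.permutation(num_data)[:num_data_per_model]
            loss = model.update(ob_no[idx], ac_na[idx], next_ob_no[idx],
                                self.data_statistics)  # assumed FFModel API
            losses.append(loss)
        return np.mean(losses)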
Example no. 2
    def __init__(self, env, agent_params):
        super(MBAgent, self).__init__()

        self.env = env.unwrapped
        self.agent_params = agent_params
        self.ensemble_size = self.agent_params['ensemble_size']

        self.dyn_models = []
        for i in range(self.ensemble_size):
            model = FFModel(
                self.agent_params['ac_dim'],
                self.agent_params['ob_dim'],
                self.agent_params['n_layers'],
                self.agent_params['size'],
                self.agent_params['learning_rate'],
            )
            self.dyn_models.append(model)

        self.actor = MPCPolicy(
            self.env,
            ac_dim=self.agent_params['ac_dim'],
            dyn_models=self.dyn_models,
            horizon=self.agent_params['mpc_horizon'],
            N=self.agent_params['mpc_num_action_sequences'],
            sample_strategy=self.agent_params['mpc_action_sampling_strategy'],
            cem_iterations=self.agent_params['cem_iterations'],
            cem_num_elites=self.agent_params['cem_num_elites'],
            cem_alpha=self.agent_params['cem_alpha'],
        )

        self.replay_buffer = ReplayBuffer()
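
Example no. 2 passes `sample_strategy` and the `cem_*` parameters through to MPCPolicy but does not show how they are used. The sketch below illustrates the usual split between random shooting and CEM; the method name `sample_action_sequences`, the `self.low`/`self.high` action bounds, and the `evaluate_candidate_sequences` helper are assumptions for illustration only.

    def sample_action_sequences(self, num_sequences, horizon, obs=None):
        if self.sample_strategy == 'random':
            # Random shooting: sample N x H actions uniformly within the bounds.
            return np.random.uniform(self.low, self.high,
                                     size=(num_sequences, horizon, self.ac_dim))
        # CEM: repeatedly refit a diagonal Gaussian to the elite sequences.
        mean = np.zeros((horizon, self.ac_dim))
        std = np.ones((horizon, self.ac_dim))
        for _ in range(self.cem_iterations):
            samples = np.clip(
                np.random.normal(mean, std,
                                 size=(num_sequences, horizon, self.ac_dim)),
                self.low, self.high)
            scores = self.evaluate_candidate_sequences(samples, obs)  # assumed helper
            elites = samples[np.argsort(scores)[-self.cem_num_elites:]]
            mean = self.cem_alpha * elites.mean(axis=0) + (1 - self.cem_alpha) * mean
            std = self.cem_alpha * elites.std(axis=0) + (1 - self.cem_alpha) * std
        return mean[None]  # the CEM plan is the final mean sequence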
Example no. 3
    def calculate_sum_of_rewards(self, obs, candidate_action_sequences,
                                 model: FFModel):
        """
        :param obs: numpy array with the *current observation*. Shape [D_obs].
        :param candidate_action_sequences: numpy array with the candidate action
            sequences. Shape [N, H, D_action] where
            - N is the number of action sequences considered
            - H is the horizon
            - D_action is the dimension of the action space
        :param model: The current dynamics model.
        :return: numpy array with the sum of rewards for each action sequence.
            The array should have shape [N].

        # You should sum across `self.horizon` time steps.
        # Hint: you should use model.get_prediction and you shouldn't need
        #       to import pytorch in this file.
        # Hint: Remember that the model can process observations and actions
        #       in batch, which can be much faster than looping through each
        #       action sequence.
        """
        N = candidate_action_sequences.shape[0]
        H = candidate_action_sequences.shape[1]
        # For each candidate action sequence, predict a sequence of
        # states for each dynamics model in your ensemble.
        predicted_obs = np.zeros(shape=(N, H, obs.shape[0]))
        predicted_obs_after_step_i = None
        for step_i in range(H):  # iterate over the H steps of the horizon
            # e.g. x = np.array([1, 2]); np.repeat(x[np.newaxis, :], 3, axis=0)
            #      -> array([[1, 2], [1, 2], [1, 2]])
            if predicted_obs_after_step_i is None:
                obs_batch = np.repeat(obs[np.newaxis, :], N, axis=0)
            else:
                obs_batch = predicted_obs_after_step_i
            action_batch = candidate_action_sequences[:, step_i, :]
            assert action_batch.shape[0] == obs_batch.shape[0]
            predicted_obs_after_step_i = model.get_prediction(
                obs_batch, action_batch, self.data_statistics)
            assert (predicted_obs[:, step_i, :].shape ==
                    predicted_obs_after_step_i.shape)
            predicted_obs[:, step_i, :] = predicted_obs_after_step_i

        # Once you have a sequence of predicted states from each model in
        # your ensemble, calculate the sum of rewards for each sequence
        # using `self.env.get_reward(predicted_obs)`
        sum_of_rewards = np.zeros(shape=N)
        for action_sequence_i in range(N):
            observations = predicted_obs[action_sequence_i]
            actions = candidate_action_sequences[action_sequence_i]
            # TODO: check whether the `done` flags returned here need to be used.
            r_total_list, _ = self.env.get_reward(
                observations,
                actions)  # note: `actions` may not actually be used inside `get_reward`
            sum_of_rewards[action_sequence_i] = sum(r_total_list)

        return sum_of_rewards
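
This per-model estimate is typically consumed by the policy's action selection: average the sums of rewards over the ensemble and execute only the first action of the best sequence. A sketch of that step follows; `sample_action_sequences` is an assumed helper and the leading batch dimension on the returned action is a guess.

    def get_action(self, obs):
        # Sketch of MPC action selection on top of calculate_sum_of_rewards.
        candidate_action_sequences = self.sample_action_sequences(
            num_sequences=self.N, horizon=self.horizon, obs=obs)  # assumed helper
        # Average the reward estimates across the ensemble, shape [N].
        mean_rewards = np.mean(
            [self.calculate_sum_of_rewards(obs, candidate_action_sequences, model)
             for model in self.dyn_models],
            axis=0)
        best_sequence = candidate_action_sequences[mean_rewards.argmax()]
        return best_sequence[0][None]  # execute only the first action (MPC)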
Example no. 4
    def calculate_sum_of_rewards(
        self,
        obs: np.ndarray,
        candidate_action_sequences: np.ndarray,
        model: FFModel,
    ):
        """

        :param obs: numpy array with the current observation. Shape [D_obs]
        :param candidate_action_sequences: numpy array with the candidate action
        sequences. Shape [N, H, D_action] where
            - N is the number of action sequences considered
            - H is the horizon
            - D_action is the dimension of the action space
        :param model: The current dynamics model.
        :return: numpy array with the sum of rewards for each action sequence.
        The array should have shape [N].
        """
        # For each candidate action sequence, predict a sequence of
        # states for each dynamics model in your ensemble.
        # Once you have a sequence of predicted states from each model in
        # your ensemble, calculate the sum of rewards for each sequence
        # using `self.env.get_reward(predicted_obs)`
        # You should sum across `self.horizon` time steps.
        # Hint: you should use model.get_prediction and you shouldn't need
        #       to import pytorch in this file.
        # Hint: Remember that the model can process observations and actions
        #       in batch, which can be much faster than looping through each
        #       action sequence.

        N, H, _ = candidate_action_sequences.shape

        pred_obs = np.zeros((N, H, self.ob_dim))
        pred_obs[:, 0] = np.tile(obs[None, :], (N, 1))
        rewards = np.zeros((N, H))
        for t in range(H):
            rewards[:, t], _ = self.env.get_reward(
                pred_obs[:, t], candidate_action_sequences[:, t])
            if t < H - 1:
                pred_obs[:, t + 1] = model.get_prediction(
                    pred_obs[:, t],
                    candidate_action_sequences[:, t],
                    self.data_statistics,
                )

        sum_of_rewards = rewards.sum(axis=1)
        assert sum_of_rewards.shape == (N, )
        return sum_of_rewards
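
A quick, self-contained shape check of the vectorised bookkeeping above, with stub dynamics and a stub reward standing in for `model.get_prediction` and `self.env.get_reward`:

import numpy as np

N, H, D_obs, D_action = 5, 3, 4, 2
obs = np.random.randn(D_obs)
candidate_action_sequences = np.random.uniform(-1, 1, size=(N, H, D_action))

pred_obs = np.zeros((N, H, D_obs))
pred_obs[:, 0] = np.tile(obs[None, :], (N, 1))
rewards = np.zeros((N, H))
for t in range(H):
    rewards[:, t] = -(pred_obs[:, t] ** 2).sum(axis=1)  # stub reward
    if t < H - 1:
        # stub dynamics: drift the state by the mean of the chosen action
        pred_obs[:, t + 1] = pred_obs[:, t] + candidate_action_sequences[:, t].mean(
            axis=1, keepdims=True)

sum_of_rewards = rewards.sum(axis=1)
assert sum_of_rewards.shape == (N,)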
Example no. 5
    def calculate_sum_of_rewards_new(self, obs, candidate_action_sequences,
                                     model: FFModel):
        """
        :param obs: numpy array with the *current observation*. Shape [D_obs].
        :param candidate_action_sequences: numpy array with the candidate action
            sequences. Shape [N, H, D_action] where
            - N is the number of action sequences considered
            - H is the horizon
            - D_action is the dimension of the action space
        :param model: The current dynamics model.
        :return: numpy array with the sum of rewards for each action sequence.
            The array should have shape [N].

        # You should sum across `self.horizon` time steps.
        # Hint: you should use model.get_prediction and you shouldn't need
        #       to import pytorch in this file.
        # Hint: Remember that the model can process observations and actions
        #       in batch, which can be much faster than looping through each
        #       action sequence.
        """
        N = candidate_action_sequences.shape[0]
        assert self.N == N
        H = candidate_action_sequences.shape[1]
        sum_of_rewards = np.zeros(shape=N)

        # For each candidate action sequence, predict the sequence of states by
        # rolling the dynamics model forward one step at a time, and accumulate
        # the batch of per-step rewards from `self.env.get_reward`.
        obs = np.repeat(obs[np.newaxis, :], N, axis=0)
        for t in range(H):
            obs_predicted = model.get_prediction(
                obs, candidate_action_sequences[:, t, :], self.data_statistics)
            # reward for arriving at the predicted next state (computed in batch)
            rewards, dones = self.env.get_reward(
                obs_predicted, candidate_action_sequences[:, t, :])
            sum_of_rewards += rewards
            obs = obs_predicted  # advance the rollout from the predicted state

        return sum_of_rewards
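
Examples 3 and 5 broadcast the starting observation to a batch of N rollouts with `np.repeat`, while Example 4 uses `np.tile`; the two (and a copy-free `np.broadcast_to`) produce the same array here, as this quick check shows:

import numpy as np

obs = np.array([1.0, 2.0, 3.0])
N = 4
a = np.repeat(obs[np.newaxis, :], N, axis=0)    # Examples 3 and 5
b = np.tile(obs[None, :], (N, 1))               # Example 4
c = np.broadcast_to(obs, (N, obs.shape[0]))     # read-only view, no copy
assert np.array_equal(a, b) and np.array_equal(a, c)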