class BCAgent(BaseAgent):
    def __init__(self, sess, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.sess = sess
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(sess,
                                 self.agent_params['ac_dim'],
                                 self.agent_params['ob_dim'],
                                 self.agent_params['n_layers'],
                                 self.agent_params['size'],
                                 discrete=self.agent_params['discrete'],
                                 learning_rate=self.agent_params['learning_rate'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # training a BC agent refers to updating its actor using
        # the given observations and corresponding action labels
        self.actor.update(ob_no, ac_na)

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)
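The methods above are meant to be driven by an outer behavioral-cloning loop: store expert rollouts in the replay buffer, then repeatedly sample a batch and run a supervised update of the actor. A minimal sketch of such a loop follows; `collect_expert_paths`, the parameter values, and the assumption that `sample_random_data` returns the five arrays `train` expects are all illustrative placeholders, not part of the code shown here.

# Hypothetical driver loop for the sess-based BCAgent above (placeholders, not course code).
agent_params = {
    'ac_dim': 2, 'ob_dim': 4, 'n_layers': 2, 'size': 64,
    'discrete': False, 'learning_rate': 5e-3,
    'max_replay_buffer_size': 1000000,
}
agent = BCAgent(sess, env, agent_params)

paths = collect_expert_paths(env)   # placeholder: list of expert rollouts
agent.add_to_replay_buffer(paths)   # store the demonstrations once

for step in range(1000):
    # assumes sample_random_data returns (obs, acs, rews, next_obs, terminals)
    ob_no, ac_na, re_n, next_ob_no, terminal_n = agent.sample(batch_size=100)
    agent.train(ob_no, ac_na, re_n, next_ob_no, terminal_n)  # supervised actor update
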
class BCAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params["ac_dim"],
            self.agent_params["ob_dim"],
            self.agent_params["n_layers"],
            self.agent_params["size"],
            discrete=self.agent_params["discrete"],
            learning_rate=self.agent_params["learning_rate"],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.agent_params["max_replay_buffer_size"])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # training a BC agent refers to updating its actor using
        # the given observations and corresponding action labels
        log = self.actor.update(ob_no, ac_na)
        return log

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)

    def save(self, path):
        return self.actor.save(path)
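MLPPolicySL itself is not shown in any of these examples, so as a rough illustration of the supervised update behind `self.actor.update(ob_no, ac_na)`, here is a self-contained PyTorch sketch of a continuous-action policy trained with a plain MSE loss. The class name, architecture, and loss choice are assumptions for illustration, not the course implementation.

import torch
from torch import nn, optim

# Hedged sketch of a supervised-learning policy update (NOT the assignment's MLPPolicySL).
class TinyMLPPolicy(nn.Module):
    def __init__(self, ob_dim, ac_dim, n_layers=2, size=64, learning_rate=5e-3):
        super().__init__()
        layers, in_dim = [], ob_dim
        for _ in range(n_layers):
            layers += [nn.Linear(in_dim, size), nn.Tanh()]
            in_dim = size
        layers.append(nn.Linear(in_dim, ac_dim))
        self.net = nn.Sequential(*layers)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        self.loss_fn = nn.MSELoss()

    def update(self, observations, actions):
        # Behavioral cloning: regress predicted actions onto the expert action labels.
        observations = torch.as_tensor(observations, dtype=torch.float32)
        actions = torch.as_tensor(actions, dtype=torch.float32)
        loss = self.loss_fn(self.net(observations), actions)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return {'Training Loss': loss.item()}
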
import tensorflow as tf  # this Keras-based variant needs TensorFlow 2.x


class BCAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        if self.agent_params['discrete']:
            self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'],
                                           self.agent_params['ob_dim'],
                                           self.agent_params['n_layers'],
                                           self.agent_params['size'])
        else:
            self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'],
                                             self.agent_params['ob_dim'],
                                             self.agent_params['n_layers'],
                                             self.agent_params['size'])

        self.loss = tf.keras.losses.MeanSquaredError()
        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.agent_params['learning_rate'])

        self.actor.compile(optimizer=self.optimizer, loss=self.loss)

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'])

    def train_multi_iter(self, batch_size, num_iters):
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(self.replay_buffer.obs,
                     tf.float32), tf.cast(self.replay_buffer.acs, tf.float32)))
        dataset = dataset.shuffle(self.replay_buffer.obs.shape[0])
        dataset = dataset.batch(batch_size=batch_size,
                                drop_remainder=True).repeat()
        self.actor.fit(dataset, epochs=1, steps_per_epoch=num_iters)

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # training a BC agent refers to updating its actor using
        # the given observations and corresponding action labels
        with tf.GradientTape() as tape:
            pred_actions = self.actor(ob_no)
            loss_value = self.loss(ac_na, pred_actions)
        trainable_vars = self.actor.trainable_variables
        grads = tape.gradient(loss_value, trainable_vars)
        self.optimizer.apply_gradients(zip(grads, trainable_vars))
        return loss_value

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)
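The Keras variant above adds a `train_multi_iter` helper that wraps the whole replay buffer in a `tf.data.Dataset` (shuffled, batched with `drop_remainder=True`, and repeated) and lets `Model.fit` with `steps_per_epoch` drive the gradient steps instead of a hand-written loop. A short usage sketch, where `agent_params`, `expert_paths`, and the values are placeholders:

# Hypothetical usage of the Keras-style BCAgent above.
agent = BCAgent(env, agent_params)
agent.add_to_replay_buffer(expert_paths)                 # fill the buffer with demonstrations
agent.train_multi_iter(batch_size=100, num_iters=1000)   # one fit() call, 1000 gradient steps
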
Example #4
class BCAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
            siren=self.agent_params['siren'],
            train_separate_offset=self.agent_params['train_separate_params'],
            supervision_mode=self.agent_params['supervision_mode'],
            offset_learning_rate=self.agent_params['offset_learning_rate'],
            auto_cast=self.agent_params['auto_cast'],
            gradient_loss_scale=self.agent_params['gradient_loss_scale'],
            additional_activation=self.agent_params['additional_activation'],
            omega=self.agent_params['omega'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'],
            epsilon_s=self.agent_params['epsilon_s'])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n, gradients):
        # training a BC agent refers to updating its actor using
        # the given observations and corresponding action labels
        log = self.actor.update(
            ob_no, ac_na, gradients=gradients)  # HW1: you will modify this
        return log

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(
            batch_size)  # HW1: you will modify this

    def save(self, path):
        return self.actor.save(path)
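This variant passes extra `gradients` targets into `actor.update`, and its constructor options (`siren`, `supervision_mode`, `gradient_loss_scale`, `omega`) suggest a policy that can be supervised on gradients of the target actions as well as on the actions themselves. Since that MLPPolicySL is not shown, the following PyTorch sketch of a two-term loss is only an assumption about what such an update might combine; every name in it is illustrative.

import torch
from torch import nn

# Hedged sketch only: action MSE plus a scaled penalty on input-gradients of the
# predicted actions, matched to supplied gradient targets. Not the assignment's code.
def combined_bc_loss(policy: nn.Module, obs, action_targets, gradient_targets,
                     gradient_loss_scale=1.0):
    obs = torch.as_tensor(obs, dtype=torch.float32).requires_grad_(True)
    action_targets = torch.as_tensor(action_targets, dtype=torch.float32)
    gradient_targets = torch.as_tensor(gradient_targets, dtype=torch.float32)

    pred_actions = policy(obs)
    action_loss = nn.functional.mse_loss(pred_actions, action_targets)

    # d(sum of predicted actions)/d(obs), compared against the gradient targets.
    pred_grads = torch.autograd.grad(pred_actions.sum(), obs, create_graph=True)[0]
    grad_loss = nn.functional.mse_loss(pred_grads, gradient_targets)

    return action_loss + gradient_loss_scale * grad_loss
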
Example #5
class BCAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """Update the actor policy by supervised learning,
        given observations and action labels.
        - ob_no: observations
        - ac_na: action labels
        - re_n, next_ob_no, terminal_n: rewards, next observations, and terminal
          flags from the sampled transitions; unused for behavioral cloning.
        """
        log = self.actor.update(ob_no, ac_na)
        return log

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(
            batch_size)  # HW1: you will modify this

    def save(self, path):
        return self.actor.save(path)
Example #6
class BCAgent(BaseAgent):
  def __init__(self, env, agent_params):
    super(BCAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params

    # actor/policy
    self.actor = MLPPolicySL(
      self.agent_params['ac_dim'],
      self.agent_params['ob_dim'],
      self.agent_params['n_layers'],
      self.agent_params['size'],
      discrete=self.agent_params['discrete'],
      learning_rate=self.agent_params['learning_rate'],
    )

    # replay buffer
    self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])

  def train(self, ob_no, ac_na, re_n=None, next_ob_no=None, terminal_n=None):
    '''
     self.actor.update(
        self, observations, actions,
        adv_n=None, acs_labels_na=None, qvals=None
    )
    '''
    # training a BC agent refers to updating its actor using
    # the given observations and corresponding action labels (the expert data)
    log = self.actor.update(ob_no, ac_na)  # HW1: you will modify this
    return log

  def add_to_replay_buffer(self, paths):
    self.replay_buffer.add_rollouts(paths)

  def sample(self, batch_size):
    return self.replay_buffer.sample_random_data(batch_size)  # HW1: you will modify this

  def save(self, path):
    return self.actor.save(path)
import numpy as np  # used for the ensemble loss average and replay-buffer statistics


class MBAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(MBAgent, self).__init__()

        self.env = env.unwrapped
        self.agent_params = agent_params
        self.ensemble_size = self.agent_params['ensemble_size']

        self.dyn_models = []
        for i in range(self.ensemble_size):
            model = FFModel(
                self.agent_params['ac_dim'],
                self.agent_params['ob_dim'],
                self.agent_params['n_layers'],
                self.agent_params['size'],
                self.agent_params['learning_rate'],
            )
            self.dyn_models.append(model)

        self.actor = MPCPolicy(
            self.env,
            ac_dim=self.agent_params['ac_dim'],
            dyn_models=self.dyn_models,
            horizon=self.agent_params['mpc_horizon'],
            N=self.agent_params['mpc_num_action_sequences'],
        )

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):

        # training an MB agent refers to updating the predictive model using observed state transitions
        # NOTE: each model in the ensemble is trained on a different random batch of size batch_size
        losses = []
        num_data = ob_no.shape[0]
        num_data_per_ens = int(num_data / self.ensemble_size)

        for i in range(self.ensemble_size):

            # select which datapoints to use for this model of the ensemble
            # you might find the num_data_per_ens variable defined above useful

            start_idx = i * num_data_per_ens
            end_idx = (i + 1) * num_data_per_ens
            observations = ob_no[start_idx:end_idx]  # TODO(Q1)
            actions = ac_na[start_idx:end_idx]  # TODO(Q1)
            next_observations = next_ob_no[start_idx:end_idx]  # TODO(Q1)

            # use datapoints to update one of the dyn_models
            model = self.dyn_models[i]  # TODO(Q1)
            log = model.update(observations, actions, next_observations,
                               self.data_statistics)
            loss = log['Training Loss']
            losses.append(loss)

        avg_loss = np.mean(losses)
        return {
            'Training Loss': avg_loss,
        }

    def add_to_replay_buffer(self, paths, add_sl_noise=False):

        # add data to replay buffer
        self.replay_buffer.add_rollouts(paths, noised=add_sl_noise)

        # get updated mean/std of the data in our replay buffer
        self.data_statistics = {
            'obs_mean': np.mean(self.replay_buffer.obs, axis=0),
            'obs_std': np.std(self.replay_buffer.obs, axis=0),
            'acs_mean': np.mean(self.replay_buffer.acs, axis=0),
            'acs_std': np.std(self.replay_buffer.acs, axis=0),
            'delta_mean': np.mean(
                self.replay_buffer.next_obs - self.replay_buffer.obs, axis=0),
            'delta_std': np.std(
                self.replay_buffer.next_obs - self.replay_buffer.obs, axis=0),
        }

        # keep the actor's data_statistics in sync, so that actor.get_action works with up-to-date statistics
        self.actor.data_statistics = self.data_statistics

    def sample(self, batch_size):
        # NOTE: sampling batch_size * ensemble_size,
        # so each model in our ensemble can get trained on batch_size data
        return self.replay_buffer.sample_random_data(batch_size *
                                                     self.ensemble_size)
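
FFModel is not shown here either; the `data_statistics` computed in `add_to_replay_buffer` are the kind of statistics a delta-predicting dynamics model typically uses for input normalization and output unnormalization. The NumPy sketch below shows that pattern with a placeholder `model_forward` standing in for the network; it is an assumption about how the pieces fit together, not the assignment's implementation. Note also that `sample` draws `batch_size * self.ensemble_size` transitions precisely so that `train` can hand each ensemble member its own slice of `batch_size` datapoints.

import numpy as np

# Hedged sketch: delta-based next-state prediction using the data_statistics dict
# above. `model_forward` is a hypothetical placeholder for the (unshown) FFModel net.
def predict_next_obs(obs, acs, data_statistics, model_forward, eps=1e-8):
    # Normalize inputs with the replay-buffer statistics.
    obs_norm = (obs - data_statistics['obs_mean']) / (data_statistics['obs_std'] + eps)
    acs_norm = (acs - data_statistics['acs_mean']) / (data_statistics['acs_std'] + eps)

    # The network predicts a normalized state delta rather than the next state itself.
    delta_norm = model_forward(np.concatenate([obs_norm, acs_norm], axis=1))

    # Unnormalize the delta and add it to the current observation.
    delta = delta_norm * data_statistics['delta_std'] + data_statistics['delta_mean']
    return obs + delta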