# --- BCAgent, TensorFlow 1 variant (session-based) ---
class BCAgent(BaseAgent):

    def __init__(self, sess, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.sess = sess
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            sess,
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # training a BC agent means updating its actor by supervised learning
        # on the given observations and corresponding action labels
        self.actor.update(ob_no, ac_na)

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)
# --- BCAgent, session-free variant; train() returns the actor's training log ---
class BCAgent(BaseAgent):

    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params["ac_dim"],
            self.agent_params["ob_dim"],
            self.agent_params["n_layers"],
            self.agent_params["size"],
            discrete=self.agent_params["discrete"],
            learning_rate=self.agent_params["learning_rate"],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.agent_params["max_replay_buffer_size"])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # training a BC agent means updating its actor by supervised learning
        # on the given observations and corresponding action labels
        log = self.actor.update(ob_no, ac_na)
        return log

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)

    def save(self, path):
        return self.actor.save(path)
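# Usage sketch (an addition, not from the original source): how a BCAgent like
# the one above might be driven in a simple behavior-cloning loop. It assumes a
# cs285-style ReplayBuffer whose sample_random_data returns the tuple
# (obs, acs, rews, next_obs, terminals); `expert_paths`, `env`, `agent_params`,
# and the iteration counts are hypothetical placeholders.
def run_bc_training(env, expert_paths, agent_params, n_iter=1000, batch_size=100):
    agent = BCAgent(env, agent_params)
    # for plain BC the expert data is fixed: add it once, then train repeatedly
    agent.add_to_replay_buffer(expert_paths)
    for itr in range(n_iter):
        ob_no, ac_na, re_n, next_ob_no, terminal_n = agent.sample(batch_size)
        log = agent.train(ob_no, ac_na, re_n, next_ob_no, terminal_n)
    return agent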
# --- BCAgent, TensorFlow 2 / Keras variant with a tf.data bulk-training path ---
class BCAgent(BaseAgent):

    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        if self.agent_params['discrete']:
            self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'],
                                           self.agent_params['ob_dim'],
                                           self.agent_params['n_layers'],
                                           self.agent_params['size'])
        else:
            self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'],
                                             self.agent_params['ob_dim'],
                                             self.agent_params['n_layers'],
                                             self.agent_params['size'])
        self.loss = tf.keras.losses.MeanSquaredError()
        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.agent_params['learning_rate'])
        self.actor.compile(optimizer=self.optimizer, loss=self.loss)

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'])

    def train_multi_iter(self, batch_size, num_iters):
        # run num_iters supervised steps over the whole buffer via model.fit
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(self.replay_buffer.obs, tf.float32),
             tf.cast(self.replay_buffer.acs, tf.float32)))
        dataset = dataset.shuffle(self.replay_buffer.obs.shape[0])
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True).repeat()
        self.actor.fit(dataset, epochs=1, steps_per_epoch=num_iters)

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # training a BC agent means updating its actor by supervised learning
        # on the given observations and corresponding action labels:
        # a single manual GradientTape step
        with tf.GradientTape() as tape:
            pred_actions = self.actor(ob_no)
            loss_value = self.loss(ac_na, pred_actions)
        trainable_vars = self.actor.trainable_variables
        grads = tape.gradient(loss_value, trainable_vars)
        self.optimizer.apply_gradients(zip(grads, trainable_vars))
        return loss_value

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)
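# Self-contained illustration (an addition, using synthetic stand-in data) of
# the tf.data pattern that train_multi_iter above relies on: shuffle the whole
# buffer, batch with drop_remainder=True, and repeat so that model.fit can run
# an exact number of gradient steps via steps_per_epoch.
import numpy as np
import tensorflow as tf

obs = np.random.randn(1000, 4).astype(np.float32)  # stand-in for replay_buffer.obs
acs = np.random.randn(1000, 2).astype(np.float32)  # stand-in for replay_buffer.acs

dataset = tf.data.Dataset.from_tensor_slices((obs, acs))
dataset = dataset.shuffle(obs.shape[0])
dataset = dataset.batch(64, drop_remainder=True).repeat()

model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='tanh'),
    tf.keras.layers.Dense(2),
])
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss=tf.keras.losses.MeanSquaredError())
model.fit(dataset, epochs=1, steps_per_epoch=100)  # exactly 100 supervised steps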
# --- BCAgent variant extended with SIREN / offset-network and gradient-supervision options ---
class BCAgent(BaseAgent):

    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
            siren=self.agent_params['siren'],
            train_separate_offset=self.agent_params['train_separate_params'],
            supervision_mode=self.agent_params['supervision_mode'],
            offset_learning_rate=self.agent_params['offset_learning_rate'],
            auto_cast=self.agent_params['auto_cast'],
            gradient_loss_scale=self.agent_params['gradient_loss_scale'],
            additional_activation=self.agent_params['additional_activation'],
            omega=self.agent_params['omega'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'],
            epsilon_s=self.agent_params['epsilon_s'])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n, gradients):
        # training a BC agent means updating its actor by supervised learning
        # on the given observations and corresponding action labels
        log = self.actor.update(
            ob_no, ac_na, gradients=gradients)  # HW1: you will modify this
        return log

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(
            batch_size)  # HW1: you will modify this

    def save(self, path):
        return self.actor.save(path)
# --- BCAgent variant with a documented train() signature ---
class BCAgent(BaseAgent):

    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """
        Update the actor policy by supervised learning, given observations
        and action labels.
        - ob_no: observations
        - ac_na: action labels
        - re_n: rewards (unused for behavior cloning)
        - next_ob_no: next observations (unused for behavior cloning)
        - terminal_n: terminal flags (unused for behavior cloning)
        """
        log = self.actor.update(ob_no, ac_na)
        return log

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(
            batch_size)  # HW1: you will modify this

    def save(self, path):
        return self.actor.save(path)
# --- BCAgent variant with optional train() arguments ---
class BCAgent(BaseAgent):

    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])

    def train(self, ob_no, ac_na, re_n=None, next_ob_no=None, terminal_n=None):
        # underlying signature:
        #   self.actor.update(observations, actions,
        #                     adv_n=None, acs_labels_na=None, qvals=None)
        #
        # training a BC agent means updating its actor by supervised learning
        # on the given observations and corresponding action labels (expert data)
        log = self.actor.update(ob_no, ac_na)  # HW1: you will modify this
        return log

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)  # HW1: you will modify this

    def save(self, path):
        return self.actor.save(path)
# --- MBAgent: model-based agent with an ensemble of learned dynamics models and an MPC actor ---
class MBAgent(BaseAgent):

    def __init__(self, env, agent_params):
        super(MBAgent, self).__init__()

        self.env = env.unwrapped
        self.agent_params = agent_params
        self.ensemble_size = self.agent_params['ensemble_size']

        # ensemble of learned dynamics models
        self.dyn_models = []
        for i in range(self.ensemble_size):
            model = FFModel(
                self.agent_params['ac_dim'],
                self.agent_params['ob_dim'],
                self.agent_params['n_layers'],
                self.agent_params['size'],
                self.agent_params['learning_rate'],
            )
            self.dyn_models.append(model)

        # the MPC actor plans action sequences through the dynamics ensemble
        self.actor = MPCPolicy(
            self.env,
            ac_dim=self.agent_params['ac_dim'],
            dyn_models=self.dyn_models,
            horizon=self.agent_params['mpc_horizon'],
            N=self.agent_params['mpc_num_action_sequences'],
        )

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # training an MB agent means updating the predictive dynamics models
        # using observed state transitions
        # NOTE: each model in the ensemble is trained on a different slice
        # of the randomly sampled batch
        losses = []
        num_data = ob_no.shape[0]
        num_data_per_ens = int(num_data / self.ensemble_size)

        for i in range(self.ensemble_size):
            # select which datapoints to use for this model of the ensemble
            start_idx = i * num_data_per_ens
            end_idx = (i + 1) * num_data_per_ens
            observations = ob_no[start_idx:end_idx]
            actions = ac_na[start_idx:end_idx]
            next_observations = next_ob_no[start_idx:end_idx]

            # use these datapoints to update one of the dyn_models
            model = self.dyn_models[i]
            log = model.update(observations, actions, next_observations,
                               self.data_statistics)
            losses.append(log['Training Loss'])

        return {'Training Loss': np.mean(losses)}

    def add_to_replay_buffer(self, paths, add_sl_noise=False):
        # add data to replay buffer
        self.replay_buffer.add_rollouts(paths, noised=add_sl_noise)

        # get updated mean/std of the data in our replay buffer
        self.data_statistics = {
            'obs_mean': np.mean(self.replay_buffer.obs, axis=0),
            'obs_std': np.std(self.replay_buffer.obs, axis=0),
            'acs_mean': np.mean(self.replay_buffer.acs, axis=0),
            'acs_std': np.std(self.replay_buffer.acs, axis=0),
            'delta_mean': np.mean(
                self.replay_buffer.next_obs - self.replay_buffer.obs, axis=0),
            'delta_std': np.std(
                self.replay_buffer.next_obs - self.replay_buffer.obs, axis=0),
        }

        # update the actor's data_statistics too, so actor.get_action
        # normalizes inputs and denormalizes predictions correctly
        self.actor.data_statistics = self.data_statistics

    def sample(self, batch_size):
        # NOTE: sampling batch_size * ensemble_size transitions,
        # so each model in the ensemble gets trained on batch_size datapoints
        return self.replay_buffer.sample_random_data(batch_size * self.ensemble_size)
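# Usage sketch (an addition, not from the original source) of the outer
# model-based RL loop the MBAgent above is built for: collect on-policy
# rollouts with the MPC actor, aggregate them (which also refreshes
# data_statistics), then train the dynamics ensemble on random batches.
# `sample_n_trajectories` and all the counts here are hypothetical
# placeholders for the surrounding infrastructure.
def run_mb_training(env, agent_params, n_iter=10, batch_size=512,
                    train_steps_per_iter=500):
    agent = MBAgent(env, agent_params)
    for itr in range(n_iter):
        # on-policy collection: the MPC actor plans through the current ensemble
        paths = sample_n_trajectories(env, agent.actor, ntraj=8, max_path_length=200)
        agent.add_to_replay_buffer(paths, add_sl_noise=True)
        for _ in range(train_steps_per_iter):
            ob_no, ac_na, re_n, next_ob_no, terminal_n = agent.sample(batch_size)
            log = agent.train(ob_no, ac_na, re_n, next_ob_no, terminal_n)
    return agent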