def __init__(self, env, agent_params, **kwargs):
    super(ACAgent, self).__init__()

    self.env = env
    self.agent_params = agent_params
    self.gamma = self.agent_params['gamma']
    self.standardize_advantages = self.agent_params['standardize_advantages']

    # actor/policy
    if self.agent_params['discrete']:
        self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'],
                                       self.agent_params['ob_dim'],
                                       self.agent_params['n_layers'],
                                       self.agent_params['size'])
    else:
        self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'],
                                         self.agent_params['ob_dim'],
                                         self.agent_params['n_layers'],
                                         self.agent_params['size'])
    self.policy_optimizer = tf.keras.optimizers.Adam(
        learning_rate=self.agent_params['learning_rate'])

    self.critic = BootstrappedContinuousCritic(self.agent_params)
    self.critic_loss = tf.keras.losses.MeanSquaredError()
    self.critic_optimizer = tf.keras.optimizers.Adam(
        learning_rate=self.agent_params['learning_rate'])
    self.critic.nn_critic.compile(optimizer=self.critic_optimizer,
                                  loss=self.critic_loss)

    self.replay_buffer = ReplayBuffer()
def __init__(self, env, agent_params):
    super(TRPOAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params
    self.gamma = self.agent_params['gamma']
    self.use_gae = self.agent_params['use_gae']
    self.lam = self.agent_params['gae_lam']
    self.standardize_advantages = self.agent_params['standardize_advantages']

    # actor/policy
    self.actor = TRPOPolicy(
        self.agent_params['ac_dim'],
        self.agent_params['ob_dim'],
        self.agent_params['n_layers'],
        self.agent_params['size'],
        self.agent_params['cg_steps'],
        self.agent_params['damping'],
        self.agent_params['max_backtracks'],
        self.agent_params['max_kl_increment'],
        discrete=self.agent_params['discrete'],
        learning_rate=self.agent_params['learning_rate'],
    )
    self.critic = TRPOCritic(self.agent_params)

    # replay buffer
    self.replay_buffer = ReplayBuffer(1000000)
def __init__(self, env, agent_params):
    super(BCAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params

    # actor/policy
    self.actor = MLPPolicySL(
        self.agent_params['ac_dim'],
        self.agent_params['ob_dim'],
        self.agent_params['n_layers'],
        self.agent_params['size'],
        discrete=self.agent_params['discrete'],
        learning_rate=self.agent_params['learning_rate'],
        siren=self.agent_params['siren'],
        train_separate_offset=self.agent_params['train_separate_params'],
        supervision_mode=self.agent_params['supervision_mode'],
        offset_learning_rate=self.agent_params['offset_learning_rate'],
        auto_cast=self.agent_params['auto_cast'],
        gradient_loss_scale=self.agent_params['gradient_loss_scale'],
        additional_activation=self.agent_params['additional_activation'],
        omega=self.agent_params['omega'])

    # replay buffer
    self.replay_buffer = ReplayBuffer(
        self.agent_params['max_replay_buffer_size'],
        epsilon_s=self.agent_params['epsilon_s'])
def __init__(self, env, agent_params):
    super(PGAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params
    self.gamma = self.agent_params['gamma']
    self.standardize_advantages = self.agent_params['standardize_advantages']
    self.nn_baseline = self.agent_params['nn_baseline']
    self.reward_to_go = self.agent_params['reward_to_go']
    self.gae = self.agent_params['gae']
    self.lamb = self.agent_params['lambda']

    # actor/policy
    self.actor = MLPPolicyPG(
        self.agent_params['ac_dim'],
        self.agent_params['ob_dim'],
        self.agent_params['n_layers'],
        self.agent_params['size'],
        discrete=self.agent_params['discrete'],
        learning_rate=self.agent_params['learning_rate'],
        nn_baseline=self.agent_params['nn_baseline'])

    # replay buffer
    self.replay_buffer = ReplayBuffer(1000000)
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # TODO Implement the following pseudocode: DONE
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_loss = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_loss = self.actor.update(ob_no, ac_na, advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = critic_loss
        loss['Actor_Loss'] = actor_loss
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # TODO Implement the following pseudocode: DONE
        # 1) query the critic with ob_no, to get V(s)
        # 2) query the critic with next_ob_no, to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s')
        #    HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1)
        # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s)
        V_s = self.critic.forward_np(ob_no)
        V_s_prime = self.critic.forward_np(next_ob_no)
        Q_s_a = re_n + self.gamma * V_s_prime * (1 - terminal_n)
        adv_n = Q_s_a - V_s

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
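# A self-contained numpy illustration (toy numbers, not taken from the repo) of the
# bootstrapped advantage computed in estimate_advantage above:
# Q(s, a) = r + gamma * V(s') * (1 - done), A = Q - V(s), optionally standardized.
import numpy as np

gamma = 0.95
re_n = np.array([1.0, 0.5, -1.0])
V_s = np.array([2.0, 1.5, 0.5])          # hypothetical critic values for ob_no
V_s_prime = np.array([1.8, 1.0, 3.0])    # hypothetical critic values for next_ob_no
terminal_n = np.array([0.0, 0.0, 1.0])   # V(s') is cut off on terminal transitions

Q_s_a = re_n + gamma * V_s_prime * (1 - terminal_n)
adv_n = Q_s_a - V_s
adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)  # standardize_advantages
print(adv_n)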
def __init__(self, env, agent_params):
    super(ACAgent, self).__init__()

    self.env = env
    self.agent_params = agent_params
    self.num_critic_updates_per_agent_update = agent_params['num_critic_updates_per_agent_update']
    self.num_actor_updates_per_agent_update = agent_params['num_actor_updates_per_agent_update']
    self.device = agent_params['device']
    self.gamma = self.agent_params['gamma']
    self.standardize_advantages = self.agent_params['standardize_advantages']

    self.actor = MLPPolicyAC(
        self.agent_params['ac_dim'],
        self.agent_params['ob_dim'],
        self.agent_params['n_layers'],
        self.agent_params['size'],
        self.agent_params['device'],
        discrete=self.agent_params['discrete'],
        learning_rate=self.agent_params['learning_rate'],
    )

    # introduced in actor-critic to improve the advantage estimate
    self.critic = BootstrappedContinuousCritic(self.agent_params)

    self.replay_buffer = ReplayBuffer()
def __init__(self, env, agent_params):
    super(MBAgent, self).__init__()

    self.env = env.unwrapped
    self.agent_params = agent_params
    self.ensemble_size = self.agent_params['ensemble_size']

    self.dyn_models = []
    for i in range(self.ensemble_size):
        model = FFModel(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['learning_rate'],
        )
        self.dyn_models.append(model)

    self.actor = MPCPolicy(
        self.env,
        ac_dim=self.agent_params['ac_dim'],
        dyn_models=self.dyn_models,
        horizon=self.agent_params['mpc_horizon'],
        N=self.agent_params['mpc_num_action_sequences'],
    )

    self.replay_buffer = ReplayBuffer()
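# A self-contained sketch of the random-shooting MPC idea that `mpc_horizon` and
# `mpc_num_action_sequences` parameterize. This is a generic illustration under assumed
# toy dynamics/reward functions, not the repo's MPCPolicy: sample N random action
# sequences, roll each out through a learned model, score by summed reward, and execute
# the first action of the best sequence.
import numpy as np

def random_shooting_mpc(obs, dynamics_fn, reward_fn, ac_dim, horizon, num_sequences, rng):
    """Pick an action by scoring random candidate action sequences under a model."""
    best_return, best_first_action = -np.inf, None
    for _ in range(num_sequences):
        actions = rng.uniform(-1.0, 1.0, size=(horizon, ac_dim))  # random candidate sequence
        total_reward, ob = 0.0, obs
        for ac in actions:
            ob = dynamics_fn(ob, ac)           # model-predicted next observation
            total_reward += reward_fn(ob, ac)  # model-based return estimate
        if total_reward > best_return:
            best_return, best_first_action = total_reward, actions[0]
    return best_first_action

# toy dynamics/reward so the sketch runs on its own (both are assumptions for illustration)
rng = np.random.default_rng(0)
dynamics_fn = lambda ob, ac: ob + 0.1 * ac       # hypothetical linear model
reward_fn = lambda ob, ac: -np.sum(ob ** 2)      # reward for driving the state to the origin
print(random_shooting_mpc(np.ones(2), dynamics_fn, reward_fn,
                          ac_dim=2, horizon=5, num_sequences=100, rng=rng))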
class BCAgent(BaseAgent):
    def __init__(self, sess, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.sess = sess
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(sess,
                                 self.agent_params['ac_dim'],
                                 self.agent_params['ob_dim'],
                                 self.agent_params['n_layers'],
                                 self.agent_params['size'],
                                 discrete=self.agent_params['discrete'],
                                 learning_rate=self.agent_params['learning_rate'],
                                 )  ## TODO: look in here and implement this --> FINISHED

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # training a BC agent refers to updating its actor using
        # the given observations and corresponding action labels
        self.actor.update(ob_no, ac_na)  ## TODO: look in here and implement this --> FINISHED

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)  ## TODO: look in here and implement this --> FINISHED
def __init__(self, sess, env, agent_params):
    super(PGAgent, self).__init__()

    # init vars
    self.env = env
    self.sess = sess
    self.agent_params = agent_params
    self.gamma = self.agent_params['gamma']
    self.standardize_advantages = self.agent_params['standardize_advantages']
    self.nn_baseline = self.agent_params['nn_baseline']
    self.reward_to_go = self.agent_params['reward_to_go']

    # actor/policy
    # NOTICE that we are using MLPPolicyPG (hw2), instead of MLPPolicySL (hw1),
    # which indicates similar network structure (layout/inputs/outputs),
    # but differences in training procedure
    # between supervised learning and policy gradients
    self.actor = MLPPolicyPG(
        sess,
        self.agent_params['ac_dim'],
        self.agent_params['ob_dim'],
        self.agent_params['n_layers'],
        self.agent_params['size'],
        discrete=self.agent_params['discrete'],
        learning_rate=self.agent_params['learning_rate'],
        nn_baseline=self.agent_params['nn_baseline'])

    # replay buffer
    self.replay_buffer = ReplayBuffer(1000000)
def __init__(self, env, agent_params):
    super(PGAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params
    self.gamma = self.agent_params["gamma"]
    self.standardize_advantages = self.agent_params["standardize_advantages"]
    self.nn_baseline = self.agent_params["nn_baseline"]
    self.reward_to_go = self.agent_params["reward_to_go"]

    # actor/policy
    self.actor = MLPPolicyPG(
        self.agent_params["ac_dim"],
        self.agent_params["ob_dim"],
        self.agent_params["n_layers"],
        self.agent_params["size"],
        discrete=self.agent_params["discrete"],
        learning_rate=self.agent_params["learning_rate"],
        nn_baseline=self.agent_params["nn_baseline"],
    )

    # replay buffer
    self.replay_buffer = ReplayBuffer(1000000)
class BCAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params["ac_dim"],
            self.agent_params["ob_dim"],
            self.agent_params["n_layers"],
            self.agent_params["size"],
            discrete=self.agent_params["discrete"],
            learning_rate=self.agent_params["learning_rate"],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.agent_params["max_replay_buffer_size"])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # training a BC agent refers to updating its actor using
        # the given observations and corresponding action labels
        log = self.actor.update(ob_no, ac_na)
        return log

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)

    def save(self, path):
        return self.actor.save(path)
def __init__(self, env, agent_params):
    super().__init__()

    self.env = env
    self.agent_params = agent_params
    self.gamma = self.agent_params['gamma']
    self.standardize_advantages = self.agent_params['standardize_advantages']
    self.n_drivers = self.agent_params['n_drivers']

    self.actor = MLPPolicyAC(
        self.agent_params['ac_dim'],
        self.agent_params['ob_dim'],
        self.agent_params['n_layers'],
        self.agent_params['size_ac'],
        self.agent_params['shared_exp'],
        self.agent_params['shared_exp_lambda'],
        self.agent_params['is_city'],
        self.agent_params['learning_rate'],
        self.agent_params['n_drivers']
    )
    self.critic = BootstrappedContinuousCritic(self.agent_params)

    self.replay_buffer = ReplayBuffer()
def __init__(self, env, agent_params):
    super(PPOAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params
    self.gamma = self.agent_params['gamma']
    self.use_gae = self.agent_params['use_gae']
    self.lam = self.agent_params['gae_lam']
    self.standardize_advantages = self.agent_params['standardize_advantages']
    self.ppo_epochs = self.agent_params['ppo_epochs']
    self.ppo_min_batch_size = self.agent_params['ppo_min_batch_size']

    # actor/policy
    self.actor = PPOPolicy(
        self.agent_params['ac_dim'],
        self.agent_params['ob_dim'],
        self.agent_params['n_layers'],
        self.agent_params['size'],
        self.agent_params['clip_eps'],
        self.agent_params['ent_coeff'],
        self.agent_params['max_grad_norm'],
        discrete=self.agent_params['discrete'],
        learning_rate=self.agent_params['learning_rate_policyfn'],
    )
    self.critic = PPOCritic(self.agent_params)

    # replay buffer
    self.replay_buffer = ReplayBuffer(1000000)
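# A self-contained numpy sketch (an assumption about what a PPO policy update does
# internally, not code from this repo) of the clipped surrogate objective that
# `clip_eps` and `ent_coeff` parameterize.
import numpy as np

def ppo_clipped_loss(logp_new, logp_old, advantages, entropy, clip_eps=0.2, ent_coeff=0.01):
    """Clipped surrogate: maximize mean(min(r*A, clip(r, 1-eps, 1+eps)*A)) plus an entropy bonus."""
    ratio = np.exp(logp_new - logp_old)                                   # pi_new(a|s) / pi_old(a|s)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    surrogate = np.minimum(unclipped, clipped).mean()
    return -(surrogate + ent_coeff * entropy.mean())                      # negated so it reads as a loss

# toy numbers just to show the call
logp_old = np.array([-1.0, -0.5, -2.0])
logp_new = np.array([-0.8, -0.7, -1.5])
adv = np.array([1.0, -0.5, 2.0])
ent = np.array([0.9, 0.8, 1.1])
print(ppo_clipped_loss(logp_new, logp_old, adv, ent))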
def __init__(self, env, agent_params):
    super(BCAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params

    # actor/policy
    if self.agent_params['discrete']:
        self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'],
                                       self.agent_params['ob_dim'],
                                       self.agent_params['n_layers'],
                                       self.agent_params['size'])
    else:
        self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'],
                                         self.agent_params['ob_dim'],
                                         self.agent_params['n_layers'],
                                         self.agent_params['size'])
    self.loss = tf.keras.losses.MeanSquaredError()
    self.optimizer = tf.keras.optimizers.Adam(
        learning_rate=self.agent_params['learning_rate'])
    self.actor.compile(optimizer=self.optimizer, loss=self.loss)

    # replay buffer
    self.replay_buffer = ReplayBuffer(
        self.agent_params['max_replay_buffer_size'])
class BCAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        if self.agent_params['discrete']:
            self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'],
                                           self.agent_params['ob_dim'],
                                           self.agent_params['n_layers'],
                                           self.agent_params['size'])
        else:
            self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'],
                                             self.agent_params['ob_dim'],
                                             self.agent_params['n_layers'],
                                             self.agent_params['size'])
        self.loss = tf.keras.losses.MeanSquaredError()
        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.agent_params['learning_rate'])
        self.actor.compile(optimizer=self.optimizer, loss=self.loss)

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'])

    def train_multi_iter(self, batch_size, num_iters):
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(self.replay_buffer.obs, tf.float32),
             tf.cast(self.replay_buffer.acs, tf.float32)))
        dataset = dataset.shuffle(self.replay_buffer.obs.shape[0])
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True).repeat()
        self.actor.fit(dataset, epochs=1, steps_per_epoch=num_iters)

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # training a BC agent refers to updating its actor using
        # the given observations and corresponding action labels
        with tf.GradientTape() as tape:
            pred_actions = self.actor(ob_no)
            loss_value = self.loss(ac_na, pred_actions)
        trainable_vars = self.actor.trainable_variables
        grads = tape.gradient(loss_value, trainable_vars)
        self.optimizer.apply_gradients(zip(grads, trainable_vars))
        return loss_value

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)  ## TODO: look in here and implement this
class ACAgent:
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params
        self.num_critic_updates_per_agent_update = agent_params['num_critic_updates_per_agent_update']
        self.num_actor_updates_per_agent_update = agent_params['num_actor_updates_per_agent_update']
        self.device = agent_params['device']
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(self.agent_params['ob_dim'],
                                 self.agent_params['ac_dim'],
                                 self.agent_params['n_layers'],
                                 self.agent_params['size'],
                                 self.agent_params['device'],
                                 discrete=self.agent_params['discrete'],
                                 learning_rate=self.agent_params['learning_rate'],
                                 )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer(agent_params['replay_size'])

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        ob, next_ob, rew, done = map(
            lambda x: torch.from_numpy(x).to(self.device),
            [ob_no, next_ob_no, re_n, terminal_n])
        value = self.critic.value_func(ob).squeeze()
        next_value = self.critic.value_func(next_ob).squeeze() * (1 - done)
        adv_n = rew + (self.gamma * next_value) - value
        adv_n = adv_n.cpu().detach().numpy()

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        loss = OrderedDict()
        for critic_update in range(self.num_critic_updates_per_agent_update):
            loss['Critic_Loss'] = self.critic.update(ob_no, next_ob_no, re_n, terminal_n)  # put final critic loss here
        adv_n = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        for actor_update in range(self.num_actor_updates_per_agent_update):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv_n)  # put final actor loss here
        return loss

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
def __init__(self, env, agent_params):
    # init vars
    self.env = env
    self.agent_params = agent_params

    # actor/policy
    self.actor = MLPPolicySL(self.agent_params['ac_dim'],
                             self.agent_params['ob_dim'],
                             self.agent_params['n_layers'],
                             self.agent_params['size'],
                             self.agent_params['device'],
                             discrete=self.agent_params['discrete'],
                             learning_rate=self.agent_params['learning_rate'],
                             )  ## TODO: look in here and implement this

    # replay buffer
    self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])
class BCAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
            siren=self.agent_params['siren'],
            train_separate_offset=self.agent_params['train_separate_params'],
            supervision_mode=self.agent_params['supervision_mode'],
            offset_learning_rate=self.agent_params['offset_learning_rate'],
            auto_cast=self.agent_params['auto_cast'],
            gradient_loss_scale=self.agent_params['gradient_loss_scale'],
            additional_activation=self.agent_params['additional_activation'],
            omega=self.agent_params['omega'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'],
            epsilon_s=self.agent_params['epsilon_s'])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n, gradients):
        # training a BC agent refers to updating its actor using
        # the given observations and corresponding action labels
        log = self.actor.update(ob_no, ac_na, gradients=gradients)  # HW1: you will modify this
        return log

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)  # HW1: you will modify this

    def save(self, path):
        return self.actor.save(path)
def __init__(self, env, agent_params):
    super(BCAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params

    # actor/policy
    self.actor = MLPPolicySL(
        self.agent_params["ac_dim"],
        self.agent_params["ob_dim"],
        self.agent_params["n_layers"],
        self.agent_params["size"],
        discrete=self.agent_params["discrete"],
        learning_rate=self.agent_params["learning_rate"],
    )

    # replay buffer
    self.replay_buffer = ReplayBuffer(self.agent_params["max_replay_buffer_size"])
class BCAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'])

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """
        Update the actor policy by supervised learning, given observations and action labels.
        - ob_no: observations
        - ac_na: action labels
        - re_n: rewards (unused for behavior cloning)
        - next_ob_no: next observations (unused for behavior cloning)
        - terminal_n: terminal flags (unused for behavior cloning)
        """
        log = self.actor.update(ob_no, ac_na)
        return log

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)  # HW1: you will modify this

    def save(self, path):
        return self.actor.save(path)
def __init__(self, env, agent_params):
    super(BCAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params

    # actor/policy
    self.actor = MLPPolicySL(
        self.agent_params['ac_dim'],
        self.agent_params['ob_dim'],
        self.agent_params['n_layers'],
        self.agent_params['size'],
        discrete=self.agent_params['discrete'],
        learning_rate=self.agent_params['learning_rate'],
    )

    # replay buffer
    self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])
def __init__(self, env, agent_params, batch_size=500000, **kwargs):
    super(PGAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params
    self.gamma = self.agent_params['gamma']
    self.standardize_advantages = self.agent_params['standardize_advantages']
    self.nn_baseline = self.agent_params['nn_baseline']
    self.reward_to_go = self.agent_params['reward_to_go']

    # actor/policy
    if self.agent_params['discrete']:
        self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'],
                                       self.agent_params['ob_dim'],
                                       self.agent_params['n_layers'],
                                       self.agent_params['size'])
    else:
        self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'],
                                         self.agent_params['ob_dim'],
                                         self.agent_params['n_layers'],
                                         self.agent_params['size'])
    self.policy_optimizer = tf.keras.optimizers.Adam(
        learning_rate=self.agent_params['learning_rate'])

    # replay buffer
    self.replay_buffer = ReplayBuffer(2 * batch_size)

    self.baseline_model = None
    if self.agent_params['nn_baseline']:
        self.baseline_model = build_mlp(
            (self.agent_params['ob_dim'], ),
            output_size=1,
            n_layers=self.agent_params['n_layers'],
            size=self.agent_params['size'],
            name='baseline_model')
        self.baseline_loss = tf.keras.losses.MeanSquaredError()
        self.baseline_optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.agent_params['learning_rate'])
        self.baseline_model.compile(optimizer=self.baseline_optimizer,
                                    loss=self.baseline_loss)
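# A self-contained numpy sketch (an assumption about how the nn_baseline is typically
# used, mirroring the hint in the PG advantage code further below): the baseline network
# is trained on normalized Q-value targets, so its predictions are rescaled by the batch
# Q statistics before being subtracted from the Q values.
import numpy as np

def advantages_with_baseline(q_values, baseline_pred_normalized):
    """Rescale normalized baseline predictions to Q-value units, then subtract."""
    b = baseline_pred_normalized * np.std(q_values) + np.mean(q_values)
    return q_values - b

q_values = np.array([10.0, 5.0, 1.0])
baseline_pred = np.array([0.9, 0.0, -0.8])  # hypothetical normalized outputs of the baseline MLP
print(advantages_with_baseline(q_values, baseline_pred))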
def __init__(self, env, agent_params):
    super(ACAgent, self).__init__()

    self.env = env
    self.agent_params = agent_params
    self.gamma = self.agent_params["gamma"]
    self.standardize_advantages = self.agent_params["standardize_advantages"]

    self.actor = MLPPolicyAC(
        self.agent_params["ac_dim"],
        self.agent_params["ob_dim"],
        self.agent_params["n_layers"],
        self.agent_params["size"],
        self.agent_params["discrete"],
        self.agent_params["learning_rate"],
    )
    self.critic = BootstrappedContinuousCritic(self.agent_params)

    self.replay_buffer = ReplayBuffer()
def __init__(self, env, agent_params):
    super(ACAgent, self).__init__()

    self.env = env
    self.agent_params = agent_params
    self.gamma = self.agent_params['gamma']
    self.standardize_advantages = self.agent_params['standardize_advantages']

    self.actor = MLPPolicyAC(
        self.agent_params['ac_dim'],
        self.agent_params['ob_dim'],
        self.agent_params['n_layers'],
        self.agent_params['size'],
        self.agent_params['discrete'],
        self.agent_params['learning_rate'],
    )
    self.critic = BootstrappedContinuousCritic(self.agent_params)

    self.replay_buffer = ReplayBuffer()
def __init__(self, sess, env, agent_params):
    super(BCAgent, self).__init__()

    # init vars
    self.env = env
    self.sess = sess
    self.agent_params = agent_params

    # actor/policy
    self.actor = MLPPolicySL(sess,
                             self.agent_params['ac_dim'],
                             self.agent_params['ob_dim'],
                             self.agent_params['n_layers'],
                             self.agent_params['size'],
                             discrete=self.agent_params['discrete'],
                             learning_rate=self.agent_params['learning_rate'],
                             )  ## TODO: look in here and implement this --> FINISHED

    # replay buffer
    self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])
class BCAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])

    def train(self, ob_no, ac_na, re_n=None, next_ob_no=None, terminal_n=None):
        '''
        self.actor.update(
            self, observations, actions,
            adv_n=None, acs_labels_na=None, qvals=None
        )
        '''
        # training a BC agent refers to updating its actor using
        # the given observations and the corresponding (expert) action labels
        log = self.actor.update(ob_no, ac_na)  # HW1: you will modify this
        return log

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_random_data(batch_size)  # HW1: you will modify this

    def save(self, path):
        return self.actor.save(path)
def __init__(self, env, agent_params):
    super(BCAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params
    self.device = torch.device("cpu")  # added by fangda @ 2020/9/20

    # actor/policy
    self.actor = MLPPolicySL(
        self.agent_params['ac_dim'],
        self.agent_params['ob_dim'],
        self.agent_params['n_layers'],
        self.agent_params['size'],
        self.device,  # added by Fangda 2020/9/20
        discrete=self.agent_params['discrete'],
        learning_rate=self.agent_params['learning_rate'],
    )  ## TODO: look in here and implement this

    # replay buffer
    self.replay_buffer = ReplayBuffer(
        self.agent_params['max_replay_buffer_size'])
def __init__(self, env, agent_params):
    super(ACAgent, self).__init__()

    self.env = env
    self.agent_params = agent_params
    self.gamma = self.agent_params['gamma']
    self.standardize_advantages = self.agent_params['standardize_advantages']
    self.gae = self.agent_params['gae']
    self.gae_lambda = self.agent_params['gae_lambda']
    self.ppo = self.agent_params['ppo']

    self.actor = MLPPolicyAC(
        self.agent_params['ac_dim'],
        self.agent_params['ob_dim'],
        self.agent_params['n_layers'],
        self.agent_params['size'],
        self.agent_params['discrete'],
        self.agent_params['learning_rate'],
        self.agent_params['clip_eps'],
    )
    if self.ppo:
        # keep a frozen copy of the policy to compute the PPO probability ratio against
        self.old_actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
            self.agent_params['clip_eps'],
        )
        self.old_actor.load_state_dict(self.actor.state_dict())

    self.critic = BootstrappedContinuousCritic(self.agent_params)
    self.replay_buffer = ReplayBuffer()
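# A self-contained numpy sketch (an assumption, not this repo's implementation) of the
# generalized advantage estimation that the gae / gae_lambda (and use_gae / gae_lam)
# flags above refer to: a lambda-weighted, backward-accumulated sum of one-step TD errors.
import numpy as np

def gae_advantages(rewards, values, next_values, terminals, gamma=0.99, lam=0.95):
    """delta_t = r_t + gamma*V(s_{t+1})*(1-done_t) - V(s_t);
    A_t = delta_t + gamma*lam*(1-done_t)*A_{t+1}, accumulated backwards over the batch."""
    advantages = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * next_values[t] * (1 - terminals[t]) - values[t]
        running = delta + gamma * lam * (1 - terminals[t]) * running
        advantages[t] = running
    return advantages

# toy batch of 4 transitions ending in a terminal state
rewards = np.array([1.0, 1.0, 1.0, 1.0])
values = np.array([2.0, 1.8, 1.5, 1.0])
next_values = np.array([1.8, 1.5, 1.0, 0.0])
terminals = np.array([0.0, 0.0, 0.0, 1.0])
print(gae_advantages(rewards, values, next_values, terminals))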
def __init__(self, env, agent_params):
    super(BCAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params

    # actor/policy
    self.actor = MLPPolicy(self.agent_params['ac_dim'],
                           self.agent_params['ob_dim'],
                           self.agent_params['n_layers'],
                           self.agent_params['size'],
                           discrete=self.agent_params['discrete']
                           )  # TODO: look in here and implement this
    self.loss = tf.keras.losses.MeanSquaredError()
    self.optimizer = tf.keras.optimizers.Adam(
        learning_rate=self.agent_params['learning_rate'])
    self.actor.compile(optimizer=self.optimizer, loss=self.loss)

    # replay buffer
    self.replay_buffer = ReplayBuffer(
        self.agent_params['max_replay_buffer_size'])
class PGAgent(BaseAgent):
    def __init__(self, sess, env, agent_params):
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.sess = sess
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']
        self.nn_baseline = self.agent_params['nn_baseline']
        self.reward_to_go = self.agent_params['reward_to_go']

        # actor/policy
        # NOTICE that we are using MLPPolicyPG (hw2), instead of MLPPolicySL (hw1),
        # which indicates similar network structure (layout/inputs/outputs),
        # but differences in training procedure
        # between supervised learning and policy gradients
        self.actor = MLPPolicyPG(
            sess,
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
            nn_baseline=self.agent_params['nn_baseline'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)

    def train(self, obs, acs, rews_list, next_obs, terminals):
        """
        Training a PG agent refers to updating its actor using the given
        observations/actions and the calculated qvals/advantages that come
        from the seen rewards.

        ----------------------------------------------------------------------

        Recall that the expression for the policy gradient PG is

            PG = E_{tau} [sum_{t=0}^{T-1} grad log pi(a_t|s_t) * (Q_t - b_t)]

        where
            tau = (s_0, a_0, s_1, a_1, s_2, a_2, ...) is a trajectory,
            Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
            b_t is a baseline which may depend on s_t,
            and (Q_t - b_t) is the advantage.

        Thus, the PG update performed by the actor needs (s_t, a_t, q_t, adv_t),
        and that is exactly what this function provides.

        ----------------------------------------------------------------------
        """
        # step 1: calculate q values of each (s_t, a_t) point,
        # using rewards from that full rollout of length T: (r_0, ..., r_t, ..., r_{T-1})
        q_values = self.calculate_q_vals(rews_list)

        # step 2: calculate advantages that correspond to each (s_t, a_t) point
        advantage_values = self.estimate_advantage(obs, q_values)

        # step 3: pass the calculated values above into the actor/policy's update,
        # which will perform the actual PG update step
        loss = self.actor.update(obs, acs, qvals=q_values, adv_n=advantage_values)
        return loss

    def calculate_q_vals(self, rews_list):
        """
        Monte Carlo estimation of the Q function.

        arguments:
            rews_list: length: number of sampled rollouts
                Each element corresponds to a particular rollout, and contains
                an array of the rewards for every step of that particular rollout

        returns:
            q_values: shape: (sum/total number of steps across the rollouts)
                Each entry corresponds to the estimated q(s_t, a_t) value
                of the corresponding obs/ac point at time t.
        """
        # Case 1: trajectory-based PG
        if not self.reward_to_go:
            # Estimate the Q value Q^{pi}(s_t, a_t) using rewards from the entire trajectory:
            # value of each point (t) = total discounted reward summed over the entire trajectory (from 0 to T-1)
            # In other words, q(s_t, a_t) = sum_{t'=0}^{T-1} gamma^t' r_{t'}
            # (see the helper functions at the bottom of this file)
            q_values = np.concatenate([self._discounted_return(r) for r in rews_list])

        # Case 2: reward-to-go PG
        else:
            # Estimate the Q value Q^{pi}(s_t, a_t) as the reward-to-go:
            # value of each point (t) = total discounted reward summed over the remainder of that trajectory (from t to T-1)
            # In other words, q(s_t, a_t) = sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
            # (see the helper functions at the bottom of this file)
            q_values = np.concatenate([self._discounted_cumsum(r) for r in rews_list])

        return q_values

    def estimate_advantage(self, obs, q_values):
        """
        Computes advantages by (possibly) subtracting a baseline from the estimated Q values
        """
        # Estimate the advantage when nn_baseline is True:
        # pass obs into the neural network that is learning the baseline
        # (the actor's run_baseline_prediction), rescale its normalized output
        # to the Q-value statistics, and set advantage = [Q - b]
        if self.nn_baseline:
            b_n_unnormalized = self.actor.run_baseline_prediction(obs)
            b_n = b_n_unnormalized * np.std(q_values) + np.mean(q_values)
            adv_n = q_values - b_n

        # Else, just set the advantage to [Q]
        else:
            adv_n = q_values.copy()

        # Normalize the resulting advantages
        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)

        return adv_n

    #####################################################
    #####################################################

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False)

    #####################################################
    ################## HELPER FUNCTIONS #################
    #####################################################

    def _discounted_return(self, rewards):
        """
        Helper function

        Input: a list of rewards {r_0, r_1, ..., r_t', ..., r_{T-1}}
            from a single rollout of length T

        Output: a list of length T where each index t contains sum_{t'=0}^{T-1} gamma^t' r_{t'}
            Note that all entries of this output are equivalent
            because each index t is a sum from 0 to T-1 (and doesn't involve t).
        """
        # 1) create a list of indices (t'): from 0 to T-1
        indices = np.arange(len(rewards))

        # 2) create a list where the entry at each index (t') is gamma^(t')
        discounts = self.gamma ** indices

        # 3) create a list where the entry at each index (t') is gamma^(t') * r_{t'}
        discounted_rewards = discounts * np.asarray(rewards)

        # 4) calculate a scalar: sum_{t'=0}^{T-1} gamma^(t') * r_{t'}
        sum_of_discounted_rewards = np.sum(discounted_rewards)

        # 5) create a list of length T, where each entry t contains that scalar
        list_of_discounted_returns = [sum_of_discounted_rewards] * len(rewards)

        return list_of_discounted_returns

    def _discounted_cumsum(self, rewards):
        """
        Input: a list of length T of rewards {r_0, r_1, ..., r_t', ..., r_{T-1}}
            from a single rollout of length T

        Output: a list of length T where the entry in each index t is
            sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
        """
        all_discounted_cumsums = []

        # for loop over steps (t) of the given rollout
        for start_time_index in range(len(rewards)):
            # 1) create a list of indices (t'): goes from t to T-1
            indices = np.arange(start_time_index, len(rewards))

            # 2) create a list where the entry at each index (t') is gamma^(t'-t)
            discounts = self.gamma ** (indices - start_time_index)

            # 3) create a list where the entry at each index (t') is gamma^(t'-t) * r_{t'}
            #    (t' goes from t to T-1, so use the rewards from those indices)
            discounted_rtg = discounts * np.asarray(rewards)[indices]

            # 4) calculate a scalar: sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
            sum_discounted_rtg = np.sum(discounted_rtg)

            # append each of these calculated sums into the list to return
            all_discounted_cumsums.append(sum_discounted_rtg)

        list_of_discounted_cumsums = np.array(all_discounted_cumsums)
        return list_of_discounted_cumsums
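# A self-contained numpy check (illustration only, with made-up rewards) of the two
# Monte Carlo estimators above: the full-trajectory discounted return repeats one scalar
# for every timestep, while the reward-to-go (discounted cumsum) shrinks as t grows.
import numpy as np

gamma = 0.9
rewards = np.array([1.0, 2.0, 3.0])

# full-trajectory discounted return: sum_{t'=0}^{T-1} gamma^t' r_{t'}, repeated T times
full_return = np.sum(gamma ** np.arange(len(rewards)) * rewards)
print([full_return] * len(rewards))   # [5.23, 5.23, 5.23]

# reward-to-go: sum_{t'=t}^{T-1} gamma^(t'-t) r_{t'} for each t
rtg = [np.sum(gamma ** np.arange(len(rewards) - t) * rewards[t:]) for t in range(len(rewards))]
print(rtg)                            # [5.23, 4.7, 3.0]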