def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params
        self.num_critic_updates_per_agent_update = agent_params[
            'num_critic_updates_per_agent_update']
        self.num_actor_updates_per_agent_update = agent_params[
            'num_actor_updates_per_agent_update']
        self.device = agent_params['device']

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params[
            'standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['device'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )
        # the critic is the actor-critic addition: it bootstraps value estimates used to form the advantage.
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()
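
For reference, here is a minimal sketch of the agent_params dictionary this constructor reads. The keys are taken from the lookups above; the values are illustrative placeholders, not the original defaults, and BootstrappedContinuousCritic may read extra keys that are not shown here.

# Hypothetical agent_params for the ACAgent above (placeholder values).
ac_agent_params = {
    'num_critic_updates_per_agent_update': 1,
    'num_actor_updates_per_agent_update': 1,
    'device': 'cpu',
    'gamma': 0.99,
    'standardize_advantages': True,
    'ac_dim': 2,           # action dimension
    'ob_dim': 4,           # observation dimension
    'n_layers': 2,         # hidden layers in the policy MLP
    'size': 64,            # hidden layer width
    'discrete': True,
    'learning_rate': 5e-3,
}
# agent = ACAgent(env, ac_agent_params)
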
Example #2
    def __init__(self, env, agent_params):
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params[
            'standardize_advantages']
        self.nn_baseline = self.agent_params['nn_baseline']
        self.reward_to_go = self.agent_params['reward_to_go']
        self.gae = self.agent_params['gae']
        self.lamb = self.agent_params['lambda']

        # actor/policy
        self.actor = MLPPolicyPG(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
            nn_baseline=self.agent_params['nn_baseline'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)
Example #3
    def __init__(self, env, agent_params):
        super(TRPOAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.use_gae = self.agent_params['use_gae']
        self.lam = self.agent_params['gae_lam']
        self.standardize_advantages = self.agent_params[
            'standardize_advantages']

        # actor/policy
        self.actor = TRPOPolicy(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['cg_steps'],
            self.agent_params['damping'],
            self.agent_params['max_backtracks'],
            self.agent_params['max_kl_increment'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )

        self.critic = TRPOCritic(self.agent_params)

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)
Example #4
    def __init__(self, env, agent_params):
        super(PPOAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.use_gae = self.agent_params['use_gae']
        self.lam = self.agent_params['gae_lam']
        self.standardize_advantages = self.agent_params[
            'standardize_advantages']
        self.ppo_epochs = self.agent_params['ppo_epochs']
        self.ppo_min_batch_size = self.agent_params['ppo_min_batch_size']

        # actor/policy
        self.actor = PPOPolicy(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['clip_eps'],
            self.agent_params['ent_coeff'],
            self.agent_params['max_grad_norm'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate_policyfn'],
        )

        self.critic = PPOCritic(self.agent_params)

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)
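
The PPO example adds several algorithm-specific keys on top of the common ones shown in the earlier sketch. A hedged sketch of those extras follows; the values are placeholders, and PPOCritic may read further keys not listed here.

# Hypothetical PPO-specific entries in agent_params (placeholder values).
ppo_extra_params = {
    'use_gae': True,
    'gae_lam': 0.95,                 # GAE lambda
    'ppo_epochs': 10,                # passes over the batch per update
    'ppo_min_batch_size': 64,        # minibatch size within each epoch
    'clip_eps': 0.2,                 # clipping range for the probability ratio
    'ent_coeff': 0.01,               # entropy bonus coefficient
    'max_grad_norm': 0.5,            # gradient clipping for the policy update
    'learning_rate_policyfn': 3e-4,  # actor learning rate
}
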
Example #5
    def __init__(self, env, agent_params):
        super(MBAgent, self).__init__()

        self.env = env.unwrapped
        self.agent_params = agent_params
        self.ensemble_size = self.agent_params['ensemble_size']

        self.dyn_models = []
        for i in range(self.ensemble_size):
            model = FFModel(
                self.agent_params['ac_dim'],
                self.agent_params['ob_dim'],
                self.agent_params['n_layers'],
                self.agent_params['size'],
                self.agent_params['learning_rate'],
            )
            self.dyn_models.append(model)

        self.actor = MPCPolicy(
            self.env,
            ac_dim=self.agent_params['ac_dim'],
            dyn_models=self.dyn_models,
            horizon=self.agent_params['mpc_horizon'],
            N=self.agent_params['mpc_num_action_sequences'],
        )

        self.replay_buffer = ReplayBuffer()
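
To make the roles of `horizon` and `N` (mpc_num_action_sequences) concrete, here is a generic random-shooting MPC sketch. The dynamics and reward functions are stand-ins for illustration only, not the FFModel ensemble or MPCPolicy used in this code.

import numpy as np

def random_shooting(ob, ac_dim, horizon, N, dynamics_fn, reward_fn,
                    ac_low=-1.0, ac_high=1.0):
    # Sample N candidate action sequences, each of length `horizon`.
    seqs = np.random.uniform(ac_low, ac_high, size=(N, horizon, ac_dim))
    returns = np.zeros(N)
    for i in range(N):
        cur_ob = ob.copy()
        for t in range(horizon):
            returns[i] += reward_fn(cur_ob, seqs[i, t])
            cur_ob = dynamics_fn(cur_ob, seqs[i, t])  # predicted next state
    # MPC: execute only the first action of the highest-return sequence.
    return seqs[np.argmax(returns), 0]
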
Example #6
    def __init__(self, env, agent_params):
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params["gamma"]
        self.standardize_advantages = self.agent_params[
            "standardize_advantages"]
        self.nn_baseline = self.agent_params["nn_baseline"]
        self.reward_to_go = self.agent_params["reward_to_go"]

        # actor/policy
        self.actor = MLPPolicyPG(
            self.agent_params["ac_dim"],
            self.agent_params["ob_dim"],
            self.agent_params["n_layers"],
            self.agent_params["size"],
            discrete=self.agent_params["discrete"],
            learning_rate=self.agent_params["learning_rate"],
            nn_baseline=self.agent_params["nn_baseline"],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)
Example #7
    def __init__(self, env, agent_params):
        super().__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']
        self.n_drivers = self.agent_params['n_drivers']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size_ac'],
            self.agent_params['shared_exp'],
            self.agent_params['shared_exp_lambda'],
            self.agent_params['is_city'],
            self.agent_params['learning_rate'],
            self.agent_params['n_drivers']
        )

        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()
Example #8
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        if self.agent_params['discrete']:
            self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'],
                                           self.agent_params['ob_dim'],
                                           self.agent_params['n_layers'],
                                           self.agent_params['size'])
        else:
            self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'],
                                             self.agent_params['ob_dim'],
                                             self.agent_params['n_layers'],
                                             self.agent_params['size'])

        self.loss = tf.keras.losses.MeanSquaredError()
        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.agent_params['learning_rate'])

        self.actor.compile(optimizer=self.optimizer, loss=self.loss)

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'])
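
Because the actor is compiled with MSE and Adam, its behavior-cloning update reduces to a standard Keras regression fit. Below is a self-contained sketch of that pattern with a stand-in MLP and fake data; it is not the repo's ContinuousMLPPolicy or ReplayBuffer API.

import numpy as np
import tensorflow as tf

ob_dim, ac_dim = 4, 2
policy = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='tanh', input_shape=(ob_dim,)),
    tf.keras.layers.Dense(64, activation='tanh'),
    tf.keras.layers.Dense(ac_dim),
])
policy.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-3),
               loss=tf.keras.losses.MeanSquaredError())

# Fake expert data standing in for replay-buffer samples.
obs = np.random.randn(256, ob_dim).astype(np.float32)
acs = np.random.randn(256, ac_dim).astype(np.float32)
policy.fit(obs, acs, batch_size=32, epochs=1, verbose=0)
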
Example #9
    def __init__(self, env, agent_params, **kwargs):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        # actor/policy
        if self.agent_params['discrete']:
            self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'],
                                           self.agent_params['ob_dim'],
                                           self.agent_params['n_layers'],
                                           self.agent_params['size'])
        else:
            self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'],
                                             self.agent_params['ob_dim'],
                                             self.agent_params['n_layers'],
                                             self.agent_params['size'])
        self.policy_optimizer = tf.keras.optimizers.Adam(learning_rate=self.agent_params['learning_rate'])

        self.critic = BootstrappedContinuousCritic(self.agent_params)
        self.critic_loss = tf.keras.losses.MeanSquaredError()
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=self.agent_params['learning_rate'])
        self.critic.nn_critic.compile(optimizer=self.critic_optimizer, loss=self.critic_loss)

        self.replay_buffer = ReplayBuffer()
Example #10
    def __init__(self, sess, env, agent_params):
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.sess = sess
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params[
            'standardize_advantages']
        self.nn_baseline = self.agent_params['nn_baseline']
        self.reward_to_go = self.agent_params['reward_to_go']

        # actor/policy
        # NOTE: we use MLPPolicyPG (hw2) instead of MLPPolicySL (hw1):
        # the network structure (layout/inputs/outputs) is similar, but the
        # training procedure differs between supervised learning and
        # policy gradients.
        self.actor = MLPPolicyPG(
            sess,
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
            nn_baseline=self.agent_params['nn_baseline'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)
Example #11
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
            siren=self.agent_params['siren'],
            train_separate_offset=self.agent_params['train_separate_params'],
            supervision_mode=self.agent_params['supervision_mode'],
            offset_learning_rate=self.agent_params['offset_learning_rate'],
            auto_cast=self.agent_params['auto_cast'],
            gradient_loss_scale=self.agent_params['gradient_loss_scale'],
            additional_activation=self.agent_params['additional_activation'],
            omega=self.agent_params['omega'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'],
            epsilon_s=self.agent_params['epsilon_s'])
Example #12
    def __init__(self, env, agent_params):
        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['device'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])
Example #13
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params["ac_dim"],
            self.agent_params["ob_dim"],
            self.agent_params["n_layers"],
            self.agent_params["size"],
            discrete=self.agent_params["discrete"],
            learning_rate=self.agent_params["learning_rate"],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.agent_params["max_replay_buffer_size"])
Example #14
  def __init__(self, env, agent_params):
    super(BCAgent, self).__init__()

    # init vars
    self.env = env
    self.agent_params = agent_params

    # actor/policy
    self.actor = MLPPolicySL(
      self.agent_params['ac_dim'],
      self.agent_params['ob_dim'],
      self.agent_params['n_layers'],
      self.agent_params['size'],
      discrete=self.agent_params['discrete'],
      learning_rate=self.agent_params['learning_rate'],
    )

    # replay buffer
    self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])
Example #15
    def __init__(self, env, agent_params, batch_size=500000, **kwargs):
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params[
            'standardize_advantages']
        self.nn_baseline = self.agent_params['nn_baseline']
        self.reward_to_go = self.agent_params['reward_to_go']

        # actor/policy
        if self.agent_params['discrete']:
            self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'],
                                           self.agent_params['ob_dim'],
                                           self.agent_params['n_layers'],
                                           self.agent_params['size'])
        else:
            self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'],
                                             self.agent_params['ob_dim'],
                                             self.agent_params['n_layers'],
                                             self.agent_params['size'])
        self.policy_optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.agent_params['learning_rate'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(2 * batch_size)

        self.baseline_model = None
        if self.agent_params['nn_baseline']:
            self.baseline_model = build_mlp(
                (self.agent_params['ob_dim'], ),
                output_size=1,
                n_layers=self.agent_params['n_layers'],
                size=self.agent_params['size'],
                name='baseline_model')
            self.baseline_loss = tf.keras.losses.MeanSquaredError()
            self.baseline_optimizer = tf.keras.optimizers.Adam(
                learning_rate=self.agent_params['learning_rate'])
            self.baseline_model.compile(optimizer=self.baseline_optimizer,
                                        loss=self.baseline_loss)
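
A possible implementation of the `build_mlp` helper called above, inferred only from its call signature here (an input shape tuple, output_size, n_layers, size, name); the real helper in this codebase may use different activations or defaults.

import tensorflow as tf

def build_mlp(input_shape, output_size, n_layers, size, name=None,
              activation='tanh', output_activation=None):
    """Stack n_layers hidden Dense layers of width `size`, then a linear head."""
    inputs = tf.keras.Input(shape=input_shape)
    x = inputs
    for _ in range(n_layers):
        x = tf.keras.layers.Dense(size, activation=activation)(x)
    outputs = tf.keras.layers.Dense(output_size, activation=output_activation)(x)
    return tf.keras.Model(inputs=inputs, outputs=outputs, name=name)
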
Example #16
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params["gamma"]
        self.standardize_advantages = self.agent_params["standardize_advantages"]

        self.actor = MLPPolicyAC(
            self.agent_params["ac_dim"],
            self.agent_params["ob_dim"],
            self.agent_params["n_layers"],
            self.agent_params["size"],
            self.agent_params["discrete"],
            self.agent_params["learning_rate"],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()
Example #17
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()
Example #18
    def __init__(self, sess, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.sess = sess
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicySL(
            sess,
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])
Example #19
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        self.device = torch.device("cpu")  # added by fangda @ 2020/9/20
        # actor/policy
        self.actor = MLPPolicySL(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.device,  # added by Fangda 2020/9/20
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'])
Example #20
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params[
            'standardize_advantages']
        self.gae = self.agent_params['gae']
        self.gae_lambda = self.agent_params['gae_lambda']
        self.ppo = self.agent_params['ppo']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
            self.agent_params['clip_eps'],
        )

        if self.ppo:
            self.old_actor = MLPPolicyAC(
                self.agent_params['ac_dim'],
                self.agent_params['ob_dim'],
                self.agent_params['n_layers'],
                self.agent_params['size'],
                self.agent_params['discrete'],
                self.agent_params['learning_rate'],
                self.agent_params['clip_eps'],
            )
            self.old_actor.load_state_dict(self.actor.state_dict())

        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()
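
The frozen `old_actor` copy exists because the PPO clipped objective needs the probability ratio pi_new(a|s) / pi_old(a|s) evaluated against the policy that collected the data. Below is a generic PyTorch illustration of that loss, not the MLPPolicyAC update from this codebase.

import torch

def ppo_clip_loss(new_logp, old_logp, advantages, clip_eps=0.2):
    ratio = torch.exp(new_logp - old_logp)            # pi_new / pi_old
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    return -torch.min(ratio * advantages, clipped * advantages).mean()

# Before each round of actor updates, the snapshot is re-synced, as above:
#     self.old_actor.load_state_dict(self.actor.state_dict())
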
Example #21
    def __init__(self, env, agent_params):
        super(BCAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params

        # actor/policy
        self.actor = MLPPolicy(self.agent_params['ac_dim'],
                               self.agent_params['ob_dim'],
                               self.agent_params['n_layers'],
                               self.agent_params['size'],
                               discrete=self.agent_params['discrete']
                               )

        self.loss = tf.keras.losses.MeanSquaredError()
        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.agent_params['learning_rate'])

        self.actor.compile(optimizer=self.optimizer, loss=self.loss)

        # replay buffer
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'])
    def clear_buffer(self):
        self.replay_buffer = ReplayBuffer(
            self.agent_params['max_replay_buffer_size'])