Exemple #1
0
  def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
    """
    Allocate zero-filled float32 storage for trajectory data and reset
    the bookkeeping counters.

    `obs_dim` = shape of tensor representing an observation of the agent
    `act_dim` = shape of tensor representing an action of the agent
    `size` = maximum number of steps for the trajectory, ie. the capacity
      of every buffer allocated here
    `gamma` = discount factor
    `lam` = lambda value for temporal difference value function updating
      - for information, see
      https://amreis.github.io/ml/reinf-learn/2017/11/02/reinforcement-learning-eligibility-traces.html
    """
    float_type = np.float32

    # Observations and actions taken. The array shapes are produced by
    # core.combined_shape (presumably `size` entries of one observation /
    # action each -- confirm against core).
    obs_shape = core.combined_shape(size, obs_dim)
    self.obs_buf = np.zeros(obs_shape, dtype=float_type)
    act_shape = core.combined_shape(size, act_dim)
    self.act_buf = np.zeros(act_shape, dtype=float_type)

    # One scalar per time step: advantages, rewards, returns (ie. discounted
    # sum of rewards), value estimates and `logp`.
    # NOTE(review): `logp` is presumably the log-probability of the action
    # taken at that step -- confirm against the code that fills the buffer.
    for buf_name in ("adv_buf", "rew_buf", "ret_buf", "val_buf", "logp_buf"):
      setattr(self, buf_name, np.zeros(size, dtype=float_type))

    self.gamma = gamma
    self.lam = lam

    # ptr: index of the next time slot that is going to be filled in.
    self.ptr = 0
    # path_start_idx: index of the slot at which the most recent (or
    # present) path began, in case the buffer holds several paths.
    self.path_start_idx = 0
    # max_size: maximum capacity of the buffers.
    self.max_size = size
 def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
     """Create zero-initialised float32 buffers and reset the counters.

     Args:
         obs_dim: Passed with `size` to `core.combined_shape` to obtain the
             shape of the observation buffer.
         act_dim: Passed with `size` to `core.combined_shape` to obtain the
             shape of the action buffer.
         size: Length of every per-step buffer; also stored as `max_size`.
         gamma: Stored unchanged as `self.gamma`.
         lam: Stored unchanged as `self.lam`.
     """
     def blank(shape):
         # Every buffer shares the same dtype and starts out as zeros.
         return np.zeros(shape, dtype=np.float32)

     # Observation / action storage.
     self.obs_buf = blank(core.combined_shape(size, obs_dim))
     self.act_buf = blank(core.combined_shape(size, act_dim))

     # Flat per-step storage, one float for each of the `size` slots.
     self.adv_buf = blank(size)
     self.rew_buf = blank(size)
     self.ret_buf = blank(size)
     self.val_buf = blank(size)
     self.logp_buf = blank(size)

     self.gamma = gamma
     self.lam = lam

     # Both indices start at slot 0; capacity equals the requested size.
     self.ptr = 0
     self.path_start_idx = 0
     self.max_size = size
Exemple #3
0
 def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
     """Create zero-initialised float32 buffers, pick the device, reset counters.

     Args:
         obs_dim: Passed with `size` to `core.combined_shape` to obtain the
             shape of the observation buffer.
         act_dim: Passed with `size` to `core.combined_shape` to obtain the
             shape of the action buffer.
         size: Length of every per-step buffer; also stored as `max_size`.
         gamma: Stored unchanged as `self.gamma`.
         lam: Stored unchanged as `self.lam`.
     """
     # Observation / action storage; shapes come from core.combined_shape.
     self.obs_buf = np.zeros(core.combined_shape(size, obs_dim),
                             dtype=np.float32)
     self.act_buf = np.zeros(core.combined_shape(size, act_dim),
                             dtype=np.float32)

     # Flat per-step storage, one float for each of the `size` slots.
     self.adv_buf = np.zeros(size, dtype=np.float32)
     self.rew_buf = np.zeros(size, dtype=np.float32)
     self.ret_buf = np.zeros(size, dtype=np.float32)
     self.val_buf = np.zeros(size, dtype=np.float32)
     self.logp_buf = np.zeros(size, dtype=np.float32)

     self.gamma, self.lam = gamma, lam

     # The device is pinned to the CPU. A "cuda if available" selection used
     # to precede this line but was unconditionally overwritten by it, so it
     # never had any effect; the dead assignment has been removed and the
     # effective behaviour (always CPU) is kept.
     self.device = torch.device("cpu")

     # Both indices start at slot 0; capacity equals the requested size.
     self.ptr, self.path_start_idx, self.max_size = 0, 0, size