def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
    """
    Allocate zero-filled storage for up to `size` steps of agent experience.

    `obs_dim` = shape of tensor representing an observation of the agent
    `act_dim` = shape of tensor representing an action of the agent
    `size` = maximum number of steps for the trajectory, ie. the capacity
        every buffer is allocated with
    `gamma` = discount factor
    `lam` = lambda value for temporal difference value function updating
        - for background, see
        https://amreis.github.io/ml/reinf-learn/2017/11/02/reinforcement-learning-eligibility-traces.html
    """

    def _zeros(shape):
        # All buffers share the same zero-initialised float32 layout.
        return np.zeros(shape, dtype=np.float32)

    # Observations seen and actions taken, one entry per time step; the
    # full array shape is built by `core.combined_shape`.
    self.obs_buf = _zeros(core.combined_shape(size, obs_dim))
    self.act_buf = _zeros(core.combined_shape(size, act_dim))

    # One slot per time step for advantages, rewards, returns (ie.
    # discounted sum of rewards), value estimates and `logp`.
    # NOTE(review): `logp` is presumably the log-probability of the action
    # that was taken -- confirm against the code that fills this buffer.
    self.adv_buf = _zeros(size)
    self.rew_buf = _zeros(size)
    self.ret_buf = _zeros(size)
    self.val_buf = _zeros(size)
    self.logp_buf = _zeros(size)

    self.gamma = gamma
    self.lam = lam

    # `ptr` points at the next time slot in the buffers to be filled in.
    self.ptr = 0
    # `path_start_idx` points at the slot where the most recent (or present)
    # path began, in case the buffer holds several paths.
    self.path_start_idx = 0
    # `max_size` is the maximum capacity of the buffers.
    self.max_size = size
def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
    """Set up zero-filled float32 buffers and bookkeeping indices.

    Args:
        obs_dim: Forwarded to `core.combined_shape` together with `size`
            to build the shape of the observation buffer.
        act_dim: Forwarded to `core.combined_shape` together with `size`
            to build the shape of the action buffer.
        size: Length of each scalar buffer; also stored as `max_size`.
        gamma: Stored on the instance as `self.gamma`
            (presumably the discount factor -- confirm with the caller).
        lam: Stored on the instance as `self.lam`
            (presumably the GAE/TD lambda -- confirm with the caller).
    """
    float_kind = np.float32

    # Observation and action storage; shapes come from the project helper.
    obs_shape = core.combined_shape(size, obs_dim)
    self.obs_buf = np.zeros(obs_shape, dtype=float_kind)
    act_shape = core.combined_shape(size, act_dim)
    self.act_buf = np.zeros(act_shape, dtype=float_kind)

    # The remaining buffers all hold `size` float32 zeros; create them in
    # the same order as they are conventionally listed.
    for attr in ("adv_buf", "rew_buf", "ret_buf", "val_buf", "logp_buf"):
        setattr(self, attr, np.zeros(size, dtype=float_kind))

    self.gamma = gamma
    self.lam = lam

    # Both indices start at the beginning of the (empty) buffers.
    self.ptr = 0
    self.path_start_idx = 0
    self.max_size = size
def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
    """Set up zero-filled float32 buffers, the device and bookkeeping indices.

    Args:
        obs_dim: Forwarded to `core.combined_shape` together with `size`
            to build the shape of the observation buffer.
        act_dim: Forwarded to `core.combined_shape` together with `size`
            to build the shape of the action buffer.
        size: Length of each scalar buffer; also stored as `max_size`.
        gamma: Stored on the instance as `self.gamma`
            (presumably the discount factor -- confirm with the caller).
        lam: Stored on the instance as `self.lam`
            (presumably the GAE/TD lambda -- confirm with the caller).
    """
    # Observation and action storage; shapes come from the project helper.
    self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
    self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)

    # One float32 slot per step for each scalar quantity.
    self.adv_buf = np.zeros(size, dtype=np.float32)
    self.rew_buf = np.zeros(size, dtype=np.float32)
    self.ret_buf = np.zeros(size, dtype=np.float32)
    self.val_buf = np.zeros(size, dtype=np.float32)
    self.logp_buf = np.zeros(size, dtype=np.float32)

    self.gamma, self.lam = gamma, lam

    # The device is pinned to CPU. A CUDA-if-available selection used to
    # precede this assignment but was overwritten unconditionally, so it
    # never took effect; only the effective value is kept.
    # NOTE(review): if GPU placement is actually wanted, replace this with
    # torch.device("cuda" if torch.cuda.is_available() else "cpu") and
    # verify every consumer of `self.device`.
    self.device = torch.device("cpu")

    # Both indices start at the beginning of the (empty) buffers; `max_size`
    # records the capacity the buffers were allocated with.
    self.ptr, self.path_start_idx, self.max_size = 0, 0, size