def get(self):
    """
    Call this at the end of an epoch to get all of the data from
    the buffer, with advantages appropriately normalized (shifted to have
    mean zero and std one). Also, resets the buffer's write pointers so the
    next epoch overwrites old data.

    Returns:
        dict: keys ``obs``, ``act``, ``ret``, ``adv``, ``logp`` mapped to
        ``torch.float32`` tensors with the per-agent leading dimension
        flattened into one batch dimension of length ``num_agents * n_size``.
    """
    assert self.size == self.max_size    # buffer has to be full before you can get
    # Reset pointers so next epoch overwrites buffers.
    # FIX: `np.int` was removed in NumPy 1.24 — use the builtin `int`
    # (which is exactly what `np.int` aliased).
    self.size = 0
    self.ptr = np.zeros((self.num_agents,), dtype=int)
    self.path_start_idx = np.zeros((self.num_agents,), dtype=int)

    # Concatenate agents' episodes: collapse (num_agents, n_size, ...) into
    # (num_agents * n_size, ...).
    obs_buf = self.obs_buf.reshape(core.combined_shape(self.num_agents * self.n_size, self.obs_dim))
    act_buf = self.act_buf.reshape(core.combined_shape(self.num_agents * self.n_size, self.act_dim))
    ret_buf = self.ret_buf.flatten()
    logp_buf = self.logp_buf.flatten()
    adv_buf = self.adv_buf.flatten()

    # The next two lines implement the advantage normalization trick.
    # NOTE(review): a constant advantage buffer would make adv_std zero and
    # divide by zero here — assumed not to occur in practice; confirm.
    adv_mean, adv_std = mpi_statistics_scalar(adv_buf)
    adv_buf = (adv_buf - adv_mean) / adv_std

    data = dict(obs=obs_buf, act=act_buf, ret=ret_buf, adv=adv_buf, logp=logp_buf)
    # TODO: See if we are copying below if we run into memory issues
    return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in data.items()}
def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
    """
    Allocate a fixed-size experience buffer.

    Args:
        obs_dim: shape of a single observation.
        act_dim: shape of a single action.
        size (int): maximum number of timesteps the buffer can hold.
        gamma (float): discount factor.
        lam (float): GAE-lambda parameter.
    """
    # Observation and action storage keep their per-step shapes.
    self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
    self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
    # The remaining per-timestep quantities are all flat float32 vectors.
    for scalar_buf in ("adv_buf", "rew_buf", "ret_buf", "val_buf", "logp_buf"):
        setattr(self, scalar_buf, np.zeros(size, dtype=np.float32))
    self.gamma = gamma
    self.lam = lam
    # Write cursor, start-of-current-trajectory index, and capacity.
    self.ptr = 0
    self.path_start_idx = 0
    self.max_size = size
def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95, do_masking=False, num_cands=37):
    """
    Allocate a fixed-size experience buffer, optionally with a parallel
    candidate-mask observation channel.

    Args:
        obs_dim: shape of a single observation.
        act_dim: unused here — the action buffer is always width 1
            (presumably scalar/discrete actions; kept for interface parity).
        size (int): maximum number of timesteps the buffer can hold.
        gamma (float): discount factor.
        lam (float): GAE-lambda parameter.
        do_masking (bool): if True, store observations as a
            (obs, candidate-mask) pair of arrays.
        num_cands (int): width of the candidate-mask array.
    """
    self.do_masking = do_masking
    self.possible_pivot_size = num_cands
    if self.do_masking:
        # Observations are stored as a pair: the raw observation plus a
        # per-step candidate vector of width `possible_pivot_size`.
        self.obs_buf = (
            np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32),
            np.zeros(core.combined_shape(size, self.possible_pivot_size), dtype=np.float32),
        )
    else:
        self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
    # NOTE(review): action width is hard-coded to 1 and `act_dim` is ignored
    # — looks deliberate (scalar action index), but confirm against callers.
    self.act_buf = np.zeros(core.combined_shape(size, 1), dtype=np.float32)
    for scalar_buf in ("adv_buf", "rew_buf", "ret_buf", "val_buf", "logp_buf"):
        setattr(self, scalar_buf, np.zeros(size, dtype=np.float32))
    self.gamma = gamma
    self.lam = lam
    # Write cursor, start-of-current-trajectory index, and capacity.
    self.ptr = 0
    self.path_start_idx = 0
    self.max_size = size
def __init__(self, obs_dim, act_dim, max_size, gamma=0.99, lam=0.95, num_agents=1, shift_advs_pct=0.0):
    """
    Allocate a fixed-size multi-agent experience buffer.

    The total capacity ``max_size`` is split evenly across ``num_agents``:
    each storage array has a leading ``(num_agents, n_size)`` layout where
    ``n_size = max_size // num_agents``.

    Args:
        obs_dim: shape of a single observation.
        act_dim: shape of a single action.
        max_size (int): total number of timesteps across all agents;
            must be divisible by ``num_agents``.
        gamma (float): discount factor.
        lam (float): GAE-lambda parameter.
        num_agents (int): number of parallel agents sharing the buffer.
        shift_advs_pct (float): stored for later use by advantage processing.

    Raises:
        AssertionError: if ``max_size`` is not divisible by ``num_agents``.
    """
    print(f'Max buffer size {max_size} - # agents {num_agents}')
    assert max_size % num_agents == 0
    n_size = max_size // num_agents

    # Per-agent storage: leading axes are (num_agents, n_size).
    self.obs_buf = np.zeros(core.combined_shape(num_agents, core.combined_shape(n_size, obs_dim)), dtype=np.float32)
    self.act_buf = np.zeros(core.combined_shape(num_agents, core.combined_shape(n_size, act_dim)), dtype=np.float32)
    self.adv_buf = np.zeros((num_agents, n_size), dtype=np.float32)
    self.rew_buf = np.zeros((num_agents, n_size), dtype=np.float32)
    self.ret_buf = np.zeros((num_agents, n_size), dtype=np.float32)
    self.val_buf = np.zeros((num_agents, n_size), dtype=np.float32)
    self.logp_buf = np.zeros((num_agents, n_size), dtype=np.float32)

    self.gamma = gamma
    self.lam = lam
    self.num_agents = num_agents
    # Per-agent write cursor and start-of-current-trajectory index.
    # FIX: `np.int` was removed in NumPy 1.24 — use the builtin `int`
    # (which is exactly what `np.int` aliased).
    self.ptr = np.zeros((num_agents,), dtype=int)
    self.path_start_idx = np.zeros((num_agents,), dtype=int)
    self.size = 0
    self.max_size = max_size
    self.n_size = n_size
    self.obs_dim = obs_dim
    self.act_dim = act_dim
    self.shift_advs_pct = shift_advs_pct