Ejemplo n.º 1
0
    def get(self):
        """
        Return everything stored in the buffer as float32 torch tensors.

        Call this at the end of an epoch, once the buffer is full. The
        observation and action buffers are reshaped so that their leading
        dimension has length ``num_agents * n_size``; the return, log-prob
        and advantage buffers are flattened to 1-D. Advantages are then
        shifted and scaled with the two statistics returned by
        ``mpi_statistics_scalar`` (used here as mean and std). The write
        pointers are reset so the next epoch overwrites the stored data.

        Returns:
            dict: keys ``obs``, ``act``, ``ret``, ``adv`` and ``logp``, each
            mapped to a ``torch.float32`` tensor.

        Raises:
            AssertionError: if ``self.size`` differs from ``self.max_size``.
        """
        assert self.size == self.max_size  # buffer has to be full before you can get

        # Reset pointers so next epoch overwrites buffers.
        # The builtin ``int`` replaces the ``np.int`` alias, which NumPy
        # deprecated in 1.20 and removed in 1.24; the resulting dtype is the
        # same one the alias used to select.
        self.size = 0
        self.ptr = np.zeros((self.num_agents,), dtype=int)
        self.path_start_idx = np.zeros((self.num_agents,), dtype=int)

        # Concatenate agents' episodes
        obs_buf = self.obs_buf.reshape(core.combined_shape(self.num_agents * self.n_size, self.obs_dim))
        act_buf = self.act_buf.reshape(core.combined_shape(self.num_agents * self.n_size, self.act_dim))
        ret_buf = self.ret_buf.flatten()
        logp_buf = self.logp_buf.flatten()
        adv_buf = self.adv_buf.flatten()

        # the next two lines implement the advantage normalization trick
        # NOTE(review): no guard against adv_std == 0 here; confirm whether
        # mpi_statistics_scalar can return a zero std for constant advantages.
        adv_mean, adv_std = mpi_statistics_scalar(adv_buf)
        adv_buf = (adv_buf - adv_mean) / adv_std
        data = dict(obs=obs_buf, act=act_buf, ret=ret_buf,
                    adv=adv_buf, logp=logp_buf)

        # TODO: See if we are copying below if we run into memory issues
        return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in data.items()}
Ejemplo n.º 2
0
 def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
     """
     Allocate zero-filled float32 storage for ``size`` steps.

     Args:
         obs_dim: observation dimension(s), passed to
             ``core.combined_shape`` together with ``size``.
         act_dim: action dimension(s), passed to
             ``core.combined_shape`` together with ``size``.
         size: number of steps the buffer can hold; stored as
             ``max_size``.
         gamma: stored as ``self.gamma``.
         lam: stored as ``self.lam``.
     """
     float_t = np.float32

     # Observation and action storage take their shape from the helper.
     self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=float_t)
     self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=float_t)

     # The remaining buffers hold one scalar per step.
     for attr in ("adv_buf", "rew_buf", "ret_buf", "val_buf", "logp_buf"):
         setattr(self, attr, np.zeros(size, dtype=float_t))

     self.gamma = gamma
     self.lam = lam

     # Write position and start of the current path both begin at zero.
     self.ptr = 0
     self.path_start_idx = 0
     self.max_size = size
Ejemplo n.º 3
0
    def __init__(self,
                 obs_dim,
                 act_dim,
                 size,
                 gamma=0.99,
                 lam=0.95,
                 do_masking=False,
                 num_cands=37):
        """
        Allocate zero-filled float32 storage for ``size`` steps.

        Args:
            obs_dim: observation dimension(s), passed to
                ``core.combined_shape`` together with ``size``.
            act_dim: accepted but not used here; the action buffer is
                always allocated with ``core.combined_shape(size, 1)``.
            size: number of steps the buffer can hold; stored as
                ``max_size``.
            gamma: stored as ``self.gamma``.
            lam: stored as ``self.lam``.
            do_masking: when true, ``obs_buf`` is a 2-tuple of arrays
                instead of a single array.
            num_cands: stored as ``possible_pivot_size``; sizes the second
                array of ``obs_buf`` when ``do_masking`` is true.
        """
        self.do_masking = do_masking
        self.possible_pivot_size = num_cands

        float_t = np.float32

        observations = np.zeros(core.combined_shape(size, obs_dim), dtype=float_t)
        if self.do_masking:
            # Second array has one row per step and ``possible_pivot_size``
            # columns (presumably the per-candidate mask - confirm with caller).
            per_candidate = np.zeros(
                core.combined_shape(size, self.possible_pivot_size), dtype=float_t)
            self.obs_buf = (observations, per_candidate)
        else:
            self.obs_buf = observations

        self.act_buf = np.zeros(core.combined_shape(size, 1), dtype=float_t)

        # The remaining buffers hold one scalar per step.
        for attr in ("adv_buf", "rew_buf", "ret_buf", "val_buf", "logp_buf"):
            setattr(self, attr, np.zeros(size, dtype=float_t))

        self.gamma = gamma
        self.lam = lam

        # Write position and start of the current path both begin at zero.
        self.ptr = 0
        self.path_start_idx = 0
        self.max_size = size
Ejemplo n.º 4
0
    def __init__(self, obs_dim, act_dim, max_size, gamma=0.99, lam=0.95,
                 num_agents=1, shift_advs_pct=0.0):
        """
        Allocate zero-filled float32 storage split evenly across agents.

        Every buffer has a leading axis of length ``num_agents`` followed by
        an axis of length ``n_size = max_size // num_agents``.

        Args:
            obs_dim: observation dimension(s), passed to
                ``core.combined_shape`` together with ``n_size``.
            act_dim: action dimension(s), passed to
                ``core.combined_shape`` together with ``n_size``.
            max_size: total number of steps across all agents; must be a
                multiple of ``num_agents``.
            gamma: stored as ``self.gamma``.
            lam: stored as ``self.lam``.
            num_agents: number of per-agent sub-buffers.
            shift_advs_pct: stored as ``self.shift_advs_pct``; not used in
                this constructor.

        Raises:
            AssertionError: if ``max_size`` is not divisible by
                ``num_agents``.
        """
        print(f'Max buffer size {max_size} - # agents {num_agents}')
        assert max_size % num_agents == 0, \
            "max_size must be divisible by num_agents"
        n_size = max_size // num_agents

        self.obs_buf = np.zeros(core.combined_shape(num_agents, core.combined_shape(n_size, obs_dim)), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(num_agents, core.combined_shape(n_size, act_dim)), dtype=np.float32)
        self.adv_buf = np.zeros((num_agents, n_size), dtype=np.float32)
        self.rew_buf = np.zeros((num_agents, n_size), dtype=np.float32)
        self.ret_buf = np.zeros((num_agents, n_size), dtype=np.float32)
        self.val_buf = np.zeros((num_agents, n_size), dtype=np.float32)
        self.logp_buf = np.zeros((num_agents, n_size), dtype=np.float32)
        self.gamma = gamma
        self.lam = lam
        self.num_agents = num_agents
        # One write pointer and one path-start index per agent. The builtin
        # ``int`` replaces the ``np.int`` alias, which NumPy deprecated in
        # 1.20 and removed in 1.24; the resulting dtype is unchanged.
        self.ptr = np.zeros((num_agents,), dtype=int)
        self.path_start_idx = np.zeros((num_agents,), dtype=int)
        self.size = 0
        self.max_size = max_size
        self.n_size = n_size
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.shift_advs_pct = shift_advs_pct