def __init__(self, obs_dim, act_dim, size):
    """Allocate zero-filled float32 storage for ``size`` stored entries.

    Args:
        obs_dim: Observation dimension(s); passed to
            ``core.combined_shape`` together with ``size``.
        act_dim: Action dimension(s); passed to ``core.combined_shape``
            together with ``size``.
        size: Number of entries to allocate; also stored as ``max_size``.
    """
    def _zeros(shape):
        # Every array in this buffer is float32 and starts out as zeros.
        return np.zeros(shape, dtype=np.float32)

    obs_shape = core.combined_shape(size, obs_dim)
    act_shape = core.combined_shape(size, act_dim)

    # Two separate observation arrays with the same shape
    # (presumably current and next observation -- confirm against the
    # store/sample methods).
    self.obs_buf = _zeros(obs_shape)
    self.obs2_buf = _zeros(obs_shape)
    self.act_buf = _zeros(act_shape)

    # One scalar slot per entry.
    self.rew_buf = _zeros(size)
    self.done_buf = _zeros(size)

    # Bookkeeping: write pointer, current fill level, and capacity.
    self.ptr = 0
    self.size = 0
    self.max_size = size
def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
    """Allocate zero-filled float32 arrays for ``size`` steps of experience.

    Args:
        obs_dim: Observation dimension(s); passed to
            ``core.combined_shape`` together with ``size``.
        act_dim: Action dimension(s); passed to ``core.combined_shape``
            together with ``size``.
        size: Number of steps to allocate; also stored as ``max_size``.
        gamma: Stored on the instance as ``self.gamma`` (default 0.99).
        lam: Stored on the instance as ``self.lam`` (default 0.95).
    """
    def _zeros(shape):
        # All buffers share dtype float32 and start zeroed.
        return np.zeros(shape, dtype=np.float32)

    # Arrays whose shape depends on the observation / action dimensions.
    self.obs_buf = _zeros(core.combined_shape(size, obs_dim))
    self.act_buf = _zeros(core.combined_shape(size, act_dim))

    # Flat arrays with one scalar per step.
    self.adv_buf = _zeros(size)
    self.rew_buf = _zeros(size)
    self.ret_buf = _zeros(size)
    self.val_buf = _zeros(size)
    self.logp_buf = _zeros(size)

    self.gamma = gamma
    self.lam = lam

    # Write position, start index of the current path, and capacity.
    self.ptr = 0
    self.path_start_idx = 0
    self.max_size = size
def reset(self):
    """Replace every buffer with a fresh zero array and rewind the pointers.

    Reads ``self.buffer_size``, ``self.obs_dim``, ``self.act_dim`` and
    ``self.batch_size``; these must already be set on the instance.
    Only the buffers, ``ptr`` and ``path_start_idx`` are reassigned here.
    """
    capacity = self.buffer_size

    def _zeros(shape):
        # Experience buffers are float32 and start zeroed.
        return np.zeros(shape, dtype=np.float32)

    # Arrays whose shape depends on the observation / action dimensions.
    self.obs_buf = _zeros(core.combined_shape(capacity, self.obs_dim))
    self.act_buf = _zeros(core.combined_shape(capacity, self.act_dim))

    # Flat arrays with one scalar per buffer slot.
    self.adv_buf = _zeros(capacity)
    self.rew_buf = _zeros(capacity)
    self.ret_buf = _zeros(capacity)
    self.val_buf = _zeros(capacity)
    self.logp_buf = _zeros(capacity)

    # Integer array sized by batch_size rather than buffer_size.
    self.seq_len_buf = np.zeros(self.batch_size, dtype=np.int32)

    self.ptr = 0
    self.path_start_idx = 0
def __init__(self, obs_dim, act_dim, buffer_size, batch_size, gamma=0.99, lam=0.95):
    """Record the buffer dimensions and allocate zero-filled storage.

    Args:
        obs_dim: Observation dimension(s); stored as ``self.obs_dim`` and
            passed to ``core.combined_shape`` with ``buffer_size``.
        act_dim: Action dimension(s); stored as ``self.act_dim`` and
            passed to ``core.combined_shape`` with ``buffer_size``.
        buffer_size: Number of slots in each experience array; stored as
            both ``self.buffer_size`` and ``self.max_size``.
        batch_size: Length of ``seq_len_buf``; stored as ``self.batch_size``.
        gamma: Stored on the instance as ``self.gamma`` (default 0.99).
        lam: Stored on the instance as ``self.lam`` (default 0.95).
    """
    # Keep the construction parameters on the instance.
    self.obs_dim, self.act_dim = obs_dim, act_dim
    self.buffer_size, self.batch_size = buffer_size, batch_size

    def _zeros(shape):
        # Experience buffers are float32 and start zeroed.
        return np.zeros(shape, dtype=np.float32)

    # Arrays whose shape depends on the observation / action dimensions.
    self.obs_buf = _zeros(core.combined_shape(buffer_size, obs_dim))
    self.act_buf = _zeros(core.combined_shape(buffer_size, act_dim))

    # Flat arrays with one scalar per buffer slot.
    self.adv_buf = _zeros(buffer_size)
    self.rew_buf = _zeros(buffer_size)
    self.ret_buf = _zeros(buffer_size)
    self.val_buf = _zeros(buffer_size)
    self.logp_buf = _zeros(buffer_size)

    # Integer array sized by batch_size rather than buffer_size.
    self.seq_len_buf = np.zeros(batch_size, dtype=np.int32)

    self.gamma = gamma
    self.lam = lam

    # Write position, start index of the current path, and capacity.
    self.ptr = 0
    self.path_start_idx = 0
    self.max_size = buffer_size
def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
    """Allocate zero-filled float32 arrays for ``size`` steps of experience.

    Args:
        obs_dim: Observation dimension(s); passed to
            ``core.combined_shape`` together with ``size``.
        act_dim: Action dimension(s); passed to ``core.combined_shape``
            together with ``size``.
        size: Number of steps to allocate; also stored as ``max_size``.
        gamma: Stored on the instance as ``self.gamma`` (default 0.99).
        lam: Stored on the instance as ``self.lam`` (default 0.95).
    """
    def _zeros(shape):
        # All buffers share dtype float32 and start zeroed.
        return np.zeros(shape, dtype=np.float32)

    # size is the batch size. These arrays hold the memory collected while
    # the agent interacts with the environment.
    self.obs_buf = _zeros(core.combined_shape(size, obs_dim))
    self.act_buf = _zeros(core.combined_shape(size, act_dim))
    self.adv_buf = _zeros(size)   # advantage, computed with GAE
    self.rew_buf = _zeros(size)
    self.ret_buf = _zeros(size)   # target value, the target used by the critic
    self.val_buf = _zeros(size)
    self.logp_buf = _zeros(size)

    # The two parameters used when computing GAE: gamma and lambda.
    self.gamma = gamma
    self.lam = lam

    # ptr is the current time step; path_start_idx is the initial time step.
    self.ptr = 0
    self.path_start_idx = 0
    self.max_size = size
def __init__(self, obs_dim, act_dim, size):
    """Allocate zero-filled storage plus min/max arrays for normalization.

    Args:
        obs_dim: Observation dimension(s); passed to
            ``core.combined_shape`` together with ``size``.
        act_dim: Action dimension(s); passed to ``core.combined_shape``
            together with ``size``.
        size: Number of entries to allocate; also stored as ``max_size``.

    Note:
        The min/max arrays are sized from ``shape[1]`` of the observation
        buffers, so those buffers must have at least two axes.
    """
    def _zeros(shape):
        # Every array in this buffer is float32 and starts out as zeros.
        return np.zeros(shape, dtype=np.float32)

    obs_shape = core.combined_shape(size, obs_dim)
    act_shape = core.combined_shape(size, act_dim)

    # Two separate observation arrays with the same shape
    # (presumably current and next observation -- confirm against the
    # store/sample methods).
    self.obs_buf = _zeros(obs_shape)
    self.obs2_buf = _zeros(obs_shape)
    self.act_buf = _zeros(act_shape)

    # One scalar slot per entry.
    self.rew_buf = _zeros(size)
    self.done_buf = _zeros(size)

    # Bookkeeping: write pointer, current fill level, and capacity.
    self.ptr = 0
    self.size = 0
    self.max_size = size

    # Extra initializations for normalization: (n, 1) arrays where n is the
    # second axis length of the matching observation buffer, and (1, 1)
    # arrays for the reward. All start at zero.
    obs_cols = (self.obs_buf.shape[1], 1)
    obs2_cols = (self.obs2_buf.shape[1], 1)
    scalar_cell = (1, 1)

    self.obs_buf_max = _zeros(obs_cols)
    self.obs2_buf_max = _zeros(obs2_cols)
    self.rew_buf_max = _zeros(scalar_cell)

    self.obs_buf_min = _zeros(obs_cols)
    self.obs2_buf_min = _zeros(obs2_cols)
    self.rew_buf_min = _zeros(scalar_cell)