class MeanStdNormalizer(BaseNormalizer):
    def __init__(self, read_only=False, clip=10.0, epsilon=1e-8):
        BaseNormalizer.__init__(self, read_only)
        self.read_only = read_only
        self.rms = None
        self.clip = clip
        self.epsilon = epsilon

    def __call__(self, x):
        from baselines.common.running_mean_std import RunningMeanStd
        x = np.asarray(x)
        if self.rms is None:
            # Lazily create the running statistics once the input shape is known.
            self.rms = RunningMeanStd(shape=(1,) + x.shape[1:])
        if not self.read_only:
            self.rms.update(x)
        return np.clip((x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon),
                       -self.clip, self.clip)

    def state_dict(self):
        return {'mean': self.rms.mean, 'var': self.rms.var}

    def load_state_dict(self, saved):
        self.rms.mean = saved['mean']
        self.rms.var = saved['var']
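# All of the normalizers in this collection lean on RunningMeanStd from
# baselines. For reference, a minimal sketch of that class, using the
# parallel-moments merge of Chan et al. that baselines implements, looks
# roughly like this; treat it as an illustration rather than the exact
# library source.

import numpy as np

class RunningMeanStd:
    # Tracks a running mean and variance over batches of samples.
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        self.count = epsilon  # avoids division by zero before the first update

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]
        # Merge the batch moments into the running moments.
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count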
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=1e-8, use_tf=False):
        VecEnvWrapper.__init__(self, venv)
        if use_tf:
            from baselines.common.running_mean_std import TfRunningMeanStd
            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                           scope='ob_rms') if ob else None
            self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None
        else:
            from baselines.common.running_mean_std import RunningMeanStd
            self.ob_rms = RunningMeanStd(
                shape=self.observation_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
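# A minimal usage sketch for the wrapper above, assuming the baselines
# DummyVecEnv and an old-style Gym environment; the environment id is
# illustrative.

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
import gym
import numpy as np

venv = DummyVecEnv([lambda: gym.make('Pendulum-v0')])
venv = VecNormalize(venv, ob=True, ret=True)

obs = venv.reset()  # normalized observation batch, shape (num_envs, obs_dim)
for _ in range(5):
    actions = np.stack([venv.action_space.sample() for _ in range(venv.num_envs)])
    # obs is normalized and clipped; rews are scaled by the return std.
    obs, rews, news, infos = venv.step(actions)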
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=0):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)

    @property
    def n_active_envs(self):
        return self.venv.n_active_envs

    def set_active_envs(self, active_idx):
        self.venv.set_active_envs(active_idx)
class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        # self.ret_rms = RunningMeanStd(shape=()) if ret else None
        # Returns are never reset on episode boundaries here, so reward
        # normalization is disabled rather than computed from stale returns.
        self.ret_rms = None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        """
        Reset all environments
        """
        obs = self.venv.reset()
        return self._obfilt(obs)
class bVecNormalize(VecEnv):
    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=1e-8):
        VecEnv.__init__(self, observation_space=venv.observation_space,
                        action_space=venv.action_space)
        print('Initializing bullet VecNormalize.')
        self.venv = venv
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(1)  # TODO: use self.num_envs
        self.gamma = gamma
        self.epsilon = epsilon

    def step(self, action):
        return self.step_norm(action)

    def step_norm(self, action):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        # Calls the step() defined on each robot.
        obs, rews, news, infos = self.venv.step(action)
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            # TODO: if ret_rms is not defined, treat this as enjoy (evaluation)
            # mode and skip updating the observation statistics.
            if self.ret_rms:
                self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        obs = self.venv.reset()
        return self._obfilt(obs)

    def set_target(self, target_pos):
        self.venv.set_target(target_pos)

    def get_state(self):
        return self.venv.get_state()
class VecNormalize(VecEnvWrapper):
    def __init__(self, venv, norm_obs=True, norm_reward=True, clip_obs=10.,
                 clip_reward=10., gamma=0.99, epsilon=1e-8):
        """
        A rolling-average, normalizing, vectorized wrapper for an environment base class

        :param venv: ([Gym Environment]) the list of environments to vectorize and normalize
        :param norm_obs: (bool) normalize observations
        :param norm_reward: (bool) normalize rewards with discounting (r = r_old * gamma + r_new)
        :param clip_obs: (float) clipping value for normalized observations
        :param clip_reward: (float) clipping value for normalized rewards
        :param gamma: (float) discount factor
        :param epsilon: (float) epsilon value to avoid arithmetic issues
        """
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if norm_obs else None
        self.ret_rms = RunningMeanStd(shape=()) if norm_reward else None
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rewards, dones, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rewards
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rewards = np.clip(rewards / np.sqrt(self.ret_rms.var + self.epsilon),
                              -self.clip_reward, self.clip_reward)
        return obs, rewards, dones, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clip_obs, self.clip_obs)
            return obs
        else:
            return obs

    def reset(self):
        obs = self.venv.reset()
        return self._obfilt(obs)
class VecNormalize(VecEnvWrapper):
    def __init__(self, venv, visual_obs=True, ret=True, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        # Only the 'visual' entry of the Dict observation space is normalized.
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.spaces['visual'].shape) if visual_obs else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = True

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs['visual'] = self._obfilt(obs['visual'])
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs, update=True):
        if self.ob_rms:
            if self.training and update:
                self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        obs['visual'] = self._obfilt(obs['visual'])
        return obs

    def train(self):
        self.training = True

    def eval(self):
        self.training = False
class VecNormalizeRewards(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the rewards
    from an environment.
    """

    def __init__(self, venv, cliprew=10., gamma=0.99, epsilon=1e-8, eval=False):
        VecEnvWrapper.__init__(self, venv)
        self.ret_rms = RunningMeanStd(shape=())
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.eval = eval

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        if self.ret_rms:
            if not self.eval:
                self.ret_rms.update(self.ret)
            rews = np.clip((rews - self.ret_rms.mean) / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def reset(self):
        print("Env resetting")
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return obs

    def save(self, loc):
        import pickle
        s = {}
        if self.ret_rms:
            s['ret_rms'] = self.ret_rms
        with open(loc + '.env_stat.pkl', 'wb') as f:
            pickle.dump(s, f)

    def load(self, loc):
        import pickle
        with open(loc + '.env_stat.pkl', 'rb') as f:
            s = pickle.load(f)
        if self.ret_rms:
            self.ret_rms = s['ret_rms']
def test_runningmeanstd():
    for (x1, x2, x3) in [
            (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
            (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
    ]:
        rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])

        x = np.concatenate([x1, x2, x3], axis=0)
        ms1 = [x.mean(axis=0), x.var(axis=0)]

        rms.update(x1)
        rms.update(x2)
        rms.update(x3)
        ms2 = [rms.mean, rms.var]

        assert np.allclose(ms1, ms2)
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

    def __init__(self, venv, ob=False, ret=False, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=1e-8):
        # Akhil: the running mean and variance live here so the correct mean
        # and var can be supplied when a model is loaded.
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        obs = self.venv.reset()
        return self._obfilt(obs)
class MeanStdNormalizer:
    def __init__(self, read_only=False, clip=10.0, epsilon=1e-8):
        self.read_only = read_only
        self.rms = None
        self.clip = clip
        self.epsilon = epsilon

    def __call__(self, x):
        x = np.asarray(x)
        if self.rms is None:
            self.rms = RunningMeanStd(shape=(1,) + x.shape[1:])
        if not self.read_only:
            self.rms.update(x)
        return np.clip((x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon),
                       -self.clip, self.clip)
def test_runningmeanstd():
    """Test RunningMeanStd object"""
    for (x_1, x_2, x_3) in [
            (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
            (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))
    ]:
        rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:])

        x_cat = np.concatenate([x_1, x_2, x_3], axis=0)
        moments_1 = [x_cat.mean(axis=0), x_cat.var(axis=0)]

        rms.update(x_1)
        rms.update(x_2)
        rms.update(x_3)
        moments_2 = [rms.mean, rms.var]

        assert np.allclose(moments_1, moments_2)
class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        # Zero the running return on episode boundaries via the (1 - news) mask.
        self.ret = self.ret * self.gamma * (1 - news) + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            tmp = copy.deepcopy(obs)
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            # Keep the last six observation dimensions unnormalized.
            for i in range(len(tmp)):
                obs[i][-6:] = tmp[i][-6:]
            return obs
        else:
            return obs

    def reset(self):
        """
        Reset all environments
        """
        obs = self.venv.reset()
        return self._obfilt(obs)
class Normalize(gym.Wrapper):
    """
    A wrapper that normalizes the observations and returns from an environment.
    """

    def __init__(self, env, clip_ob=10, clip_rew=10, epsilon=1e-8, gamma=0.99):
        super().__init__(env)
        self.clip_ob = clip_ob
        self.clip_rew = clip_rew
        self._reset_rew()
        self.gamma = gamma
        self.epsilon = epsilon
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())

    def step(self, action):
        obs, rew, done, misc = self.env.step(action)
        self.ret = self.ret * self.gamma + rew
        self.ret_rms.update(self.ret)
        rew = np.clip(rew / np.sqrt(self.ret_rms.var + self.epsilon),
                      -self.clip_rew, self.clip_rew)
        if done:
            self._reset_rew()
        obs = self._ob_filter(obs)
        return obs, rew, done, misc

    def reset(self):
        self._reset_rew()
        obs = self.env.reset()
        return self._ob_filter(obs)

    def _ob_filter(self, obs):
        self.ob_rms.update(obs)
        obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                      -self.clip_ob, self.clip_ob)
        return obs

    def _reset_rew(self):
        self.ret = np.zeros((1,), dtype=np.float32)
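# A minimal usage sketch for the single-environment wrapper above; the
# environment id is illustrative, and any old-style Gym environment with
# array observations would work.

import gym

env = Normalize(gym.make('Pendulum-v0'))
obs = env.reset()
for _ in range(100):
    obs, rew, done, misc = env.step(env.action_space.sample())
    if done:
        obs = env.reset()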
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
class Discriminator(nn.Module):
    def __init__(self, state_dim, action_dim, user_dim, device, lr):
        super(Discriminator, self).__init__()
        self.device = device
        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())
        self.label_embedding = nn.Embedding(10, 10)
        self.prefc1 = nn.Linear(user_dim, 25)
        self.linear = nn.Linear(state_dim * 6 + action_dim, 81)
        self.relu = nn.LeakyReLU(0.2, inplace=True)
        self.conv1 = nn.Conv2d(1, 2, 3)
        self.pool = nn.MaxPool2d(2, 1)
        self.conv2 = nn.Conv2d(2, 20, 3)
        self.conv2_bn = nn.BatchNorm2d(20)
        self.fc1 = nn.Linear(180, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 1)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)

    def forward(self, state, user, label):
        label = label.view(label.size(0)).long()
        user = self.prefc1(user).view(user.size(0), -1)
        x = torch.cat((state, user), dim=1).view(state.size(0), -1)
        x = torch.cat((x.view(x.size(0), -1), self.label_embedding(label)), dim=1)
        x = self.relu(self.linear(x))
        x = x.view(x.size(0), 1, 9, 9)
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2_bn(self.conv2(x))))
        x = x.view(-1, 180)
        x = F.leaky_relu(self.fc1(x), 0.2)
        x = F.leaky_relu(self.fc2(x), 0.2)
        x = self.fc3(x)
        # Return raw logits: update() uses binary_cross_entropy_with_logits and
        # predict_reward() applies the sigmoid itself, so applying a sigmoid
        # here as well would squash the output twice.
        return x

    def update(self, expert_loader, rollouts):
        self.train()
        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader, policy_data_generator):
            policy_state, policy_user, policy_action = \
                policy_batch[0], policy_batch[1], policy_batch[2]
            policy_d = self.forward(policy_state, policy_user, policy_action.float())

            expert_state, expert_user, expert_action = expert_batch
            expert_state = expert_state.float().to(self.device)
            expert_user = expert_user.view((expert_user.shape[0], -1)).float().to(self.device)
            expert_action = expert_action.view((expert_state.shape[0], -1)).float().to(self.device)
            expert_d = self.forward(expert_state, expert_user, expert_action)

            expert_loss = F.binary_cross_entropy_with_logits(
                expert_d, torch.ones(expert_d.size()).to(self.device))
            policy_loss = F.binary_cross_entropy_with_logits(
                policy_d, torch.zeros(policy_d.size()).to(self.device))

            gail_loss = expert_loss + policy_loss
            loss += gail_loss.item()
            n += 1

            self.optimizer.zero_grad()
            gail_loss.backward()
            self.optimizer.step()
        return loss / n

    def predict_reward(self, state, user, action, gamma, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.forward(state, user, action.float())
            s = torch.sigmoid(d)
            reward = s.log() - (1 - s).log()
            if self.returns is None:
                self.returns = reward.clone()
            if update_rms:
                self.returns = self.returns * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=1e-8, use_tf=False):
        VecEnvWrapper.__init__(self, venv)
        if use_tf:
            from baselines.common.running_mean_std import TfRunningMeanStd
            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                           scope='ob_rms') if ob else None
            self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None
        else:
            from baselines.common.running_mean_std import RunningMeanStd
            self.ob_rms = RunningMeanStd(
                shape=self.observation_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def step_wait_collisions(self):
        """
        Same as step_wait, but also passes through a per-env collision vector.
        """
        obs, rews, news, collisions, infos = self.venv.step_wait_collisions()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, collisions, infos

    def step_wait_runtime(self):
        obs, rews, news, infos = self.venv.step_wait_runtime()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def _obfilt_run(self, obs):
        # Inverse transform: map normalized observations back to the original scale.
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip(self.ob_rms.mean + obs * np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
        return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
class Discriminator(nn.Module):
    def __init__(self, input_dim, hidden_dim, device, reward_type, update_rms,
                 cliprew_down=-10.0, cliprew_up=10.0):
        super(Discriminator, self).__init__()
        self.cliprew_down = cliprew_down
        self.cliprew_up = cliprew_up
        self.device = device
        self.reward_type = reward_type
        self.update_rms = update_rms
        # An earlier variant ended the trunk with an extra nn.Tanh().
        self.trunk = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, 1)).to(device)
        self.trunk.train()
        self.optimizer = torch.optim.Adam(self.trunk.parameters())
        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

    def compute_grad_pen(self, expert_state, expert_action, policy_state,
                         policy_action, lambda_=10):
        alpha = torch.rand(expert_state.size(0), 1)
        expert_data = torch.cat([expert_state, expert_action], dim=1)
        policy_data = torch.cat([policy_state, policy_action], dim=1)

        alpha = alpha.expand_as(expert_data).to(expert_data.device)
        mixup_data = alpha * expert_data + (1 - alpha) * policy_data
        mixup_data.requires_grad = True

        disc = self.trunk(mixup_data)
        ones = torch.ones(disc.size()).to(disc.device)
        grad = autograd.grad(outputs=disc, inputs=mixup_data,
                             grad_outputs=ones, create_graph=True,
                             retain_graph=True, only_inputs=True)[0]

        grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean()
        return grad_pen

    def update_zm(self, replay_buf, expert_buf, obsfilt=None, batch_size=128):
        self.train()

        obs = replay_buf.obs
        obs_batch = obs[:-1].view(-1, *obs.size()[2:])
        states = obs_batch.cpu().detach().numpy()
        actions = replay_buf.actions
        actions_batch = actions.view(-1, actions.size(-1))
        actions = actions_batch.cpu().detach().numpy()
        policy_buf = Dset(inputs=states[0:len(actions)], labels=actions, randomize=True)

        loss = 0
        g_loss = 0.0
        gp = 0.0
        n = 0

        # Sample replay buffer
        policy_state, policy_action = policy_buf.get_next_batch(batch_size)
        policy_state = torch.FloatTensor(policy_state).to(self.device)
        policy_action = torch.FloatTensor(policy_action).to(self.device)
        policy_d = self.trunk(torch.cat([policy_state, policy_action], dim=1))

        # Sample expert buffer
        expert_state, expert_action = expert_buf.get_next_batch(batch_size)
        expert_state = obsfilt(expert_state, update=False)
        expert_state = torch.FloatTensor(expert_state).to(self.device)
        expert_action = torch.FloatTensor(expert_action).to(self.device)
        expert_d = self.trunk(torch.cat([expert_state, expert_action], dim=1))

        # WGAN-style objective on tanh-squashed scores; the BCE (vanilla GAIL)
        # variant is kept in update_zm_origin below.
        expert_loss = torch.mean(torch.tanh(expert_d)).to(self.device)
        policy_loss = torch.mean(torch.tanh(policy_d)).to(self.device)
        wd = expert_loss - policy_loss
        grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                         policy_state, policy_action)

        loss += (-wd + grad_pen).item()
        g_loss += wd.item()
        gp += grad_pen.item()
        n += 1

        self.optimizer.zero_grad()
        (-wd + grad_pen).backward()
        self.optimizer.step()
        return g_loss / n, gp / n, 0.0, loss / n

    def update(self, expert_loader, rollouts, obsfilt=None):
        self.train()
        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        g_loss = 0.0
        gp = 0.0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader, policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            policy_d = self.trunk(torch.cat([policy_state, policy_action], dim=1))

            expert_state, expert_action = expert_batch
            expert_state = obsfilt(expert_state.numpy(), update=False)
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_d = self.trunk(torch.cat([expert_state, expert_action], dim=1))

            # WGAN-style objective on tanh-squashed scores.
            expert_loss = torch.mean(torch.tanh(expert_d)).to(self.device)
            policy_loss = torch.mean(torch.tanh(policy_d)).to(self.device)
            wd = expert_loss - policy_loss
            grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                             policy_state, policy_action)

            loss += (-wd + grad_pen).item()
            g_loss += wd.item()
            gp += grad_pen.item()
            n += 1

            self.optimizer.zero_grad()
            (-wd + grad_pen).backward()
            self.optimizer.step()
        return g_loss / n, gp / n, 0.0, loss / n

    def update_origin(self, expert_loader, rollouts, obsfilt=None):
        # Same as update(), but without the tanh squashing on the scores.
        self.train()
        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        g_loss = 0.0
        gp = 0.0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader, policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            policy_d = self.trunk(torch.cat([policy_state, policy_action], dim=1))

            expert_state, expert_action = expert_batch
            expert_state = obsfilt(expert_state.numpy(), update=False)
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_d = self.trunk(torch.cat([expert_state, expert_action], dim=1))

            expert_loss = torch.mean(expert_d).to(self.device)
            policy_loss = torch.mean(policy_d).to(self.device)
            wd = expert_loss - policy_loss
            grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                             policy_state, policy_action)

            loss += (-wd + grad_pen).item()
            g_loss += wd.item()
            gp += grad_pen.item()
            n += 1

            self.optimizer.zero_grad()
            (-wd + grad_pen).backward()
            self.optimizer.step()
        return g_loss / n, gp / n, 0.0, loss / n

    def update_zm_origin(self, replay_buf, expert_buf, obsfilt=None, batch_size=128):
        self.train()

        obs = replay_buf.obs
        obs_batch = obs[:-1].view(-1, *obs.size()[2:])
        states = obs_batch.cpu().detach().numpy()
        actions = replay_buf.actions
        actions_batch = actions.view(-1, actions.size(-1))
        actions = actions_batch.cpu().detach().numpy()
        policy_buf = Dset(inputs=states[0:len(actions)], labels=actions, randomize=True)

        # Sample replay buffer
        policy_state, policy_action = policy_buf.get_next_batch(batch_size)
        policy_state = torch.FloatTensor(policy_state).to(self.device)
        policy_action = torch.FloatTensor(policy_action).to(self.device)
        policy_d = self.trunk(torch.cat([policy_state, policy_action], dim=1))

        # Sample expert buffer
        expert_state, expert_action = expert_buf.get_next_batch(batch_size)
        expert_state = obsfilt(expert_state, update=False)
        expert_state = torch.FloatTensor(expert_state).to(self.device)
        expert_action = torch.FloatTensor(expert_action).to(self.device)
        expert_d = self.trunk(torch.cat([expert_state, expert_action], dim=1))

        # Vanilla GAIL objective: BCE with expert labeled 1 and policy labeled 0.
        expert_loss = F.binary_cross_entropy_with_logits(
            expert_d, torch.ones(expert_d.size()).to(self.device))
        policy_loss = F.binary_cross_entropy_with_logits(
            policy_d, torch.zeros(policy_d.size()).to(self.device))

        gail_loss = expert_loss + policy_loss
        grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                         policy_state, policy_action)
        loss = (gail_loss + grad_pen).item()

        self.optimizer.zero_grad()
        (gail_loss + grad_pen).backward()
        self.optimizer.step()
        return gail_loss.item(), grad_pen.item(), 0.0, loss

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            # Map the discriminator score to a reward according to reward_type.
            if self.reward_type == 0:
                reward = torch.exp(d)
            elif self.reward_type == 1:
                s = torch.sigmoid(d)
                reward = -(1 - s).log()
            elif self.reward_type == 2:
                reward = torch.sigmoid(d)
            elif self.reward_type == 3:
                reward = torch.sigmoid(d).exp()
            elif self.reward_type == 4:
                reward = d
            elif self.reward_type == 5:
                s = torch.sigmoid(d)
                reward = s.log() - (1 - s).log()

            if self.returns is None:
                self.returns = reward.clone()

            if self.update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
                return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
            else:
                return reward

    def predict_reward_exp(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            reward = torch.exp(d)

            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())

            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)

    def predict_reward_t1(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            s = torch.sigmoid(d)
            reward = -(1 - s).log()

            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())

            return reward

    def predict_reward_origin(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            reward = torch.exp(d)

            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())

            return reward
class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.spaces[0].shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs_tuple, rews, news, infos = self.venv.step_wait()
        obs_img, obs_measure = self.process_obs(obs_tuple)
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs_img)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        return obs, obs_measure, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        """
        Reset all environments
        """
        obs_tuple = self.venv.reset()
        obs_img, obs_measure = self.process_obs(obs_tuple)
        return self._obfilt(obs_img), obs_measure

    def process_obs(self, obs_tuple):
        # Split per-env (image, measurement) observation pairs into two arrays.
        obs_tuple = np.array(obs_tuple)
        obs_img = []
        obs_measure = []
        for i in range(obs_tuple.shape[0]):
            obs_img.append(obs_tuple[i][0])
            obs_measure.append(obs_tuple[i][1])
        return np.array(obs_img), np.array(obs_measure)
class CNNBase(NNBase):
    def __init__(self, num_inputs, input_size, action_space, hidden_size=512,
                 embed_size=0, recurrent=False, device='cpu'):
        super(CNNBase, self).__init__(recurrent, num_inputs, hidden_size, embed_size)
        self.device = device
        self.action_space = action_space
        h, w = input_size

        self.conv1 = nn.Conv2d(num_inputs, 32, kernel_size=8, stride=4)
        w_out = conv2d_size_out(w, kernel_size=8, stride=4)
        h_out = conv2d_size_out(h, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        w_out = conv2d_size_out(w_out, kernel_size=4, stride=2)
        h_out = conv2d_size_out(h_out, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, kernel_size=3, stride=1)
        w_out = conv2d_size_out(w_out, kernel_size=3, stride=1)
        h_out = conv2d_size_out(h_out, kernel_size=3, stride=1)

        init_cnn_ = lambda m: init(m, nn.init.orthogonal_,
                                   lambda x: nn.init.constant_(x, 0),
                                   nn.init.calculate_gain('relu'))
        self.cnn_trunk = nn.Sequential(
            init_cnn_(self.conv1), nn.ReLU(),
            init_cnn_(self.conv2), nn.ReLU(),
            init_cnn_(self.conv3), nn.ReLU(), Flatten(),
            init_cnn_(nn.Linear(32 * h_out * w_out, hidden_size)), nn.ReLU())

        init__ = lambda m: init(m, nn.init.orthogonal_,
                                lambda x: nn.init.constant_(x, 0), np.sqrt(2))
        self.trunk = nn.Sequential(
            init__(nn.Linear(hidden_size + self.action_space.n + embed_size,
                             hidden_size // 2)), nn.Tanh(),
            init__(nn.Linear(hidden_size // 2, hidden_size // 2)), nn.Tanh(),
            init__(nn.Linear(hidden_size // 2, 1)))

        # RMSprop is used to be consistent with the WGAN optimizer, although
        # it is not strictly necessary.
        self.optimizer = torch.optim.RMSprop(self.parameters(), lr=5e-5)
        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

    def update(self, expert_loader, rollouts, discr_queue, max_grad_norm,
               obsfilt, i_iter=0):
        self.train()
        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        for expert_batch, policy_batch in zip(expert_loader, policy_data_generator):
            policy_state, policy_action, correlated_embeddings = \
                policy_batch[0], policy_batch[2], policy_batch[3]
            loss = torch.tensor(0.0).to(device)

            # Iterate through strategies in the queue. The parameters are loaded
            # in order, so we end up with the latest ones, which the optimizer updates.
            if len(discr_queue) < 1:
                return copy.deepcopy(self.state_dict())
            for strategy in discr_queue:
                self.load_state_dict(strategy)
                policy_state_embedding = self.cnn_trunk(policy_state / 255.0)
                policy_d = self.trunk(
                    torch.cat([
                        policy_state_embedding,
                        torch.nn.functional.one_hot(
                            policy_action, self.action_space.n).squeeze(1).float(),
                        correlated_embeddings
                    ], dim=1))

                expert_state, expert_action = expert_batch
                expert_state = torch.FloatTensor(expert_state).to(self.device)
                expert_action = expert_action.to(self.device)
                expert_state_embedding = self.cnn_trunk(expert_state / 255.)
                expert_d = self.trunk(
                    torch.cat([
                        expert_state_embedding, expert_action,
                        correlated_embeddings
                    ], dim=1))

                expert_loss = -expert_d.mean()
                policy_loss = policy_d.mean()
                loss = loss + expert_loss + policy_loss

            loss = loss / len(discr_queue)
            self.optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.parameters(), max_grad_norm)
            self.optimizer.step()
        return copy.deepcopy(self.state_dict())

    def predict_strategy_reward(self, state, action, embedding, gamma, masks, update_rms):
        with torch.no_grad():
            self.eval()
            state_embedding = self.cnn_trunk(state / 255.)
            d = self.trunk(
                torch.cat([
                    state_embedding,
                    torch.nn.functional.one_hot(
                        action, self.action_space.n).squeeze(1).float(),
                    embedding
                ], dim=1))
            reward = d
            if self.returns is None:
                self.returns = reward.clone()
            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)

    def predict_reward(self, state, action, embedding, gamma, masks,
                       discr_queue, update_rms=True):
        """
        :param state:
        :param action:
        :param gamma:
        :param masks:
        :param discr_queue:
        :param update_rms:
        :return: actor_reward and the pairwise gains
        """
        actor_reward = gains = 0.0
        strategy_rewards = []
        if len(discr_queue) > 0:
            for strategy in discr_queue:
                self.load_state_dict(strategy)
                reward = self.predict_strategy_reward(state, action, embedding,
                                                      gamma, masks, update_rms)
                strategy_rewards.append(reward)

            # The gain is used by the correlator to compute the maxEnt corEQ loss.
            # It quantifies how much overall gain would be achieved by switching strategies.
            for i in range(len(strategy_rewards)):
                for j in range(i + 1, len(strategy_rewards)):
                    gains = gains - torch.pow(
                        strategy_rewards[i] - strategy_rewards[j], 2)
            gains = gains / (len(discr_queue) * len(discr_queue) / 4)
            actor_reward = strategy_rewards[-1]
        return actor_reward, gains
class MLPBase(NNBase):
    def __init__(self, num_inputs, input_size, action_space, hidden_size=64,
                 embed_size=0, recurrent=False, device='cpu'):
        super(MLPBase, self).__init__(recurrent, num_inputs, hidden_size, embed_size)
        self.device = device
        if recurrent:
            num_inputs = hidden_size

        init__ = lambda m: init(m, nn.init.orthogonal_,
                                lambda x: nn.init.constant_(x, 0), np.sqrt(2))
        self.trunk = nn.Sequential(
            init__(nn.Linear(num_inputs + action_space.shape[0] + embed_size,
                             hidden_size)), nn.Tanh(),
            init__(nn.Linear(hidden_size, hidden_size)), nn.Tanh(),
            init__(nn.Linear(hidden_size, 1)))

        self.optimizer = torch.optim.RMSprop(self.parameters(), lr=5e-5)
        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())
        self.train()

    def update(self, expert_loader, rollouts, discr_queue, max_grad_norm,
               obsfilt, i_iter=0):
        self.train()
        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        for expert_batch, policy_batch in zip(expert_loader, policy_data_generator):
            policy_state, policy_action, embeddings = \
                policy_batch[0], policy_batch[2], policy_batch[3]
            loss = torch.tensor(0.0).to(device)

            # Iterate through strategies in the queue. The parameters are loaded
            # in order, so we end up with the latest ones, which the optimizer updates.
            if len(discr_queue) < 1:
                return copy.deepcopy(self.state_dict())
            for strategy in discr_queue:
                self.load_state_dict(strategy)
                policy_d = self.trunk(
                    torch.cat([policy_state, policy_action, embeddings], dim=1))

                expert_state, expert_action = expert_batch
                expert_state = obsfilt(expert_state.numpy(), update=False)
                expert_state = torch.FloatTensor(expert_state).to(self.device)
                expert_action = expert_action.to(self.device)
                expert_d = self.trunk(
                    torch.cat([expert_state, expert_action, embeddings], dim=1))

                expert_loss = -expert_d.mean()
                policy_loss = policy_d.mean()
                loss = loss + expert_loss + policy_loss

            loss = loss / len(discr_queue)
            self.optimizer.zero_grad()
            loss.backward()
            # Not strictly necessary, but gradient clipping is used consistently
            # across all NN modules.
            nn.utils.clip_grad_norm_(self.parameters(), max_grad_norm)
            self.optimizer.step()
        return copy.deepcopy(self.state_dict())

    def predict_strategy_reward(self, state, action, embedding, gamma, masks, update_rms):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action, embedding], dim=1))
            reward = d
            if self.returns is None:
                self.returns = reward.clone()
            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)

    def predict_reward(self, state, action, embedding, gamma, masks,
                       discr_queue, update_rms=True):
        """
        :param state:
        :param action:
        :param gamma:
        :param masks:
        :param discr_queue:
        :param update_rms:
        :return: actor_reward and the pairwise gains
        """
        actor_reward = gains = 0.0
        strategy_rewards = []
        if len(discr_queue) > 0:
            for strategy in discr_queue:
                self.load_state_dict(strategy)
                reward = self.predict_strategy_reward(state, action, embedding,
                                                      gamma, masks, update_rms)
                strategy_rewards.append(reward)

            # The gain is used by the correlator to compute the maxEnt corEQ loss.
            # It quantifies how much overall gain would be achieved by switching strategies.
            for i in range(len(strategy_rewards)):
                for j in range(i + 1, len(strategy_rewards)):
                    gains = gains - torch.pow(
                        strategy_rewards[i] - strategy_rewards[j], 2)
            gains = gains / (len(discr_queue) * len(discr_queue) / 4)
            actor_reward = strategy_rewards[-1]
        return actor_reward, gains
class Discriminator(nn.Module):
    """
    Modified GAIL Discriminator to handle graph state and composite actions
    """

    def __init__(self, input_dim, hidden_dim, device):
        super(Discriminator, self).__init__()
        self.device = device
        self.encoder = WoBObservationEncoder(out_dim=hidden_dim)
        self.trunk_fn = nn.Sequential(nn.Linear(hidden_dim, 1))
        self.train()
        self.to(device)
        self.optimizer = torch.optim.Adam(self.parameters())
        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

    def forward(self, inputs, votes):
        x = self.encoder(inputs, votes)
        return self.trunk_fn(x)

    def trunk(self, state, action):
        # Turn per-graph action indices into a one-hot "vote" vector over nodes.
        batch_size = state.batch.max().item() + 1
        votes = torch.zeros(state.value.shape[0], 1)
        past = 0
        for b_idx in range(batch_size):
            _m = state.batch == b_idx
            _size = _m.sum().item()
            votes[past + action[b_idx], 0] = 1
            past += _size
        return self.forward(state, votes)

    def compute_grad_pen(self, expert_state, expert_action, policy_state,
                         policy_action, lambda_=10):
        # Merge the expert and policy graphs, and apply alpha to the vote shares.
        mixup_state = Batch()
        for key, value in expert_state:
            assert isinstance(key, str), str(key)
            if key in ("edge_index", "edge_attr"):
                continue
            mixup_state[key] = torch.cat([expert_state[key], policy_state[key]])
        mixup_state.edge_index = torch.cat(
            [
                expert_state.edge_index,
                policy_state.edge_index + expert_state.batch.shape[0],
            ],
            dim=1,
        )

        alpha = torch.rand(expert_action.size(0))
        batch_size = expert_state.batch.max().item() + 1
        mixup_votes = []
        for i in range(batch_size):
            _em = expert_state.batch == i
            _pm = policy_state.batch == i
            votes = torch.zeros((_em.sum() + _pm.sum()).item())
            assert votes.shape[0]
            votes[expert_action[i]] = alpha[i]
            votes[policy_action[i] + _em.sum().item()] = 1 - alpha[i]
            mixup_votes.append(votes)
        mixup_action = torch.cat(mixup_votes).view(-1, 1)
        mixup_action.requires_grad = True

        disc = self.forward(mixup_state, mixup_action)
        ones = torch.ones(disc.size()).to(disc.device)
        inputs = [mixup_action]
        for key, value in mixup_state:
            if value.dtype == torch.float:
                value.requires_grad = True
                inputs.append(value)
        grad = autograd.grad(
            outputs=disc,
            inputs=inputs,
            grad_outputs=ones,
            create_graph=True,
            retain_graph=True,
            only_inputs=True,
            allow_unused=True,
        )[0]

        grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean()
        return grad_pen

    def update(self, expert_loader, rollouts, obsfilt=None):
        self.train()
        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)
        assert len(expert_loader)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader, policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            batch_size = policy_state.batch.max().item() + 1
            policy_d = self.trunk(policy_state, policy_action)

            expert_state, expert_action, _ = expert_batch
            # expert_state = obsfilt(expert_state.numpy(), update=False)
            # expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_d = self.trunk(expert_state, expert_action)

            expert_loss = F.binary_cross_entropy_with_logits(
                expert_d, torch.ones(expert_d.size()).to(self.device))
            policy_loss = F.binary_cross_entropy_with_logits(
                policy_d, torch.zeros(policy_d.size()).to(self.device))

            gail_loss = expert_loss + policy_loss
            grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                             policy_state, policy_action)

            loss += (gail_loss + grad_pen).item()
            n += 1

            self.optimizer.zero_grad()
            (gail_loss + grad_pen).backward()
            self.optimizer.step()
        return loss / n

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(state, action)
            s = torch.sigmoid(d)
            reward = s.log() - (1 - s).log()
            if self.returns is None:
                self.returns = reward.clone()
            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """

    def __init__(self, venv, ob=False, ret=True, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.hier = self.venv.hier
        if self.hier:
            # In hierarchical mode only the second element of the Tuple space
            # is normalized; the first element carries tokens.
            obs_space = self.observation_space.spaces[1]
            self.ob_rms = RunningMeanStd(shape=obs_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        else:
            self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        if self.hier:
            tokens, obs = obs
            obs = self._obfilt(obs)
            obs = (tokens, obs)
        else:
            obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def goal(self, obs):
        return self.venv.goal(obs)

    def action(self, obs):
        return self.venv.action(obs)

    def final_obs(self):
        return self.venv.obs_from_buf_final()

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def tf_filt(self, obs_tf):
        # TensorFlow counterpart of _obfilt, for use inside a graph.
        if self.ob_rms:
            obs_tf = tf.clip_by_value(
                (obs_tf - self.ob_rms.mean) /
                tf.cast(np.sqrt(self.ob_rms.var + self.epsilon), tf.float32),
                -self.clipob, self.clipob)
            return obs_tf
        else:
            return obs_tf

    def reset(self):
        """
        Reset all environments
        """
        obs = self.venv.reset()
        if self.hier:
            return obs[0], self._obfilt(obs[1])
        else:
            return self._obfilt(obs)
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        if isinstance(self.observation_space, Dict):
            # Keep one running-statistics tracker per key of a Dict observation space.
            self.ob_rms = {}
            for key in self.observation_space.spaces.keys():
                self.ob_rms[key] = RunningMeanStd(
                    shape=self.observation_space.spaces[key].shape) if ob else None
        else:
            self.ob_rms = RunningMeanStd(
                shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        def _filt(obs, ob_rms):
            if ob_rms:
                ob_rms.update(obs)
                obs = np.clip((obs - ob_rms.mean) / np.sqrt(ob_rms.var + self.epsilon),
                              -self.clipob, self.clipob)
                return obs
            else:
                return obs

        if isinstance(self.ob_rms, dict):
            for key in self.ob_rms:
                obs[key] = _filt(obs[key], self.ob_rms[key])
        else:
            obs = _filt(obs, self.ob_rms)
        return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)

    def save_state(self, save_path):
        """
        Pickle and save the normalization state variables
        """
        state = {'ob_rms': self.ob_rms, 'ret_rms': self.ret_rms}
        with open(save_path, 'wb') as f:
            pickle.dump(state, f)

    def restore_state(self, load_path):
        """
        Unpickle and restore the normalization state variables
        """
        with open(load_path, 'rb') as f:
            state = pickle.load(f)
        self.ob_rms = state['ob_rms']
        self.ret_rms = state['ret_rms']

    def get_obs(self):
        return self._obfilt(self.venv.get_obs())
class Discriminator(nn.Module):
    def __init__(self, input_dim, hidden_dim, device):
        super(Discriminator, self).__init__()
        self.device = device
        self.trunk = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, 1)).to(device)
        self.trunk.train()
        self.optimizer = torch.optim.Adam(self.trunk.parameters())
        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

    def compute_grad_pen(self, expert_state, expert_action, policy_state,
                         policy_action, lambda_=10):
        alpha = torch.rand(expert_state.size(0), 1)
        expert_data = torch.cat([expert_state, expert_action], dim=1)
        policy_data = torch.cat([policy_state, policy_action], dim=1)

        alpha = alpha.expand_as(expert_data).to(expert_data.device)
        mixup_data = alpha * expert_data + (1 - alpha) * policy_data
        mixup_data.requires_grad = True

        disc = self.trunk(mixup_data)
        ones = torch.ones(disc.size()).to(disc.device)
        grad = autograd.grad(outputs=disc, inputs=mixup_data,
                             grad_outputs=ones, create_graph=True,
                             retain_graph=True, only_inputs=True)[0]

        grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean()
        return grad_pen

    def update(self, expert_loader, rollouts, obsfilt=None):
        self.train()
        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader, policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            policy_d = self.trunk(torch.cat([policy_state, policy_action], dim=1))

            expert_state, expert_action = expert_batch
            expert_state = obsfilt(expert_state.numpy(), update=False)
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_d = self.trunk(torch.cat([expert_state, expert_action], dim=1))

            expert_loss = F.binary_cross_entropy_with_logits(
                expert_d, torch.ones(expert_d.size()).to(self.device))
            policy_loss = F.binary_cross_entropy_with_logits(
                policy_d, torch.zeros(policy_d.size()).to(self.device))

            gail_loss = expert_loss + policy_loss
            grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                             policy_state, policy_action)

            loss += (gail_loss + grad_pen).item()
            n += 1

            self.optimizer.zero_grad()
            (gail_loss + grad_pen).backward()
            self.optimizer.step()
        return loss / n

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            s = torch.sigmoid(d)
            reward = s.log() - (1 - s).log()
            if self.returns is None:
                self.returns = reward.clone()
            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
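# A rough sketch of how a discriminator like the one above is typically driven
# in a GAIL training loop. Names such as discr, gail_epochs, num_steps, gamma,
# and the rollouts storage are assumed from the surrounding training code, as
# in the update() signature above.

# Train the discriminator against the latest rollouts, then relabel rewards.
for _ in range(gail_epochs):
    disc_loss = discr.update(expert_loader, rollouts, obsfilt)

# Replace the environment reward with the discriminator reward before the
# policy (e.g. PPO) update.
for step in range(num_steps):
    rollouts.rewards[step] = discr.predict_reward(
        rollouts.obs[step], rollouts.actions[step], gamma,
        rollouts.masks[step])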
class AIL():
    def __init__(self, observation_space, action_space, device, args, log_only=False):
        super(AIL, self).__init__()
        if log_only:
            self.m_return_list = self.load_expert_data(args)
            return
        self.lr = args.il_lr  # larger learning rate for MLP
        self.action_dim = action_space.shape[0]
        self.hidden_dim = 100
        self.state_dim = observation_space.shape[0]
        self.device = device  # needed by create_networks() below
        self.create_networks()
        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())
        self.gail_batch_size = args.gail_batch_size
        self.label_expert = 1
        self.label_policy = -1
        self.reward_std = args.reward_std
        self.gp_lambda = args.gp_lambda
        self.m_return_list = self.make_dataset(args)
        if args.ail_saturate is None and args.ail_loss_type != "unhinged":
            args.ail_saturate = 1
        if args.ail_loss_type == "logistic":
            self.adversarial_loss = Logistic_Loss()
        elif args.ail_loss_type == "unhinged":
            self.adversarial_loss = Unhinged_Loss()
            if args.ail_saturate is None:
                args.ail_saturate = 0
        elif args.ail_loss_type == "sigmoid":
            self.adversarial_loss = Sigmoid_Loss()
        elif args.ail_loss_type == "nlogistic":
            self.adversarial_loss = Normalized_Logistic_Loss()
        elif args.ail_loss_type == "apl":
            self.adversarial_loss = APL_Loss()
        self.ail_saturate = args.ail_saturate

    def create_networks(self):
        self.trunk = Discriminator(self.state_dim + self.action_dim,
                                   self.hidden_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.trunk.parameters(), lr=self.lr)

    def make_dataset(self, args):
        # h5py demos are loaded into tensors; pass the per-model returns through.
        m_return_list = self.load_expert_data(args)
        expert_dataset = data_utils.TensorDataset(self.real_state_tensor,
                                                  self.real_action_tensor)
        drop_last = len(expert_dataset) > self.gail_batch_size
        self.expert_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=self.gail_batch_size,
            shuffle=True,  # important to shuffle the dataset.
            drop_last=drop_last)
        return m_return_list

    def load_expert_data(self, args, verbose=1):
        # also load non-expert data
        model_list = [1.0]
        if args.noise_prior != 0.0:
            model_list += [0.4, 0.3, 0.2, 0.1, 0.0]
        traj_deterministic = args.traj_deterministic
        demo_file_size = 10000
        self.index_worker_idx = []
        m_return_list = []
        index_start = 0
        expert_state_list, expert_action_list, expert_nstate_list, \
            expert_reward_list, expert_mask_list, expert_id_list = [], [], [], [], [], []
        traj_path = "./imitation_data/%s" % (args.env_name)
        for model_i in range(0, len(model_list)):
            m = model_list[model_i]
            if args.noise_type == "policy":  # policy noise (sub-optimal policies)
                traj_filename = traj_path + ("/%s_TRAJ-N%d_P%0.1f" %
                                             (args.env_name, demo_file_size, m))
            elif args.noise_type == "action":  # action noise
                traj_filename = traj_path + ("/%s_TRAJ-N%d_A%0.1f" %
                                             (args.env_name, demo_file_size, m))
            if traj_deterministic:
                traj_filename += "_det"
            else:
                traj_filename += "_sto"
            hf = h5py.File(traj_filename + ".h5", 'r')
            expert_mask = hf.get('mask_array')[:]
            expert_state = hf.get('obs_array')[:]
            # expert_nstate = hf.get('nobs_array')[:]
            expert_action = hf.get('act_array')[:]
            expert_reward = hf.get('reward_array')[:]
            step_num = expert_mask.shape[0]
            traj_num = step_num - np.sum(expert_mask)
            m_return = np.sum(expert_reward) / traj_num
            m_return_list += [m_return]
            expert_id = np.ones((expert_mask.shape[0], 1)) * model_i
            if m != 1.0 and args.noise_prior != -1.0:
                if args.noise_prior == 0.5:
                    pair_num = 2000  # 10000 / 5
                if args.noise_prior == 0.4:
                    pair_num = 1500
                if args.noise_prior == 0.3:
                    pair_num = 1000
                if args.noise_prior == 0.2:
                    pair_num = 500
                if args.noise_prior == 0.1:
                    pair_num = 200
                if args.demo_sub_traj:
                    sub_num = pair_num // 50  # each sub traj has 50 sa-pairs.
                    index = []
                    # Data is split into sub_num chunks, and we randomly sample
                    # 50 consecutive pairs from each chunk.
                    chunk_size = demo_file_size // sub_num
                    indexes_start = np.random.randint(0, chunk_size - 50, size=sub_num)
                    for i in range(0, sub_num):
                        ii = indexes_start[i] + (i * chunk_size)
                        index.append(np.arange(ii, ii + 50))
                    index = np.hstack(index)
                else:
                    index = np.random.permutation(demo_file_size)[:pair_num]
                expert_mask = expert_mask[index]
                expert_state = expert_state[index]
                expert_action = expert_action[index]
                # expert_nstate = expert_nstate[index]  # next state is not used.
                expert_reward = expert_reward[index]
                expert_id = expert_id[index]
            self.index_worker_idx += [
                index_start + np.arange(0, expert_mask.shape[0])
            ]
            index_start += expert_mask.shape[0]
            expert_mask_list += [expert_mask]
            expert_state_list += [expert_state]
            expert_action_list += [expert_action]
            # expert_nstate_list += [expert_nstate]
            expert_reward_list += [expert_reward]
            expert_id_list += [expert_id]
            if verbose:
                print("%s TRAJ is loaded from %s with full_size %s: using data size %s steps and average return %s" % \
                    (colored(args.noise_type, p_color), colored(traj_filename, p_color),
                     colored(step_num, p_color), colored(expert_state.shape[0], p_color),
                     colored("%.2f" % (m_return), p_color)))
        expert_masks = np.concatenate(expert_mask_list, axis=0)
        expert_states = np.concatenate(expert_state_list, axis=0)
        expert_actions = np.concatenate(expert_action_list, axis=0)
        # expert_nstates = np.concatenate(expert_nstate_list, axis=0)
        expert_rewards = np.concatenate(expert_reward_list, axis=0)
        expert_ids = np.concatenate(expert_id_list, axis=0)
        self.real_mask_tensor = torch.FloatTensor(expert_masks).to(device_cpu)
        self.real_state_tensor = torch.FloatTensor(expert_states).to(device_cpu)
        self.real_action_tensor = torch.FloatTensor(expert_actions).to(device_cpu)
        # self.real_nstate_tensor = torch.FloatTensor(expert_nstates).to(device_cpu)
        self.real_id_tensor = torch.LongTensor(expert_ids).to(device_cpu)
        self.data_size = self.real_state_tensor.size(0)
        self.worker_num = torch.unique(self.real_id_tensor).size(0)
        if verbose:
            print("Total data pairs: %s, state dim %s, action dim %s" % \
                (colored(self.real_state_tensor.size(0), p_color),
                 colored(self.real_state_tensor.size(1), p_color),
                 colored(self.real_action_tensor.size(1), p_color)))
        return m_return_list

    def compute_grad_pen(self, expert_data, policy_data, lambda_=10, network=None):
        # Subsample the larger batch so both batches have the same size.
        if expert_data.size(0) != policy_data.size(0):
            if expert_data.size(0) < policy_data.size(0):
                idx = np.random.permutation(policy_data.size(0))[:expert_data.size(0)]
                policy_data = policy_data[idx, :]
            else:
                idx = np.random.permutation(expert_data.size(0))[:policy_data.size(0)]
                expert_data = expert_data[idx, :]

        # # DRAGAN
        # alpha = torch.rand(expert_data.size()).to(expert_data.device)
        # mixup_data = alpha * expert_data + ((1 - alpha) * (expert_data + 0.5 *
        #     expert_data.std() * torch.rand(expert_data.size()).to(expert_data.device)))

        alpha = torch.rand(expert_data.size(0), 1)
        alpha = alpha.expand_as(expert_data).to(expert_data.device)
        mixup_data = alpha * expert_data + (1 - alpha) * policy_data
        mixup_data.requires_grad = True

        if network is None:
            network = self.trunk
        disc = network(mixup_data)
        ones = torch.ones(disc.size()).to(disc.device)
        grad = autograd.grad(outputs=disc,
                             inputs=mixup_data,
                             grad_outputs=ones,
                             create_graph=True,
                             retain_graph=True,
                             only_inputs=True)[0]

        grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean()
        return grad_pen

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.trunk.eval()
            d = self.trunk.reward(sa_cat(state, action))
            if self.ail_saturate == 1:
                reward = self.adversarial_loss.reward(
                    d * self.label_policy, reduction=False)  # saturate (positive)
            elif self.ail_saturate == -1:
                reward = -self.adversarial_loss.reward(
                    d * self.label_expert, reduction=False)  # non-saturate (negative)
            elif self.ail_saturate == 0:
                reward = d
            if self.returns is None:
                self.returns = reward.clone()
            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            if self.reward_std:
                return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
            else:
                return reward

    def update(self, rollouts, obsfilt=None):
        self.trunk.train()
        rollouts_size = rollouts.get_batch_size()
        policy_mini_batch_size = self.gail_batch_size if rollouts_size > self.gail_batch_size \
            else rollouts_size
        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=policy_mini_batch_size)
        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(self.expert_loader,
                                              policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            expert_state, expert_action = expert_batch[0], expert_batch[1]
            # Normalize the expert data with the current policy statistics so
            # that expert and policy data share the same normalization.
            if obsfilt is not None:
                expert_state = obsfilt(expert_state.numpy(), update=False)
                expert_state = torch.FloatTensor(expert_state)
            expert_state = expert_state.to(self.device)
            expert_action = expert_action.to(self.device)
            policy_d = self.trunk(sa_cat(policy_state, policy_action))
            expert_d = self.trunk(sa_cat(expert_state, expert_action))
            grad_pen = self.compute_grad_pen(
                sa_cat(expert_state, expert_action),
                sa_cat(policy_state, policy_action), self.gp_lambda)
            policy_loss = self.adversarial_loss(policy_d * self.label_policy)
            expert_loss = self.adversarial_loss(expert_d * self.label_expert)
            gail_loss = expert_loss + policy_loss
            loss += (gail_loss + grad_pen).item()
            n += 1
            self.optimizer.zero_grad()
            (gail_loss + grad_pen).backward()
            self.optimizer.step()
        return loss / n
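# AIL trains its discriminator with margin-style labels (+1 expert, -1 policy)
# rather than 0/1 targets. With a logistic loss l(z) = log(1 + exp(-z)), the
# objective l(d_expert * 1) + l(d_policy * -1) recovers the usual GAIL binary
# cross-entropy on the logit d. The stand-in logistic_loss below is a sketch
# to make this convention concrete; the repository's own Logistic_Loss class
# may differ in reduction details.
import torch
import torch.nn.functional as F

def logistic_loss(z):
    # log(1 + exp(-z)), numerically stable via softplus
    return F.softplus(-z).mean()

d_expert = torch.tensor([2.0, 1.5])   # discriminator outputs on expert pairs
d_policy = torch.tensor([-1.0, 0.5])  # discriminator outputs on policy pairs

loss = logistic_loss(d_expert * 1) + logistic_loss(d_policy * -1)
bce = F.binary_cross_entropy_with_logits(d_expert, torch.ones(2)) + \
      F.binary_cross_entropy_with_logits(d_policy, torch.zeros(2))
print(loss.item(), bce.item())  # identical up to floating point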
class DRIL:
    def __init__(self, device=None, envs=None, ensemble_policy=None, env_name=None,
                 expert_dataset=None, ensemble_size=None,
                 ensemble_quantile_threshold=None, dril_bc_model=None,
                 dril_cost_clip=None, num_dril_bc_train_epoch=None,
                 training_data_split=None):
        self.ensemble_quantile_threshold = ensemble_quantile_threshold
        self.dril_cost_clip = dril_cost_clip
        self.device = device
        self.num_dril_bc_train_epoch = num_dril_bc_train_epoch
        self.env_name = env_name
        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())
        self.observation_space = envs.observation_space
        if envs.action_space.__class__.__name__ == "Discrete":
            self.num_actions = envs.action_space.n
        elif envs.action_space.__class__.__name__ == "Box":
            self.num_actions = envs.action_space.shape[0]
        elif envs.action_space.__class__.__name__ == "MultiBinary":
            self.num_actions = envs.action_space.shape[0]
        self.ensemble_size = ensemble_size
        # use the full data since we don't use a validation set
        self.trdata = expert_dataset.load_demo_data(1.0, 1, self.ensemble_size)['trdata']
        self.ensemble = ensemble_policy
        self.bc = dril_bc_model
        self.bc.num_batches = num_dril_bc_train_epoch
        self.clip_variance = self.policy_variance(envs=envs)

    def policy_variance(self, envs=None):
        q = self.ensemble_quantile_threshold
        variance = defaultdict(lambda: [])
        for batch_idx, batch in enumerate(self.trdata):
            (state, action) = batch
            action = action.float().to(self.device)
            # Image observations
            if len(self.observation_space.shape) == 3:
                state = state.repeat(self.ensemble_size, 1, 1, 1).float().to(self.device)
            # Feature observations
            else:
                state = state.repeat(self.ensemble_size, 1).float().to(self.device)
            if isinstance(envs.action_space, gym.spaces.discrete.Discrete):
                # Note: this is just a placeholder
                action_idx = int(action.item())
                one_hot_action = torch.FloatTensor(np.eye(self.num_actions)[int(action.item())])
                action = one_hot_action
            elif envs.action_space.__class__.__name__ == "MultiBinary":
                # create a unique id for each combination
                action_idx = int("".join(str(int(x)) for x in action[0].tolist()), 2)
            else:
                action_idx = 0
            with torch.no_grad():
                ensemble_action = self.ensemble(state).squeeze()
            if isinstance(envs.action_space, gym.spaces.Box):
                action = torch.clamp(action, envs.action_space.low[0],
                                     envs.action_space.high[0])
                ensemble_action = torch.clamp(ensemble_action,
                                              envs.action_space.low[0],
                                              envs.action_space.high[0])
            cov = np.cov(ensemble_action.T.cpu().numpy())
            action = action.cpu().numpy()
            # If the env has only one action then we need to reshape cov
            if envs.action_space.__class__.__name__ == "Box":
                if envs.action_space.shape[0] == 1:
                    cov = cov.reshape(-1, 1)
            # variance.append(np.matmul(np.matmul(action, cov), action.T).item())
            if isinstance(envs.action_space, gym.spaces.discrete.Discrete):
                for action_idx in range(envs.action_space.n):
                    one_hot_action = torch.FloatTensor(np.eye(self.num_actions)[action_idx])
                    variance[action_idx].append(
                        np.matmul(np.matmul(one_hot_action, cov), one_hot_action.T).item())
            else:
                variance[action_idx].append(
                    np.matmul(np.matmul(action, cov), action.T).item())
        quantiles = {
            key: np.quantile(np.array(variance[key]), q)
            for key in list(variance.keys())
        }
        # Bind each key's quantile as a default argument; a plain closure over
        # `key` would capture the loop variable late, so every lambda would end
        # up using the last key's threshold.
        if self.dril_cost_clip == '-1_to_1':
            return {
                key: (lambda x, t=quantiles[key]: -1 if x > t else 1)
                for key in list(variance.keys())
            }
        elif self.dril_cost_clip == 'no_clipping':
            return {key: (lambda x: x) for key in list(variance.keys())}
        elif self.dril_cost_clip == '-1_to_0':
            return {
                key: (lambda x, t=quantiles[key]: -1 if x > t else 0)
                for key in list(variance.keys())
            }

    def predict_reward(self, actions, states, envs):
        rewards = []
        for idx in range(actions.shape[0]):
            # Image observations
            if len(self.observation_space.shape) == 3:
                state = states[[idx]].repeat(self.ensemble_size, 1, 1, 1).float().to(self.device)
            # Feature observations
            else:
                state = states[[idx]].repeat(self.ensemble_size, 1).float().to(self.device)
            if isinstance(envs.action_space, gym.spaces.discrete.Discrete):
                one_hot_action = torch.FloatTensor(
                    np.eye(self.num_actions)[int(actions[idx].item())])
                action = one_hot_action
                action_idx = int(actions[idx].item())
            elif isinstance(envs.action_space, gym.spaces.Box):
                action = actions[[idx]]
                action_idx = 0
            elif isinstance(envs.action_space, gym.spaces.MultiBinary):
                raise Exception("Environment shouldn't be MultiBinary")
            else:
                raise Exception("Unknown action space")
            with torch.no_grad():
                ensemble_action = self.ensemble(state).squeeze().detach()
            if isinstance(envs.action_space, gym.spaces.Box):
                action = torch.clamp(action, envs.action_space.low[0],
                                     envs.action_space.high[0])
                ensemble_action = torch.clamp(ensemble_action,
                                              envs.action_space.low[0],
                                              envs.action_space.high[0])
            cov = np.cov(ensemble_action.T.cpu().numpy())
            action = action.cpu().numpy()
            # If the env has only one action then we need to reshape cov
            if envs.action_space.__class__.__name__ == "Box":
                if envs.action_space.shape[0] == 1:
                    cov = cov.reshape(-1, 1)
            ensemble_variance = np.matmul(np.matmul(action, cov), action.T).item()
            if action_idx in self.clip_variance:
                reward = self.clip_variance[action_idx](ensemble_variance)
            else:
                reward = -1
            rewards.append(reward)
        return torch.FloatTensor(np.array(rewards)[np.newaxis].T)

    def normalize_reward(self, state, action, gamma, masks, reward, update_rms=True):
        if self.returns is None:
            self.returns = reward.clone()
        if update_rms:
            self.returns = self.returns * masks * gamma + reward
            self.ret_rms.update(self.returns.cpu().numpy())
        return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)

    def bc_update(self):
        for dril_epoch in range(self.num_dril_bc_train_epoch):
            dril_train_loss = self.bc.update(update=True, data_loader_type='train')
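# Numeric sketch of DRIL's uncertainty cost: the covariance of the ensemble's
# action predictions defines a quadratic form a^T C a, which is large when the
# ensemble disagrees along the direction of the taken action. The sizes and the
# threshold here are illustrative stand-ins for the per-action quantiles
# computed in policy_variance() above.
import numpy as np

rng = np.random.default_rng(0)
ensemble_actions = rng.normal(size=(5, 2))   # 5 ensemble members, 2-dim action
action = np.array([1.0, 0.5])

cov = np.cov(ensemble_actions.T)             # 2x2 action covariance
ensemble_variance = action @ cov @ action.T  # scalar disagreement score

# Clipped cost as in the '-1_to_1' mode: -1 above the threshold, +1 below.
threshold = 0.4
reward = -1 if ensemble_variance > threshold else 1
print(ensemble_variance, reward)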
class MLPBase(NNBase):
    def __init__(self, num_inputs, input_size, action_space, hidden_size=64,
                 recurrent=False, device='cpu'):
        super(MLPBase, self).__init__(recurrent, num_inputs, hidden_size)
        self.device = device
        if recurrent:
            num_inputs = hidden_size

        init__ = lambda m: init(m, nn.init.orthogonal_,
                                lambda x: nn.init.constant_(x, 0), np.sqrt(2))

        self.trunk = nn.Sequential(
            init__(nn.Linear(num_inputs + action_space.shape[0], hidden_size)),
            nn.Tanh(), init__(nn.Linear(hidden_size, hidden_size)), nn.Tanh(),
            init__(nn.Linear(hidden_size, 1)))

        self.optimizer = torch.optim.Adam(self.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())
        self.train()

    def compute_grad_pen(self, expert_state, expert_action, policy_state,
                         policy_action, lambda_=10):
        alpha = torch.rand(expert_state.size(0), 1)
        expert_data = torch.cat([expert_state, expert_action], dim=1)
        policy_data = torch.cat([policy_state, policy_action], dim=1)

        alpha = alpha.expand_as(expert_data).to(expert_data.device)

        mixup_data = alpha * expert_data + (1 - alpha) * policy_data
        mixup_data.requires_grad = True

        disc = self.trunk(mixup_data)
        ones = torch.ones(disc.size()).to(disc.device)
        grad = autograd.grad(outputs=disc,
                             inputs=mixup_data,
                             grad_outputs=ones,
                             create_graph=True,
                             retain_graph=True,
                             only_inputs=True)[0]

        grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean()
        return grad_pen

    def update(self, expert_loader, rollouts, obsfilt=None):
        self.train()
        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader, policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            policy_d = self.trunk(torch.cat([policy_state, policy_action], dim=1))

            expert_state, expert_action = expert_batch
            expert_state = obsfilt(expert_state.numpy(), update=False)
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_d = self.trunk(torch.cat([expert_state, expert_action], dim=1))

            # expert_loss = F.binary_cross_entropy_with_logits(
            #     expert_d, torch.ones(expert_d.size()).to(self.device))
            # policy_loss = F.binary_cross_entropy_with_logits(
            #     policy_d, torch.zeros(policy_d.size()).to(self.device))
            expert_loss = -expert_d.mean()
            policy_loss = policy_d.mean()

            gail_loss = expert_loss + policy_loss
            grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                             policy_state, policy_action)

            loss += (gail_loss + grad_pen).item()
            n += 1

            self.optimizer.zero_grad()
            (gail_loss + grad_pen).backward()
            self.optimizer.step()
        return loss / n

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            # 0 for expert-like states, goes to -inf for non-expert-like states;
            # compatible with envs with traj cutoffs for good (expert-like) behavior,
            # e.g. mountain car, which gets cut off when the car reaches the destination
            # s = torch.sigmoid(d)
            # 0 for non-expert-like states, goes to +inf for expert-like states;
            # compatible with envs with traj cutoffs for bad (non-expert-like) behavior,
            # e.g. walking simulations that get cut off when the robot falls over
            # s = -(1. - torch.sigmoid(d))
            # reward = s.log() - (1 - s).log()
            reward = d
            if self.returns is None:
                self.returns = reward.clone()
            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
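# A minimal, self-contained sketch of the interpolated gradient penalty used in
# compute_grad_pen() above: mix expert and policy batches with a per-sample
# alpha, then penalize the squared deviation of the critic's gradient norm
# from 1 (the WGAN-GP recipe). The tiny critic here is a stand-in for
# self.trunk; sizes are illustrative.
import torch
import torch.nn as nn
from torch import autograd

torch.manual_seed(0)
critic = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 1))
expert = torch.randn(8, 4)
policy = torch.randn(8, 4)

alpha = torch.rand(expert.size(0), 1).expand_as(expert)
mixup = (alpha * expert + (1 - alpha) * policy).requires_grad_(True)
disc = critic(mixup)
grad = autograd.grad(outputs=disc, inputs=mixup,
                     grad_outputs=torch.ones_like(disc),
                     create_graph=True)[0]
grad_pen = 10.0 * (grad.norm(2, dim=1) - 1).pow(2).mean()
print(grad_pen.item())  # scalar penalty added to the discriminator loss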
class PpoOptimizer(object):
    envs = None

    def __init__(self, *, hps, scope, ob_space, ac_space, stochpol, ent_coef,
                 gamma, gamma_ext, lam, nepochs, lr, cliprange, nminibatches,
                 normrew, normadv, use_news, ext_coeff, int_coeff,
                 nsteps_per_seg, nsegs_per_env, dynamics):
        self.dynamics = dynamics
        with tf.variable_scope(scope):
            self.hps = hps
            self.use_recorder = True
            self.n_updates = 0
            self.scope = scope
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.stochpol = stochpol
            self.nepochs = nepochs
            self.lr = lr
            self.cliprange = cliprange
            self.nsteps_per_seg = nsteps_per_seg
            self.nsegs_per_env = nsegs_per_env
            self.nminibatches = nminibatches
            self.gamma = gamma
            self.gamma_ext = gamma_ext
            self.lam = lam
            self.normrew = normrew
            self.normadv = normadv
            self.use_news = use_news
            self.ext_coeff = ext_coeff
            self.int_coeff = int_coeff
            self.ph_adv = tf.placeholder(tf.float32, [None, None])
            self.ph_ret_int = tf.placeholder(tf.float32, [None, None])
            self.ph_ret_ext = tf.placeholder(tf.float32, [None, None])
            self.ph_ret = tf.placeholder(tf.float32, [None, None])
            self.ph_rews = tf.placeholder(tf.float32, [None, None])
            self.ph_oldnlp = tf.placeholder(tf.float32, [None, None])
            self.ph_oldvpred = tf.placeholder(tf.float32, [None, None])
            self.ph_lr = tf.placeholder(tf.float32, [])
            self.ph_cliprange = tf.placeholder(tf.float32, [])
            neglogpac = self.stochpol.pd.neglogp(self.stochpol.ph_ac)
            entropy = tf.reduce_mean(self.stochpol.pd.entropy())
            vpred = self.stochpol.vpred

            if hps['num_vf'] == 2:
                # Separate value-function losses for intrinsic and extrinsic rewards
                vf_loss_int = 0.5 * tf.reduce_mean(
                    tf.square(self.stochpol.vpred_int - self.ph_ret_int))
                vf_loss_ext = 0.5 * tf.reduce_mean(
                    tf.square(self.stochpol.vpred_ext - self.ph_ret_ext))
                vf_loss = vf_loss_int + vf_loss_ext
            else:
                vf_loss = 0.5 * tf.reduce_mean((vpred - self.ph_ret) ** 2)

            ratio = tf.exp(self.ph_oldnlp - neglogpac)  # p_new / p_old
            negadv = -self.ph_adv
            pg_losses1 = negadv * ratio
            pg_losses2 = negadv * tf.clip_by_value(ratio, 1.0 - self.ph_cliprange,
                                                   1.0 + self.ph_cliprange)
            pg_loss_surr = tf.maximum(pg_losses1, pg_losses2)
            pg_loss = tf.reduce_mean(pg_loss_surr)
            ent_loss = (-ent_coef) * entropy

            approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - self.ph_oldnlp))
            clipfrac = tf.reduce_mean(
                tf.to_float(tf.abs(pg_losses2 - pg_loss_surr) > 1e-6))

            self.total_loss = pg_loss + ent_loss + vf_loss
            self.to_report = {'tot': self.total_loss, 'pg': pg_loss,
                              'vf': vf_loss, 'ent': entropy,
                              'approxkl': approxkl, 'clipfrac': clipfrac}

    def start_interaction(self, env_fns, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        if MPI.COMM_WORLD.Get_size() > 1:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        self._train = trainer.apply_gradients(gradsandvars)

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(tf.variables_initializer(
                tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(getsess(),
                                tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.ob_space, self.ac_space])
            for l in range(self.nlump)
        ]

        self.rollout = Rollout(hps=self.hps, ob_space=self.ob_space,
                               ac_space=self.ac_space, nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump, envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_advs_int = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_advs_ext = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets_int = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets_ext = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()

    def stop_interaction(self):
        for env in self.envs:
            env.close()

    def update(self):
        # Reward normalization
        # if self.normrew:
        #     rffs = np.array([self.rff.update(rew) for rew in self.rollout.buf_rews.T])
        #     rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
        #     self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
        #     rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)

        # Intrinsic reward normalization
        if self.normrew:
            rffs_int = np.array([self.rff.update(rew)
                                 for rew in self.rollout.buf_int_rews.T])
            self.rff_rms.update(rffs_int.ravel())
            int_rews = self.rollout.buf_int_rews / np.sqrt(self.rff_rms.var)
        else:
            int_rews = np.copy(self.rollout.buf_int_rews)
        mean_int_rew = np.mean(int_rews)
        max_int_rew = np.max(int_rews)

        # Do not normalize extrinsic rewards
        ext_rews = self.rollout.buf_ext_rews

        nsteps = self.rollout.nsteps

        # If separate value functions are used
        if self.hps['num_vf'] == 2:
            # Calculate intrinsic returns and advantages.
            lastgaelam = 0
            for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
                if self.use_news:
                    nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
                else:
                    nextnew = 0  # no dones for intrinsic rewards when self.use_news=False
                nextvals = self.rollout.buf_vpreds_int[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_int_last
                nextnotnew = 1 - nextnew
                delta = int_rews[:, t] + self.gamma * nextvals * nextnotnew - self.rollout.buf_vpreds_int[:, t]
                self.buf_advs_int[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam
            self.buf_rets_int[:] = self.buf_advs_int + self.rollout.buf_vpreds_int

            # Calculate extrinsic returns and advantages.
            lastgaelam = 0
            for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
                nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
                nextvals = self.rollout.buf_vpreds_ext[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_ext_last
                nextnotnew = 1 - nextnew
                delta = ext_rews[:, t] + self.gamma_ext * nextvals * nextnotnew - self.rollout.buf_vpreds_ext[:, t]
                self.buf_advs_ext[:, t] = lastgaelam = delta + self.gamma_ext * self.lam * nextnotnew * lastgaelam
            self.buf_rets_ext[:] = self.buf_advs_ext + self.rollout.buf_vpreds_ext

            # Combine the extrinsic and intrinsic advantages.
            self.buf_advs = self.int_coeff * self.buf_advs_int + self.ext_coeff * self.buf_advs_ext
        else:
            # Calculate mixed intrinsic and extrinsic returns and advantages.
            rews = self.rollout.buf_rews = self.rollout.reward_fun(int_rew=int_rews, ext_rew=ext_rews)
            lastgaelam = 0
            for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
                nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
                nextvals = self.rollout.buf_vpreds[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_last
                nextnotnew = 1 - nextnew
                delta = rews[:, t] + self.gamma * nextvals * nextnotnew - self.rollout.buf_vpreds[:, t]
                self.buf_advs[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam
            self.buf_rets[:] = self.buf_advs + self.rollout.buf_vpreds

        info = dict(
            # advmean=self.buf_advs.mean(),
            # advstd=self.buf_advs.std(),
            recent_best_ext_ret=self.rollout.current_max,
            recent_best_eplen=self.rollout.current_minlen,
            recent_worst_eplen=self.rollout.current_maxlen,
        )
        if self.hps['num_vf'] == 2:
            info['retmean_int'] = self.buf_rets_int.mean()
            info['retmean_ext'] = self.buf_rets_ext.mean()
            info['retstd_int'] = self.buf_rets_int.std()
            info['retstd_ext'] = self.buf_rets_ext.std()
            info['vpredmean_int'] = self.rollout.buf_vpreds_int.mean()
            info['vpredmean_ext'] = self.rollout.buf_vpreds_ext.mean()
            info['vpredstd_int'] = self.rollout.buf_vpreds_int.std()
            info['vpredstd_ext'] = self.rollout.buf_vpreds_ext.std()
            info['ev_int'] = explained_variance(self.rollout.buf_vpreds_int.ravel(), self.buf_rets_int.ravel())
            info['ev_ext'] = explained_variance(self.rollout.buf_vpreds_ext.ravel(), self.buf_rets_ext.ravel())
            info['rew_int_mean'] = mean_int_rew
            info['recent_best_int_rew'] = max_int_rew
        else:
            # info['retmean'] = self.buf_rets.mean()
            # info['retstd'] = self.buf_rets.std()
            # info['vpredmean'] = self.rollout.buf_vpreds.mean()
            # info['vpredstd'] = self.rollout.buf_vpreds.std()
            info['rew_mean'] = np.mean(self.rollout.buf_rews)
            info['eplen_std'] = np.std(self.rollout.statlists['eplen'])
            info['eprew_std'] = np.std(self.rollout.statlists['eprew'])
            # info['ev'] = explained_variance(self.rollout.buf_vpreds.ravel(), self.buf_rets.ravel())

        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret
            info['best_eplen'] = self.rollout.best_eplen

        # normalize advantages
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)

        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        def resh(x):
            if self.nsegs_per_env == 1:
                return x
            sh = x.shape
            return x.reshape((sh[0] * self.nsegs_per_env, self.nsteps_per_seg) + sh[2:])

        # Create feed_dict for optimization.
        ph_buf = [
            (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
            (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
            (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),
            (self.ph_adv, resh(self.buf_advs)),
        ]
        if self.hps['num_vf'] == 2:
            ph_buf.extend([
                (self.ph_ret_int, resh(self.buf_rets_int)),
                (self.ph_ret_ext, resh(self.buf_rets_ext)),
            ])
        else:
            ph_buf.extend([
                (self.ph_rews, resh(self.rollout.buf_rews)),
                (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
                (self.ph_ret, resh(self.buf_rets)),
            ])
        ph_buf.extend([
            (self.dynamics.last_ob,
             self.rollout.buf_obs_last.reshape([self.nenvs * self.nsegs_per_env, 1, *self.ob_space.shape]))
        ])

        # Optimize on the current data for several epochs.
        mblossvals = []
        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
                fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})
                mblossvals.append(getsess().run(self._losses + (self._train,), fd)[:-1])

        # Only the first minibatch's losses are retained for logging.
        mblossvals = [mblossvals[0]]
        # info.update(zip(['opt_' + ln for ln in self.loss_names], np.mean([mblossvals[0]], axis=0)))
        # info["rank"] = MPI.COMM_WORLD.Get_rank()
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({dn: (np.mean(dvs) if len(dvs) > 0 else 0)
                     for (dn, dvs) in self.rollout.statlists.items()})
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        # info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        # info['tps'] = MPI.COMM_WORLD.Get_size() * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
        self.t_last_update = tnow
        return info

    def step(self):
        self.rollout.collect_rollout()
        update_info = self.update()
        return {'update': update_info}

    def get_var_values(self):
        return self.stochpol.get_var_values()

    def set_var_values(self, vv):
        self.stochpol.set_var_values(vv)
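# Stand-alone numpy version of the backward GAE recursion used in update():
# delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t), accumulated
# as adv_t = delta_t + gamma * lam * (1 - done_{t+1}) * adv_{t+1}. Buffers are
# (nenvs, nsteps) as above; the numbers are illustrative only.
import numpy as np

gamma, lam = 0.99, 0.95
rews = np.array([[1.0, 0.0, 1.0]])   # (nenvs=1, nsteps=3)
vpreds = np.array([[0.5, 0.4, 0.6]])
news = np.array([[0, 0, 1]])         # done flags
vpred_last = np.array([0.3])         # bootstrap value after the segment
new_last = np.array([0])

nsteps = rews.shape[1]
advs = np.zeros_like(rews)
lastgaelam = 0
for t in range(nsteps - 1, -1, -1):
    nextnew = news[:, t + 1] if t + 1 < nsteps else new_last
    nextvals = vpreds[:, t + 1] if t + 1 < nsteps else vpred_last
    nextnotnew = 1 - nextnew
    delta = rews[:, t] + gamma * nextvals * nextnotnew - vpreds[:, t]
    advs[:, t] = lastgaelam = delta + gamma * lam * nextnotnew * lastgaelam
rets = advs + vpreds
print(advs, rets)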
class CNNBase(NNBase):
    def __init__(self, num_inputs, input_size, action_space, hidden_size=512,
                 recurrent=False, device='cpu'):
        super(CNNBase, self).__init__(recurrent, num_inputs, hidden_size)
        self.device = device
        self.action_space = action_space
        h, w = input_size

        self.conv1 = nn.Conv2d(num_inputs, 32, kernel_size=8, stride=4)
        w_out = conv2d_size_out(w, kernel_size=8, stride=4)
        h_out = conv2d_size_out(h, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        w_out = conv2d_size_out(w_out, kernel_size=4, stride=2)
        h_out = conv2d_size_out(h_out, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, kernel_size=3, stride=1)
        w_out = conv2d_size_out(w_out, kernel_size=3, stride=1)
        h_out = conv2d_size_out(h_out, kernel_size=3, stride=1)

        init_cnn_ = lambda m: init(m, nn.init.orthogonal_,
                                   lambda x: nn.init.constant_(x, 0),
                                   nn.init.calculate_gain('relu'))
        self.cnn_trunk = nn.Sequential(
            init_cnn_(self.conv1), nn.ReLU(), init_cnn_(self.conv2), nn.ReLU(),
            init_cnn_(self.conv3), nn.ReLU(), Flatten(),
            init_cnn_(nn.Linear(32 * h_out * w_out, hidden_size)), nn.ReLU())

        init__ = lambda m: init(m, nn.init.orthogonal_,
                                lambda x: nn.init.constant_(x, 0), np.sqrt(2))
        self.trunk = nn.Sequential(
            init__(nn.Linear(hidden_size + self.action_space.n, hidden_size // 2)),
            nn.Tanh(), init__(nn.Linear(hidden_size // 2, hidden_size // 2)),
            nn.Tanh(), init__(nn.Linear(hidden_size // 2, 1)))

        self.optimizer = torch.optim.Adam(self.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

    def update(self, expert_loader, rollouts, obsfilt=None):
        self.train()
        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader, policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            policy_state_embedding = self.cnn_trunk(policy_state / 255.0)
            policy_d = self.trunk(
                torch.cat([
                    policy_state_embedding,
                    torch.nn.functional.one_hot(
                        policy_action, self.action_space.n).squeeze(1).float()
                ], dim=1))

            expert_state, expert_action = expert_batch
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            # expert_action is assumed to be stored one-hot, so its width matches
            # the one-hot encoded policy action above.
            expert_action = expert_action.to(self.device)
            expert_state_embedding = self.cnn_trunk(expert_state / 255.0)
            expert_d = self.trunk(
                torch.cat([expert_state_embedding, expert_action], dim=1))

            expert_loss = F.binary_cross_entropy_with_logits(
                expert_d, torch.ones(expert_d.size()).to(self.device))
            policy_loss = F.binary_cross_entropy_with_logits(
                policy_d, torch.zeros(policy_d.size()).to(self.device))

            gail_loss = expert_loss + policy_loss

            loss += gail_loss.item()
            n += 1

            self.optimizer.zero_grad()
            gail_loss.backward()
            self.optimizer.step()
        return loss / n

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            state_embedding = self.cnn_trunk(state / 255.)
            d = self.trunk(
                torch.cat([
                    state_embedding,
                    torch.nn.functional.one_hot(
                        action, self.action_space.n).squeeze(1).float()
                ], dim=1))
            s = torch.sigmoid(d)
            reward = s.log() - (1 - s).log()
            if self.returns is None:
                self.returns = reward.clone()
            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
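# Shape check for the discriminator input built in CNNBase.update(): with
# hidden_size=512 and a 6-action Discrete space, the CNN embedding and the
# one-hot action concatenate to width 512 + 6 = 518, matching the first trunk
# layer's input size. Numbers are illustrative.
import torch

hidden_size, n_actions, batch = 512, 6, 4
state_embedding = torch.randn(batch, hidden_size)
actions = torch.randint(n_actions, (batch, 1))  # column of action indices
one_hot = torch.nn.functional.one_hot(actions, n_actions).squeeze(1).float()
disc_input = torch.cat([state_embedding, one_hot], dim=1)
print(disc_input.shape)  # torch.Size([4, 518])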
class VecNormalize(ABC):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
                 gamma=0.99, epsilon=1e-8):
        self.venv = venv
        self.num_envs = venv.num_envs
        self.observation_space = venv.observation_space
        self.action_space = venv.action_space
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.closed = False  # referenced by close() below
        self.viewer = None

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)

    def step_async(self, actions):
        self.venv.step_async(actions)

    def close_extras(self):
        # hook for subclasses to clean up extra resources
        pass

    def close(self):
        if self.closed:
            return
        if self.viewer is not None:
            self.viewer.close()
        self.close_extras()
        self.closed = True

    def step(self, actions):
        """
        Step the environments synchronously.

        This is available for backwards compatibility.
        """
        self.step_async(actions)
        return self.step_wait()
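# The RunningMeanStd used throughout keeps running moments and folds in each
# batch with the parallel-variance (Chan et al.) update; a minimal numpy
# re-derivation of that update, under the assumption that it matches the
# baselines implementation:
import numpy as np

def update_moments(mean, var, count, batch):
    b_mean, b_var, b_count = batch.mean(axis=0), batch.var(axis=0), batch.shape[0]
    delta = b_mean - mean
    tot = count + b_count
    new_mean = mean + delta * b_count / tot
    m_a = var * count
    m_b = b_var * b_count
    m2 = m_a + m_b + delta ** 2 * count * b_count / tot
    return new_mean, m2 / tot, tot

mean, var, count = np.zeros(3), np.ones(3), 1e-4
for _ in range(10):
    mean, var, count = update_moments(mean, var, count, np.random.randn(32, 3))
print(mean, var)  # converges toward the stream's moments: ~0 mean, ~1 var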
class Runner(AbstractEnvRunner):
    """
    We use this object to make a mini batch of experiences
    __init__:
    - Initialize the runner

    run():
    - Make a mini batch
    """

    def __init__(self, *, env, model, nsteps, gamma, lam):
        super().__init__(env=env, model=model, nsteps=nsteps)
        # Lambda used in GAE (Generalized Advantage Estimation)
        self.lam = lam
        # Discount rate
        self.gamma = gamma
        self.clipob = 10.
        self.cliprew = 10.
        self.epsilon = 1e-8
        # One running return per environment; self.dones is a vector, so the
        # return accumulator must be a vector as well.
        self.ret = np.zeros(env.num_envs)
        self.ob_rms = RunningMeanStd(shape=self.env.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())

    def obfilt(self, obs):
        obs = np.clip(
            (obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
            -self.clipob, self.clipob)
        return obs

    def rewfilt(self, rews):
        rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                       -self.cliprew, self.cliprew)
        return rews

    def run(self):
        # Here, we init the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs, \
            mb_means, mb_logstds = [], [], [], [], [], [], [], []
        mb_states = self.states
        epinfos = []
        # For n in range number of steps
        for _ in range(self.nsteps):
            # Given observations, get action value and neglogpacs.
            # We already have self.obs because the Runner superclass runs
            # self.obs[:] = env.reset() on init
            actions, values, self.states, neglogpacs = self.model.step(
                self.obfilt(self.obs), S=self.states, M=self.dones)
            means, logstds = self.model.meanlogstd(self.obfilt(self.obs))
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)
            mb_means.append(means)
            mb_logstds.append(logstds)

            # Take actions in env and look at the results.
            # infos contains a ton of useful information
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            self.ob_rms.update(self.obs)
            self.ret = self.ret * self.gamma + rewards
            self.ret_rms.update(self.ret)
            self.ret[self.dones] = 0.
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfos.append(maybeepinfo)
            mb_rewards.append(rewards)
        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_means = np.asarray(mb_means)
        mb_logstds = np.asarray(mb_logstds)
        # mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=bool)
        # last_values = self.model.value(self.obs, S=self.states, M=self.dones)

        # discount/bootstrap off value fn
        # mb_returns = np.zeros_like(mb_rewards)
        # mb_advs = np.zeros_like(mb_rewards)
        # lastgaelam = 0
        # for t in reversed(range(self.nsteps)):
        #     if t == self.nsteps - 1:
        #         nextnonterminal = 1.0 - self.dones
        #         nextvalues = last_values
        #     else:
        #         nextnonterminal = 1.0 - mb_dones[t+1]
        #         nextvalues = mb_values[t+1]
        #     delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
        #     mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        # mb_returns = mb_advs + mb_values
        return (*map(sf01, (mb_obs, mb_rewards, mb_dones, mb_actions,
                            mb_neglogpacs, mb_means, mb_logstds)),
                self.obs, self.dones, epinfos)
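# Runner.run() returns its buffers through sf01, which (in the baselines ppo2
# convention this Runner follows) swaps the time and environment axes and then
# flattens them into one batch dimension; a sketch under that assumption:
import numpy as np

def sf01(arr):
    """Swap and then flatten axes 0 and 1: (nsteps, nenvs, ...) -> (nsteps*nenvs, ...)."""
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

mb_obs = np.zeros((128, 8, 17))  # nsteps=128, nenvs=8, obs_dim=17
print(sf01(mb_obs).shape)        # (1024, 17)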