Example #1
class MeanStdNormalizer(BaseNormalizer):
    def __init__(self, read_only=False, clip=10.0, epsilon=1e-8):
        BaseNormalizer.__init__(self, read_only)
        self.read_only = read_only
        self.rms = None
        self.clip = clip
        self.epsilon = epsilon

    def __call__(self, x):
        from baselines.common.running_mean_std import RunningMeanStd
        x = np.asarray(x)
        if self.rms is None:
            self.rms = RunningMeanStd(shape=(1, ) + x.shape[1:])
        if not self.read_only:
            self.rms.update(x)
        return np.clip(
            (x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon),
            -self.clip, self.clip)

    def state_dict(self):
        return {'mean': self.rms.mean, 'var': self.rms.var}

    def load_state_dict(self, saved):
        self.rms.mean = saved['mean']
        self.rms.var = saved['var']
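All of these examples lean on RunningMeanStd from baselines.common.running_mean_std, which folds each incoming batch into running moments via the parallel-variance update. Below is a minimal NumPy-only sketch of such an accumulator, offered as an illustrative stand-in rather than the actual Baselines class:

import numpy as np

class SimpleRunningMeanStd:
    """Sketch of a running mean/variance tracker (illustrative stand-in for
    baselines.common.running_mean_std.RunningMeanStd)."""
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        # Merge the batch moments into the running moments
        # (parallel-variance / Chan et al. update).
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m2 = (self.var * self.count + batch_var * batch_count
              + delta ** 2 * self.count * batch_count / total)
        self.mean, self.var, self.count = new_mean, m2 / total, total

Normalizing with it, (x - rms.mean) / np.sqrt(rms.var + epsilon) clipped to [-clip, clip], is exactly what MeanStdNormalizer.__call__ above computes.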
Example #2
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8,
                 use_tf=False):
        VecEnvWrapper.__init__(self, venv)
        if use_tf:
            from baselines.common.running_mean_std import TfRunningMeanStd
            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                           scope='ob_rms') if ob else None
            self.ret_rms = TfRunningMeanStd(shape=(),
                                            scope='ret_rms') if ret else None
        else:
            from baselines.common.running_mean_std import RunningMeanStd
            self.ob_rms = RunningMeanStd(
                shape=self.observation_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
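Two details worth noting: rewards are divided by the standard deviation of the discounted return accumulated per environment (self.ret = self.ret * gamma + rews), not of the raw rewards, and no mean is subtracted, so rewards are rescaled but not re-centered. A minimal usage sketch, assuming an older Gym API compatible with OpenAI Baselines' DummyVecEnv (the environment id and number of environments are arbitrary choices):

import gym
import numpy as np
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(4)])
venv = VecNormalize(venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99)

obs = venv.reset()                               # observations come back normalized
actions = np.array([venv.action_space.sample() for _ in range(4)])
obs, rews, dones, infos = venv.step(actions)     # rewards scaled by the return std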
Example #3
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=0):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)

    @property
    def n_active_envs(self):
        return self.venv.n_active_envs

    def set_active_envs(self, active_idx):
        self.venv.set_active_envs(active_idx)
Example #4
class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        # self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.ret_rms = None  # reward shouldn't be normalized without a reset
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        """
        Reset all environments
        """
        obs = self.venv.reset()
        return self._obfilt(obs)
Example #5
class bVecNormalize(VecEnv):
    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnv.__init__(self,
                        observation_space=venv.observation_space,
                        action_space=venv.action_space)
        print('Initializing bullet VecNormalize.')
        self.venv = venv
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(1)   # TODO, self.num_envs
        self.gamma = gamma
        self.epsilon = epsilon


    def step(self, action):
        return self.step_norm(action)

    def step_norm(self, action):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step(action)     # calls the step() defined in each robot
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        return obs, rews, news, infos


    def _obfilt(self, obs):
        if self.ob_rms:
            # TODO: if ret_rms is not defined, treat this as enjoy (evaluation) mode and skip updating
            if self.ret_rms:
                self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
            return obs
        else:
            return obs


    def reset(self):
        obs = self.venv.reset()
        return self._obfilt(obs)

    def set_target(self, target_pos):
        self.venv.set_target(target_pos)


    def get_state(self):
        return self.venv.get_state()
Example #6
class VecNormalize(VecEnvWrapper):
    def __init__(self,
                 venv,
                 norm_obs=True,
                 norm_reward=True,
                 clip_obs=10.,
                 clip_reward=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        """
        A rolling-average, normalizing, vectorized wrapper for the environment base class
        
        :param venv: ([Gym Environment]) the list of environments to vectorize and normalize
        :param norm_obs: (bool) normalize observation
        :param norm_reward: (bool) normalize reward with discounting (r = sum(r_old) * gamma + r_new)
        :param clip_obs: (float) clipping value for normalizing the observation
        :param clip_reward: (float) clipping value for normalizing the reward
        :param gamma: (float) discount factor
        :param epsilon: (float) epsilon value to avoid arithmetic issues
        """
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if norm_obs else None
        self.ret_rms = RunningMeanStd(shape=()) if norm_reward else None
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rewards, dones, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rewards
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rewards = np.clip(
                rewards / np.sqrt(self.ret_rms.var + self.epsilon),
                -self.clip_reward, self.clip_reward)
        return obs, rewards, dones, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clip_obs, self.clip_obs)
            return obs
        else:
            return obs

    def reset(self):
        obs = self.venv.reset()
        return self._obfilt(obs)
Example #7
class VecNormalize(VecEnvWrapper):
    def __init__(self,
                 venv,
                 visual_obs=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.spaces['visual'].shape
        ) if visual_obs else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = True

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs['visual'] = self._obfilt(obs['visual'])
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs, update=True):
        if self.ob_rms:
            if self.training and update:
                self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        obs['visual'] = self._obfilt(obs['visual'])
        return obs

    def train(self):
        self.training = True

    def eval(self):
        self.training = False
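This variant normalizes only the 'visual' entry of a dict observation and adds train()/eval() so the observation statistics can be frozen at evaluation time (ret_rms, by contrast, still updates inside step_wait regardless of the flag). A short hedged sketch; make_visual_vec_env is a hypothetical factory returning a dict-observation vec env:

venv = VecNormalize(make_visual_vec_env())   # make_visual_vec_env is hypothetical
venv.train()   # ob_rms keeps updating while collecting rollouts
venv.eval()    # ob_rms is frozen; observations are still normalized with the stored stats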
Example #8
class VecNormalizeRewards(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """
    def __init__(self,
                 venv,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8,
                 eval=False):
        VecEnvWrapper.__init__(self, venv)
        self.ret_rms = RunningMeanStd(shape=())
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.eval = eval

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        if self.ret_rms:
            if not self.eval:
                self.ret_rms.update(self.ret)
            rews = np.clip((rews - self.ret_rms.mean) /
                           np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def reset(self):
        print("Env resetting!!!!!!!!!!!!!!")
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return obs

    def save(self, loc):
        s = {}
        if self.ret_rms:
            s['ret_rms'] = self.ret_rms

        import pickle
        with open(loc + '.env_stat.pkl', 'wb') as f:
            pickle.dump(s, f)

    def load(self, loc):
        import pickle
        with open(loc + '.env_stat.pkl', 'rb') as f:
            s = pickle.load(f)

        if self.ret_rms:
            self.ret_rms = s['ret_rms']
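Since only the return statistics are persisted, saving and restoring is a plain pickle round trip. A hedged sketch; the checkpoint path and the make_vec_env factory are assumptions:

venv = VecNormalizeRewards(make_vec_env())   # make_vec_env is a hypothetical factory
# ... collect rollouts / train ...
venv.save('checkpoints/run1')                # writes checkpoints/run1.env_stat.pkl
venv.load('checkpoints/run1')                # restores the pickled ret_rms object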
Example #9
def test_runningmeanstd():
    for (x1, x2, x3) in [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
    ]:
        rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])

        x = np.concatenate([x1, x2, x3], axis=0)
        ms1 = [x.mean(axis=0), x.var(axis=0)]
        rms.update(x1)
        rms.update(x2)
        rms.update(x3)
        ms2 = [rms.mean, rms.var]

        assert np.allclose(ms1, ms2)
Example #10
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """
    def __init__(
        self,
        venv,
        ob=False,
        ret=False,
        clipob=10.,
        cliprew=10.,
        gamma=0.99,
        epsilon=1e-8
    ):  # Akhil: add running mean and variance here so the correct mean and var can be passed in when a model is loaded!
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        obs = self.venv.reset()
        return self._obfilt(obs)
Example #11
class MeanStdNormalizer:
    def __init__(self, read_only=False, clip=10.0, epsilon=1e-8):

        self.read_only = read_only
        self.rms = None
        self.clip = clip
        self.epsilon = epsilon

    def __call__(self, x):
        x = np.asarray(x)
        if self.rms is None:
            self.rms = RunningMeanStd(shape=(1, ) + x.shape[1:])
        if not self.read_only:
            self.rms.update(x)
        return np.clip((x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon),
                       -self.clip, self.clip)
Example #12
def test_runningmeanstd():
    """Test RunningMeanStd object"""
    for (x_1, x_2, x_3) in [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))
    ]:
        rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:])

        x_cat = np.concatenate([x_1, x_2, x_3], axis=0)
        moments_1 = [x_cat.mean(axis=0), x_cat.var(axis=0)]
        rms.update(x_1)
        rms.update(x_2)
        rms.update(x_3)
        moments_2 = [rms.mean, rms.var]

        assert np.allclose(moments_1, moments_2)
Example #13
class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """
    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma * (1 - news) + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            tmp = copy.deepcopy(obs)
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
            for i in range(len(tmp)):
                obs[i][-6:] = tmp[i][-6:]
            return obs
        else:
            return obs

    def reset(self):
        """
        Reset all environments
        """
        obs = self.venv.reset()
        return self._obfilt(obs)
Example #14
class Normalize(gym.Wrapper):
    """
    A wrapper that normalizes the observations
    and returns from an environment.
    """
    def __init__(self, env, clip_ob=10, clip_rew=10, epsilon=1e-8, gamma=0.99):
        super().__init__(env)
        self.clip_ob = clip_ob
        self.clip_rew = clip_rew
        self._reset_rew()
        self.gamma = gamma
        self.epsilon = epsilon

        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())

    def step(self, action):
        obs, rew, done, misc = self.env.step(action)
        self.ret = self.ret * self.gamma + rew
        self.ret_rms.update(self.ret)
        rew = np.clip(rew / np.sqrt(self.ret_rms.var + self.epsilon),
                      -self.clip_rew, self.clip_rew)
        if done:
            self._reset_rew()

        obs = self._ob_filter(obs)
        return obs, rew, done, misc

    def reset(self):
        self._reset_rew()
        obs = self.env.reset()
        return self._ob_filter(obs)

    def _ob_filter(self, obs):
        self.ob_rms.update(obs)
        obs = np.clip(
            (obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
            -self.clip_ob, self.clip_ob)
        return obs

    def _reset_rew(self):
        self.ret = np.zeros((1, ), dtype=np.float32)
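This wrapper is the single-environment (gym.Wrapper) counterpart of the vectorized versions above. A minimal usage sketch, assuming the classic 4-tuple Gym step API and that RunningMeanStd is importable in the same module; the environment id is an arbitrary choice:

import gym

env = Normalize(gym.make('Pendulum-v0'))
obs = env.reset()
for _ in range(100):
    obs, rew, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()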
Example #15
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
Example #16
class Discriminator(nn.Module):
    def __init__(self, state_dim, action_dim, user_dim, device, lr):
        super(Discriminator, self).__init__()

        self.device = device

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

        self.label_embedding = nn.Embedding(10, 10)
        self.prefc1 = nn.Linear(user_dim, 25)

        self.linear = nn.Linear(state_dim * 6 + action_dim, 81)
        self.relu = nn.LeakyReLU(0.2, inplace=True)
        self.conv1 = nn.Conv2d(1, 2, 3)
        self.pool = nn.MaxPool2d(2, 1)
        self.conv2 = nn.Conv2d(2, 20, 3)
        self.conv2_bn = nn.BatchNorm2d(20)
        self.fc1 = nn.Linear(180, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 1)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)

    def forward(self, state, user, label):
        label = label.view(label.size(0)).long()
        user = self.prefc1(user).view(user.size(0), -1)
        x = state.view(state.size(0), -1)
        x = torch.cat((state, user), dim=1).view(state.size(0), -1)
        x = torch.cat((x.view(x.size(0), -1), self.label_embedding(label)),
                      dim=1)
        x = self.relu(self.linear(x))
        x = x.view(x.size(0), 1, 9, 9)
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2_bn(self.conv2(x))))
        x = x.view(-1, 180)
        x = F.leaky_relu(self.fc1(x), 0.2)
        x = F.leaky_relu(self.fc2(x), 0.2)
        x = self.fc3(x)
        return torch.sigmoid(x)

    def update(self, expert_loader, rollouts):
        self.train()

        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader,
                                              policy_data_generator):
            policy_state, policy_user, policy_action = policy_batch[
                0], policy_batch[1], policy_batch[2]
            policy_d = self.forward(policy_state, policy_user,
                                    policy_action.float())

            expert_state, expert_user, expert_action = expert_batch
            expert_state = expert_state.float().to(self.device)
            expert_user = expert_user.view(
                (expert_user.shape[0], -1)).float().to(self.device)
            expert_action = expert_action.view(
                (expert_state.shape[0], -1)).float().to(self.device)
            expert_d = self.forward(expert_state, expert_user, expert_action)

            expert_loss = F.binary_cross_entropy_with_logits(
                expert_d,
                torch.ones(expert_d.size()).to(self.device))
            policy_loss = F.binary_cross_entropy_with_logits(
                policy_d,
                torch.zeros(policy_d.size()).to(self.device))

            gail_loss = expert_loss + policy_loss

            loss += gail_loss.item()
            n += 1

            self.optimizer.zero_grad()
            gail_loss.backward()
            self.optimizer.step()
        return loss / n

    def predict_reward(self, state, user, action, gamma, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.forward(state, user, action.float())
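            # NOTE: forward() above already ends in torch.sigmoid(), so the
            # sigmoid below is applied to probabilities rather than logits
            # (the *_with_logits losses in update() receive probabilities as
            # well); this may be unintended in the original code.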
            s = torch.sigmoid(d)
            reward = s.log() - (1 - s).log()
            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())

            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
Example #17
class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8,
                 use_tf=False):
        VecEnvWrapper.__init__(self, venv)
        if use_tf:
            from baselines.common.running_mean_std import TfRunningMeanStd
            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                           scope='ob_rms') if ob else None
            self.ret_rms = TfRunningMeanStd(shape=(),
                                            scope='ret_rms') if ret else None
        else:
            from baselines.common.running_mean_std import RunningMeanStd
            self.ob_rms = RunningMeanStd(
                shape=self.observation_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def step_wait_collisions(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, collisions, infos = self.venv.step_wait_collisions()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, collisions, infos

    def step_wait_runtime(self):
        obs, rews, news, infos = self.venv.step_wait_runtime()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def _obfilt_run(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip(
                self.ob_rms.mean +
                obs * np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob,
                self.clipob)

        return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
Example #18
class Discriminator(nn.Module):
    def __init__(self, input_dim, hidden_dim, device, reward_type, update_rms, cliprew_down=-10.0, cliprew_up=10.0):
        super(Discriminator, self).__init__()
        self.cliprew_down = cliprew_down
        self.cliprew_up = cliprew_up
        self.device = device
        self.reward_type = reward_type
        self.update_rms = update_rms

        # self.trunk = nn.Sequential(
        #     nn.Linear(input_dim, hidden_dim), nn.Tanh(),
        #     nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
        #     nn.Linear(hidden_dim, 1), nn.Tanh()).to(device)

        self.trunk = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, 1)).to(device)

        self.trunk.train()

        self.optimizer = torch.optim.Adam(self.trunk.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

    def compute_grad_pen(self,
                         expert_state,
                         expert_action,
                         policy_state,
                         policy_action,
                         lambda_=10):
        alpha = torch.rand(expert_state.size(0), 1)
        expert_data = torch.cat([expert_state, expert_action], dim=1)
        policy_data = torch.cat([policy_state, policy_action], dim=1)

        alpha = alpha.expand_as(expert_data).to(expert_data.device)

        mixup_data = alpha * expert_data + (1 - alpha) * policy_data
        mixup_data.requires_grad = True

        disc = self.trunk(mixup_data)
        ones = torch.ones(disc.size()).to(disc.device)
        grad = autograd.grad(
            outputs=disc,
            inputs=mixup_data,
            grad_outputs=ones,
            create_graph=True,
            retain_graph=True,
            only_inputs=True)[0]

        grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean()
        return grad_pen

    def update_zm(self, replay_buf, expert_buf, obsfilt=None, batch_size=128):
        self.train()
        obs = replay_buf.obs
        obs_batch = obs[:-1].view(-1, *obs.size()[2:])
        states = obs_batch.cpu().detach().numpy()

        # states = np.concatenate(states,axis=1)
        actions = replay_buf.actions
        actions_batch = actions.view(-1, actions.size(-1))
        actions = actions_batch.cpu().detach().numpy()

        policy_buf = Dset(inputs=states[0:len(actions)], labels=actions, randomize=True)

        loss = 0
        g_loss = 0.0
        gp = 0.0
        n = 0

        # loss = 0

        # Sample replay buffer
        policy_state, policy_action = policy_buf.get_next_batch(batch_size)
        policy_state = torch.FloatTensor(policy_state).to(self.device)
        policy_action = torch.FloatTensor(policy_action).to(self.device)
        temp = [policy_state, policy_action]
        policy_d = self.trunk(torch.cat([policy_state, policy_action], dim=1))

        # Sample expert buffer
        expert_state, expert_action = expert_buf.get_next_batch(batch_size)
        expert_state = obsfilt(expert_state, update=False)
        expert_state = torch.FloatTensor(expert_state).to(self.device)
        expert_action = torch.FloatTensor(expert_action).to(self.device)
        expert_d = self.trunk(torch.cat([expert_state, expert_action], dim=1))

        # expert_loss = F.binary_cross_entropy_with_logits(
        #     expert_d,
        #     torch.ones(expert_d.size()).to(self.device))
        # policy_loss = F.binary_cross_entropy_with_logits(
        #     policy_d,
        #     torch.zeros(policy_d.size()).to(self.device))

        # expert_loss = torch.mean(expert_d).to(self.device)
        # policy_loss = torch.mean(policy_d).to(self.device)

        expert_loss = torch.mean(torch.tanh(expert_d)).to(self.device)
        policy_loss = torch.mean(torch.tanh(policy_d)).to(self.device)

        # gail_loss = expert_loss + policy_loss
        wd = expert_loss - policy_loss
        grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                         policy_state, policy_action)

        # loss += (gail_loss + grad_pen).item()
        loss += (-wd + grad_pen).item()
        g_loss += (wd).item()
        gp += (grad_pen).item()
        n += 1

        self.optimizer.zero_grad()
        # (gail_loss + grad_pen).backward()
        (-wd + grad_pen).backward()
        self.optimizer.step()


        return g_loss/n, gp/n, 0.0, loss / n

    def update(self, expert_loader, rollouts, obsfilt=None):
        self.train()

        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        g_loss = 0.0
        gp = 0.0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader,
                                              policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            policy_d = self.trunk(
                torch.cat([policy_state, policy_action], dim=1))

            expert_state, expert_action = expert_batch
            expert_state = obsfilt(expert_state.numpy(), update=False)
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_d = self.trunk(
                torch.cat([expert_state, expert_action], dim=1))

            # expert_loss = F.binary_cross_entropy_with_logits(
            #     expert_d,
            #     torch.ones(expert_d.size()).to(self.device))
            # policy_loss = F.binary_cross_entropy_with_logits(
            #     policy_d,
            #     torch.zeros(policy_d.size()).to(self.device))

            # expert_loss = torch.mean(expert_d).to(self.device)
            # policy_loss = torch.mean(policy_d).to(self.device)

            expert_loss = torch.mean(torch.tanh(expert_d)).to(self.device)
            policy_loss = torch.mean(torch.tanh(policy_d)).to(self.device)

            # gail_loss = expert_loss + policy_loss
            wd = expert_loss - policy_loss
            grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                             policy_state, policy_action)

            # loss += (gail_loss + grad_pen).item()
            loss += (-wd + grad_pen).item()
            g_loss += (wd).item()
            gp += (grad_pen).item()
            n += 1

            self.optimizer.zero_grad()
            # (gail_loss + grad_pen).backward()
            (-wd + grad_pen).backward()
            self.optimizer.step()

        return g_loss/n, gp/n, 0.0, loss / n

    def update_origin(self, expert_loader, rollouts, obsfilt=None):
        self.train()

        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        g_loss = 0.0
        gp = 0.0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader,
                                              policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            policy_d = self.trunk(
                torch.cat([policy_state, policy_action], dim=1))

            expert_state, expert_action = expert_batch
            expert_state = obsfilt(expert_state.numpy(), update=False)
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_d = self.trunk(
                torch.cat([expert_state, expert_action], dim=1))

            # expert_loss = F.binary_cross_entropy_with_logits(
            #     expert_d,
            #     torch.ones(expert_d.size()).to(self.device))
            # policy_loss = F.binary_cross_entropy_with_logits(
            #     policy_d,
            #     torch.zeros(policy_d.size()).to(self.device))

            expert_loss = torch.mean(expert_d).to(self.device)
            policy_loss = torch.mean(policy_d).to(self.device)

            # gail_loss = expert_loss + policy_loss
            wd = expert_loss - policy_loss
            grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                             policy_state, policy_action)

            # loss += (gail_loss + grad_pen).item()
            loss += (-wd + grad_pen).item()
            g_loss += (wd).item()
            gp += (grad_pen).item()
            n += 1

            self.optimizer.zero_grad()
            # (gail_loss + grad_pen).backward()
            (-wd + grad_pen).backward()
            self.optimizer.step()

        return g_loss/n, gp/n, 0.0, loss / n

    def update_zm_origin(self, replay_buf, expert_buf, obsfilt=None, batch_size=128):
        self.train()
        obs = replay_buf.obs
        obs_batch = obs[:-1].view(-1, *obs.size()[2:])
        states = obs_batch.cpu().detach().numpy()

        # states = np.concatenate(states,axis=1)
        actions = replay_buf.actions
        actions_batch = actions.view(-1, actions.size(-1))
        actions = actions_batch.cpu().detach().numpy()

        policy_buf = Dset(inputs=states[0:len(actions)], labels=actions, randomize=True)

        # loss = 0

        # Sample replay buffer
        policy_state, policy_action = policy_buf.get_next_batch(batch_size)
        policy_state = torch.FloatTensor(policy_state).to(self.device)
        policy_action = torch.FloatTensor(policy_action).to(self.device)
        temp = [policy_state, policy_action]
        policy_d = self.trunk(torch.cat([policy_state, policy_action], dim=1))

        # Sample expert buffer
        expert_state, expert_action = expert_buf.get_next_batch(batch_size)
        expert_state = obsfilt(expert_state, update=False)
        expert_state = torch.FloatTensor(expert_state).to(self.device)
        expert_action = torch.FloatTensor(expert_action).to(self.device)
        expert_d = self.trunk(torch.cat([expert_state, expert_action], dim=1))

        expert_loss = F.binary_cross_entropy_with_logits(
            expert_d,
            torch.ones(expert_d.size()).to(self.device))
        policy_loss = F.binary_cross_entropy_with_logits(
            policy_d,
            torch.zeros(policy_d.size()).to(self.device))

        gail_loss = expert_loss + policy_loss
        grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                         policy_state, policy_action)
        # print("gail_loss = %s,    gp=%s" % (gail_loss.item(), grad_pen.item()))

        loss = (gail_loss + grad_pen).item()
        # loss = (gail_loss).item()

        self.optimizer.zero_grad()
        (gail_loss + grad_pen).backward()
        # (gail_loss).backward()
        self.optimizer.step()

        return gail_loss.item(), grad_pen.item(), 0.0, loss

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            if self.reward_type == 0:
                s = torch.exp(d)
                reward = s
            elif self.reward_type == 1:
                s = torch.sigmoid(d)
                reward = - (1 - s).log()
            elif self.reward_type == 2:
                s = torch.sigmoid(d)
                reward = s
            elif self.reward_type == 3:
                s = torch.sigmoid(d)
                reward = s.exp()
            elif self.reward_type == 4:
                reward = d
            elif self.reward_type == 5:
                s = torch.sigmoid(d)
                reward = s.log() - (1 - s).log()

            # s = torch.exp(d)
            # # reward = s.log() - (1 - s).log()
            # s = torch.sigmoid(d)
            # reward = s
            # # reward = d
            if self.returns is None:
                self.returns = reward.clone()

            if self.update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
                return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
            else:
                return reward


            # ttt = torch.clamp(reward / np.sqrt(self.ret_rms.var[0] + 1e-8), self.cliprew_down, self.cliprew_up)
            # return torch.clamp(reward / np.sqrt(self.ret_rms.var[0] + 1e-8), self.cliprew_down, self.cliprew_up)
            # return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
            # return torch.clamp(reward,self.cliprew_down, self.cliprew_up)
            # return reward

    def predict_reward_exp(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            s = torch.exp(d)
            # s = torch.sigmoid(d)
            # reward = s.log() - (1 - s).log()
            reward = s
            # reward = d
            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            # ttt = torch.clamp(reward / np.sqrt(self.ret_rms.var[0] + 1e-8), self.cliprew_down, self.cliprew_up)
            # return torch.clamp(reward / np.sqrt(self.ret_rms.var[0] + 1e-8), self.cliprew_down, self.cliprew_up)
            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
            # return torch.clamp(reward,self.cliprew_down, self.cliprew_up)
            # return reward

    def predict_reward_t1(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            # s = torch.exp(d)
            s = torch.sigmoid(d)
            # reward = s.log() - (1 - s).log()
            reward = - (1 - s).log()
            # reward = d
            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            # ttt = torch.clamp(reward / np.sqrt(self.ret_rms.var[0] + 1e-8), self.cliprew_down, self.cliprew_up)
            # return torch.clamp(reward / np.sqrt(self.ret_rms.var[0] + 1e-8), self.cliprew_down, self.cliprew_up)
            # return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
            # return torch.clamp(reward,self.cliprew_down, self.cliprew_up)
            return reward

    def predict_reward_origin(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            s = torch.exp(d)
            # s = torch.sigmoid(d)
            # reward = s.log() - (1 - s).log()
            # reward = - (1 - s).log()
            reward = s
            # reward = d
            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            # ttt = torch.clamp(reward / np.sqrt(self.ret_rms.var[0] + 1e-8), self.cliprew_down, self.cliprew_up)
            # return torch.clamp(reward / np.sqrt(self.ret_rms.var[0] + 1e-8), self.cliprew_down, self.cliprew_up)
            # return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
            # return torch.clamp(reward,self.cliprew_down, self.cliprew_up)
            return reward
Example #19
class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.spaces[0].shape) if ob else None

        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs_tuple, rews, news, infos = self.venv.step_wait()
        obs_img, obs_measure = self.process_obs(obs_tuple)
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs_img)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        return obs, obs_measure, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        """
        Reset all environments
        """
        obs_tuple = self.venv.reset()
        obs_img, obs_measure = self.process_obs(obs_tuple)
        return self._obfilt(obs_img), obs_measure

    def process_obs(self, obs_tuple):
        obs_tuple = np.array(obs_tuple)
        obs_img = []
        obs_measure = []

        for i in range(obs_tuple.shape[0]):
            obs_img.append(obs_tuple[i][0])
            obs_measure.append(obs_tuple[i][1])

        return np.array(obs_img), np.array(obs_measure)
Example #20
class CNNBase(NNBase):
    def __init__(self,
                 num_inputs,
                 input_size,
                 action_space,
                 hidden_size=512,
                 embed_size=0,
                 recurrent=False,
                 device='cpu'):

        super(CNNBase, self).__init__(recurrent, num_inputs, hidden_size,
                                      embed_size)

        self.device = device
        self.action_space = action_space

        h, w = input_size
        self.conv1 = nn.Conv2d(num_inputs, 32, kernel_size=8, stride=4)
        w_out = conv2d_size_out(w, kernel_size=8, stride=4)
        h_out = conv2d_size_out(h, kernel_size=8, stride=4)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        w_out = conv2d_size_out(w_out, kernel_size=4, stride=2)
        h_out = conv2d_size_out(h_out, kernel_size=4, stride=2)

        self.conv3 = nn.Conv2d(64, 32, kernel_size=3, stride=1)
        w_out = conv2d_size_out(w_out, kernel_size=3, stride=1)
        h_out = conv2d_size_out(h_out, kernel_size=3, stride=1)

        init_cnn_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init.
                                   constant_(x, 0),
                                   nn.init.calculate_gain('relu'))

        self.cnn_trunk = nn.Sequential(
            init_cnn_(self.conv1), nn.ReLU(), init_cnn_(self.conv2), nn.ReLU(),
            init_cnn_(self.conv3), nn.ReLU(), Flatten(),
            init_cnn_(nn.Linear(32 * h_out * w_out, hidden_size)), nn.ReLU())

        init__ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init.
                                constant_(x, 0), np.sqrt(2))

        self.trunk = nn.Sequential(
            init__(
                nn.Linear(hidden_size + self.action_space.n + embed_size,
                          hidden_size // 2)), nn.Tanh(),
            init__(nn.Linear(hidden_size // 2, hidden_size // 2)), nn.Tanh(),
            init__(nn.Linear(hidden_size // 2, 1)))

        # self.optimizer = torch.optim.Adam(self.parameters(), lr=3e-5)
        self.optimizer = torch.optim.RMSprop(
            self.parameters(), lr=5e-5
        )  # To be consistent with the WGAN optimizer, although not necessary

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def update(self,
               expert_loader,
               rollouts,
               discr_queue,
               max_grad_norm,
               obsfilt,
               i_iter=0):
        self.train()

        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        for expert_batch, policy_batch in zip(expert_loader,
                                              policy_data_generator):
            policy_state, policy_action, correlated_embeddings = policy_batch[
                0], policy_batch[2], policy_batch[3]
            loss = torch.tensor(0.0).to(device)
            # Iterate through strategies in the queue.
            # Note: the parameters are loaded in order, so we end up with the latest params, which the optimizer updates.
            if len(discr_queue) < 1:
                return copy.deepcopy(self.state_dict())

            for strategy in discr_queue:
                self.load_state_dict(strategy)

                policy_state_embedding = self.cnn_trunk(policy_state / 255.0)
                policy_d = self.trunk(
                    torch.cat([
                        policy_state_embedding,
                        torch.nn.functional.one_hot(
                            policy_action,
                            self.action_space.n).squeeze(1).float(),
                        correlated_embeddings
                    ],
                              dim=1))

                expert_state, expert_action = expert_batch
                expert_state = torch.FloatTensor(expert_state).to(self.device)
                expert_action = expert_action.to(self.device)

                expert_state_embedding = self.cnn_trunk(expert_state / 255.)
                expert_d = self.trunk(
                    torch.cat([
                        expert_state_embedding, expert_action,
                        correlated_embeddings
                    ],
                              dim=1))

                expert_loss = -expert_d.mean()

                policy_loss = policy_d.mean()

                loss = loss + expert_loss + policy_loss

            loss = loss / len(discr_queue)

            self.optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.parameters(), max_grad_norm)
            self.optimizer.step()

        return copy.deepcopy(self.state_dict())

    def predict_strategy_reward(self, state, action, embedding, gamma, masks,
                                update_rms):
        with torch.no_grad():
            self.eval()
            state_embedding = self.cnn_trunk(state / 255.)
            d = self.trunk(
                torch.cat([
                    state_embedding,
                    torch.nn.functional.one_hot(
                        action, self.action_space.n).squeeze(1).float(),
                    embedding
                ],
                          dim=1))

            reward = d

            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)

    def predict_reward(self,
                       state,
                       action,
                       embedding,
                       gamma,
                       masks,
                       discr_queue,
                       update_rms=True):
        """
        :param state:
        :param action:
        :param gamma:
        :param masks:
        :param discrim_queue:
        :param update_rms:
        :return: returns actor_reward
        """
        actor_reward = gains = 0.0
        strategy_rewards = []
        if len(discr_queue) > 0:
            for strategy in discr_queue:
                self.load_state_dict(strategy)
                reward = self.predict_strategy_reward(state, action, embedding,
                                                      gamma, masks, update_rms)
                strategy_rewards.append(reward)

            # The gain is used by the correlator to compute the maxEnt corEQ loss.
            # It quantifies how much overall gain would be achieved by switching strategies.
            for i in range(len(strategy_rewards)):
                for j in range(i + 1, len(strategy_rewards)):
                    gains = gains - torch.pow(
                        strategy_rewards[i] - strategy_rewards[j], 2)

            gains = gains / (len(discr_queue) * len(discr_queue) / 4)
            actor_reward = strategy_rewards[-1]

        return actor_reward, gains
Example #21
class MLPBase(NNBase):
    def __init__(self,
                 num_inputs,
                 input_size,
                 action_space,
                 hidden_size=64,
                 embed_size=0,
                 recurrent=False,
                 device='cpu'):
        super(MLPBase, self).__init__(recurrent, num_inputs, hidden_size,
                                      embed_size)

        self.device = device

        if recurrent:
            num_inputs = hidden_size

        init__ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init.
                                constant_(x, 0), np.sqrt(2))

        self.trunk = nn.Sequential(
            init__(
                nn.Linear(num_inputs + action_space.shape[0] + embed_size,
                          hidden_size)), nn.Tanh(),
            init__(nn.Linear(hidden_size, hidden_size)), nn.Tanh(),
            init__(nn.Linear(hidden_size, 1)))

        # self.optimizer = torch.optim.Adam(self.parameters(), lr= 3e-5)
        self.optimizer = torch.optim.RMSprop(self.parameters(), lr=5e-5)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

        self.train()

    def update(self,
               expert_loader,
               rollouts,
               discr_queue,
               max_grad_norm,
               obsfilt,
               i_iter=0):
        self.train()

        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.train()

        for expert_batch, policy_batch in zip(expert_loader,
                                              policy_data_generator):
            policy_state, policy_action, embeddings = policy_batch[
                0], policy_batch[2], policy_batch[3]
            loss = torch.tensor(0.0).to(device)
            # Iterate through strategies in the queue.
            # Note: the parameters are loaded in order, so we end up with the latest params, which the optimizer updates.
            if len(discr_queue) < 1:
                return copy.deepcopy(self.state_dict())

            for strategy in discr_queue:
                self.load_state_dict(strategy)
                policy_d = self.trunk(
                    torch.cat([policy_state, policy_action, embeddings],
                              dim=1))

                expert_state, expert_action = expert_batch
                expert_state = obsfilt(expert_state.numpy(), update=False)
                expert_state = torch.FloatTensor(expert_state).to(self.device)
                expert_action = expert_action.to(self.device)
                expert_d = self.trunk(
                    torch.cat([expert_state, expert_action, embeddings],
                              dim=1))

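                # Wasserstein-style critic objective: expert scores are pushed up and
                # policy scores pushed down; the batch loss is averaged over the queue.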
                expert_loss = -expert_d.mean()

                policy_loss = policy_d.mean()

                loss = loss + expert_loss + policy_loss

            loss = loss / len(discr_queue)

            self.optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(
                self.parameters(), max_grad_norm
            )  # not strictly necessary, but used consistently across all NN modules.
            self.optimizer.step()

        return copy.deepcopy(self.state_dict())

    def predict_strategy_reward(self, state, action, embedding, gamma, masks,
                                update_rms):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action, embedding], dim=1))
            reward = d

            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())

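            # Scale the reward by the running std of the discounted return so that
            # reward magnitudes stay comparable over the course of training.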
            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)

    def predict_reward(self,
                       state,
                       action,
                       embedding,
                       gamma,
                       masks,
                       discr_queue,
                       update_rms=True):
        """
        :param state: batch of states
        :param action: batch of actions
        :param embedding: strategy embedding concatenated to the (state, action) input
        :param gamma: discount factor for the running return estimate
        :param masks: episode masks (0 at terminal steps)
        :param discr_queue: queue of discriminator state_dicts, one per strategy
        :param update_rms: whether to update the running return statistics
        :return: (actor_reward, gains)
        """
        actor_reward = gains = 0.0
        strategy_rewards = []
        if len(discr_queue) > 0:
            for strategy in discr_queue:
                self.load_state_dict(strategy)
                reward = self.predict_strategy_reward(state, action, embedding,
                                                      gamma, masks, update_rms)
                strategy_rewards.append(reward)

            # The gain is used by the correlator to compute the maxEnt corEQ loss.
            # It quantifies how much overall gain would be achieved by switching strategies.
            for i in range(len(strategy_rewards)):
                for j in range(i + 1, len(strategy_rewards)):
                    gains = gains - torch.pow(
                        strategy_rewards[i] - strategy_rewards[j], 2)

            gains = gains / (len(discr_queue) * len(discr_queue) / 4)
            actor_reward = strategy_rewards[-1]

        return actor_reward, gains
Ejemplo n.º 22
0
class Discriminator(nn.Module):
    """
    Modified GAIL Discriminator to handle graph state, and composite actions
    """
    def __init__(self, input_dim, hidden_dim, device):
        super(Discriminator, self).__init__()

        self.device = device

        self.encoder = WoBObservationEncoder(out_dim=hidden_dim)
        self.trunk_fn = nn.Sequential(nn.Linear(hidden_dim, 1))

        self.train()
        self.to(device)

        self.optimizer = torch.optim.Adam(self.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

    def forward(self, inputs, votes):
        x = self.encoder(inputs, votes)
        return self.trunk_fn(x)

    def trunk(self, state, action):
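        # Turn each graph's action index into a one-hot "vote" vector over that
        # graph's nodes in the batch, then score (state, votes) with forward().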
        batch_size = state.batch.max().item() + 1
        votes = torch.zeros(state.value.shape[0], 1)
        past = 0
        for b_idx in range(batch_size):
            _m = state.batch == b_idx
            _size = _m.sum().item()
            votes[past + action[b_idx], 0] = 1
            past += _size
        return self.forward(state, votes)

    def compute_grad_pen(self,
                         expert_state,
                         expert_action,
                         policy_state,
                         policy_action,
                         lambda_=10):
        # merge graphs, apply alpha to vote shares
        mixup_state = Batch()
        for key, value in expert_state:
            assert isinstance(key, str), str(key)
            if key in ("edge_index", "edge_attr"):
                continue
            mixup_state[key] = torch.cat(
                [expert_state[key], policy_state[key]])
        mixup_state.edge_index = torch.cat(
            [
                expert_state.edge_index,
                policy_state.edge_index + expert_state.batch.shape[0],
            ],
            dim=1,
        )

        alpha = torch.rand(expert_action.size(0))
        batch_size = expert_state.batch.max().item() + 1
        mixup_votes = []
        for i in range(batch_size):
            _em = expert_state.batch == i
            _pm = policy_state.batch == i
            votes = torch.zeros((_em.sum() + _pm.sum()).item())
            assert votes.shape[0]
            votes[expert_action[i]] = alpha[i]
            votes[policy_action[i] + _em.sum().item()] = 1 - alpha[i]
            mixup_votes.append(votes)
        mixup_action = torch.cat(mixup_votes).view(-1, 1)
        mixup_action.requires_grad = True

        disc = self.forward(mixup_state, mixup_action)
        ones = torch.ones(disc.size()).to(disc.device)
        inputs = [mixup_action]
        for key, value in mixup_state:
            if value.dtype == torch.float:
                value.requires_grad = True
                inputs.append(value)
        grad = autograd.grad(
            outputs=disc,
            inputs=inputs,
            grad_outputs=ones,
            create_graph=True,
            retain_graph=True,
            only_inputs=True,
            allow_unused=True,
        )[0]

        grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean()
        return grad_pen

    def update(self, expert_loader, rollouts, obsfilt=None):
        self.train()

        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)
        assert len(expert_loader)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader,
                                              policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            batch_size = policy_state.batch.max().item() + 1
            policy_d = self.trunk(policy_state, policy_action)

            expert_state, expert_action, _ = expert_batch
            # expert_state = obsfilt(expert_state.numpy(), update=False)
            # expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_d = self.trunk(expert_state, expert_action)

            expert_loss = F.binary_cross_entropy_with_logits(
                expert_d,
                torch.ones(expert_d.size()).to(self.device))
            policy_loss = F.binary_cross_entropy_with_logits(
                policy_d,
                torch.zeros(policy_d.size()).to(self.device))

            gail_loss = expert_loss + policy_loss
            grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                             policy_state, policy_action)

            loss += (gail_loss + grad_pen).item()
            n += 1

            self.optimizer.zero_grad()
            (gail_loss + grad_pen).backward()
            self.optimizer.step()
        return loss / n

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(state, action)
            s = torch.sigmoid(d)
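            # GAIL reward log(D) - log(1 - D); algebraically this equals the raw
            # logit d, though the sigmoid form can saturate for large |d|.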
            reward = s.log() - (1 - s).log()
            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())

            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
Ejemplo n.º 23
0
class VecNormalize(VecEnvWrapper):
    """
    A vectorized environment wrapper that normalizes observations and discounted
    returns; supports hierarchical (tokens, observation) observations.
    """
    def __init__(self, venv, ob=False, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.hier = self.venv.hier
        if self.hier:
            obs_space = self.observation_space.spaces[1]
            self.ob_rms = RunningMeanStd(shape=obs_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        else:    
            self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply the pending actions to the vectorized environments:
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each environment
        just finished an episode.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        if self.hier:
            tokens, obs = obs
            obs = self._obfilt(obs)
            obs = (tokens, obs)
        else:
            obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def goal(self, obs):
        return self.venv.goal(obs)

    def action(self, obs):
        return self.venv.action(obs)
    
    def final_obs(self):
        return self.venv.obs_from_buf_final()

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
            return obs
        else:
            return obs
        
    def tf_filt(self, obs_tf):
        if self.ob_rms:
            obs_tf = tf.clip_by_value((obs_tf - self.ob_rms.mean) / \
                                      tf.cast(np.sqrt(self.ob_rms.var + self.epsilon), tf.float32),
                                      -self.clipob, self.clipob)
            return obs_tf
        else:
            return obs_tf

    def reset(self):
        """
        Reset all environments
        """
        obs = self.venv.reset()
        if self.hier:
            return obs[0], self._obfilt(obs[1])
        else:
            return self._obfilt(obs)
Ejemplo n.º 24
0
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        if isinstance(self.observation_space, Dict):
            self.ob_rms = {}
            for key in self.observation_space.spaces.keys():
                self.ob_rms[key] = RunningMeanStd(
                    shape=self.observation_space.spaces[key].shape
                ) if ob else None
        else:
            self.ob_rms = RunningMeanStd(
                shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        def _obfilt(obs, ob_rms):
            if ob_rms:
                ob_rms.update(obs)
                obs = np.clip(
                    (obs - ob_rms.mean) / np.sqrt(ob_rms.var + self.epsilon),
                    -self.clipob, self.clipob)
                return obs
            else:
                return obs

        if isinstance(self.ob_rms, dict):
            for key in self.ob_rms:
                obs[key] = _obfilt(obs[key], self.ob_rms[key])
        else:
            obs = _obfilt(obs, self.ob_rms)

        return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)

    def save_state(self, save_path):
        """
        Pickle and save the normalization state variables.
        """
        state = {'ob_rms': self.ob_rms, 'ret_rms': self.ret_rms}
        with open(save_path, 'wb') as f:
            pickle.dump(state, f)

    def restore_state(self, load_path):
        """
        Unpickle and restore the normalization state variables.
        """

        with open(load_path, 'rb') as f:
            state = pickle.load(f)
        self.ob_rms = state['ob_rms']
        self.ret_rms = state['ret_rms']

    def get_obs(self):
        return self._obfilt(self.venv.get_obs())
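
# Minimal usage sketch (not part of the original example); assumes gym, the
# baselines DummyVecEnv, and the class's own imports (RunningMeanStd, Dict,
# pickle) are available alongside the class above.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

_venv = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
_obs = _venv.reset()
_obs, _rews, _news, _infos = _venv.step([_venv.action_space.sample()])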
Ejemplo n.º 25
0
class Discriminator(nn.Module):
    def __init__(self, input_dim, hidden_dim, device):
        super(Discriminator, self).__init__()

        self.device = device

        self.trunk = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.Tanh(),
                                   nn.Linear(hidden_dim,
                                             hidden_dim), nn.Tanh(),
                                   nn.Linear(hidden_dim, 1)).to(device)

        self.trunk.train()

        self.optimizer = torch.optim.Adam(self.trunk.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

    def compute_grad_pen(self,
                         expert_state,
                         expert_action,
                         policy_state,
                         policy_action,
                         lambda_=10):
        alpha = torch.rand(expert_state.size(0), 1)
        expert_data = torch.cat([expert_state, expert_action], dim=1)
        policy_data = torch.cat([policy_state, policy_action], dim=1)

        alpha = alpha.expand_as(expert_data).to(expert_data.device)

        mixup_data = alpha * expert_data + (1 - alpha) * policy_data
        mixup_data.requires_grad = True

        disc = self.trunk(mixup_data)
        ones = torch.ones(disc.size()).to(disc.device)
        grad = autograd.grad(outputs=disc,
                             inputs=mixup_data,
                             grad_outputs=ones,
                             create_graph=True,
                             retain_graph=True,
                             only_inputs=True)[0]

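        # WGAN-GP style penalty: push the critic's gradient norm toward 1 on points
        # interpolated between expert and policy (state, action) pairs.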
        grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean()
        return grad_pen

    def update(self, expert_loader, rollouts, obsfilt=None):
        self.train()

        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader,
                                              policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            policy_d = self.trunk(
                torch.cat([policy_state, policy_action], dim=1))

            expert_state, expert_action = expert_batch
            expert_state = obsfilt(expert_state.numpy(), update=False)
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_d = self.trunk(
                torch.cat([expert_state, expert_action], dim=1))

            expert_loss = F.binary_cross_entropy_with_logits(
                expert_d,
                torch.ones(expert_d.size()).to(self.device))
            policy_loss = F.binary_cross_entropy_with_logits(
                policy_d,
                torch.zeros(policy_d.size()).to(self.device))

            gail_loss = expert_loss + policy_loss
            grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                             policy_state, policy_action)

            loss += (gail_loss + grad_pen).item()
            n += 1

            self.optimizer.zero_grad()
            (gail_loss + grad_pen).backward()
            self.optimizer.step()
        return loss / n

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            s = torch.sigmoid(d)
            reward = s.log() - (1 - s).log()
            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())

            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
Ejemplo n.º 26
0
class AIL():
    def __init__(self,
                 observation_space,
                 action_space,
                 device,
                 args,
                 log_only=False):
        super(AIL, self).__init__()

        if log_only:
            self.m_return_list = self.load_expert_data(args)
            return

        self.lr = args.il_lr  # larger learning rate for MLP
        self.action_dim = action_space.shape[0]
        self.hidden_dim = 100

        self.state_dim = observation_space.shape[0]

        self.device = device
        self.create_networks()

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

        self.gail_batch_size = args.gail_batch_size
        self.label_expert = 1
        self.label_policy = -1
        self.reward_std = args.reward_std
        self.gp_lambda = args.gp_lambda
        self.m_return_list = self.make_dataset(args)

        if args.ail_saturate is None and args.ail_loss_type != "unhinged":
            args.ail_saturate = 1

        if args.ail_loss_type == "logistic":
            self.adversarial_loss = Logistic_Loss()
        elif args.ail_loss_type == "unhinged":
            self.adversarial_loss = Unhinged_Loss()
            if args.ail_saturate is None: args.ail_saturate = 0
        elif args.ail_loss_type == "sigmoid":
            self.adversarial_loss = Sigmoid_Loss()
        elif args.ail_loss_type == "nlogistic":
            self.adversarial_loss = Normalized_Logistic_Loss()
        elif args.ail_loss_type == "apl":
            self.adversarial_loss = APL_Loss()
        self.ail_saturate = args.ail_saturate

    def create_networks(self):
        self.trunk = Discriminator(self.state_dim + self.action_dim,
                                   self.hidden_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.trunk.parameters(), lr=self.lr)

    def make_dataset(self, args):
        self.load_expert_data(args)  # h5py demos are loaded into tensor.
        expert_dataset = data_utils.TensorDataset(self.real_state_tensor,
                                                  self.real_action_tensor)

        drop_last = len(expert_dataset) > self.gail_batch_size
        self.expert_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=self.gail_batch_size,
            shuffle=True,  # important to shuffle the dataset. 
            drop_last=drop_last)

    def load_expert_data(self, args, verbose=1):  # also load non-expert data
        model_list = [1.0]
        if args.noise_prior != 0.0:
            model_list += [0.4, 0.3, 0.2, 0.1, 0.0]

        traj_deterministic = args.traj_deterministic
        demo_file_size = 10000

        self.index_worker_idx = []
        m_return_list = []
        index_start = 0
        expert_state_list, expert_action_list, expert_nstate_list = [], [], []
        expert_reward_list, expert_mask_list, expert_id_list = [], [], []

        traj_path = "./imitation_data/%s" % (args.env_name)

        for model_i in range(0, len(model_list)):
            m = model_list[model_i]

            if args.noise_type == "policy":  # policy noise (sub-optimal policies)
                traj_filename = traj_path + (
                    "/%s_TRAJ-N%d_P%0.1f" % (args.env_name, demo_file_size, m))
            elif args.noise_type == "action":  # action noise
                traj_filename = traj_path + (
                    "/%s_TRAJ-N%d_A%0.1f" % (args.env_name, demo_file_size, m))
            if traj_deterministic:
                traj_filename += "_det"
            else:
                traj_filename += "_sto"

            hf = h5py.File(traj_filename + ".h5", 'r')
            expert_mask = hf.get('mask_array')[:]
            expert_state = hf.get('obs_array')[:]
            # expert_nstate = hf.get('nobs_array')[:]
            expert_action = hf.get('act_array')[:]
            expert_reward = hf.get('reward_array')[:]

            step_num = expert_mask.shape[0]
            traj_num = step_num - np.sum(expert_mask)
            m_return = np.sum(expert_reward) / traj_num

            m_return_list += [m_return]

            expert_id = np.ones((expert_mask.shape[0], 1)) * model_i

            if m != 1.0 and args.noise_prior != -1.0:
                if args.noise_prior == 0.5:
                    pair_num = 2000  # 10000 / 5
                if args.noise_prior == 0.4:
                    pair_num = 1500
                if args.noise_prior == 0.3:
                    pair_num = 1000
                if args.noise_prior == 0.2:
                    pair_num = 500
                if args.noise_prior == 0.1:
                    pair_num = 200

                if args.demo_sub_traj:
                    sub_num = pair_num // 50  # each sub traj has 50 sa-pairs.
                    index = []
                    ## data is split into sub_num chunks, and we randomly sample 50 pairs from each chunk.
                    chunk_size = demo_file_size // sub_num
                    indexes_start = np.random.randint(0,
                                                      chunk_size - 50,
                                                      size=sub_num)
                    for i in range(0, sub_num):
                        ii = indexes_start[i] + (i * chunk_size)
                        index.append(np.arange(ii, ii + 50))
                    index = np.hstack(index)
                else:
                    index = np.random.permutation(demo_file_size)[:pair_num]

                expert_mask = expert_mask[index]
                expert_state = expert_state[index]
                expert_action = expert_action[index]
                # expert_nstate = expert_nstate[index]  #next state is not used.
                expert_reward = expert_reward[index]
                expert_id = expert_id[index]

            self.index_worker_idx += [
                index_start + np.arange(0, expert_mask.shape[0])
            ]
            index_start += expert_mask.shape[0]

            expert_mask_list += [expert_mask]
            expert_state_list += [expert_state]
            expert_action_list += [expert_action]
            # expert_nstate_list += [expert_nstate]
            expert_reward_list += [expert_reward]
            expert_id_list += [expert_id]

            if verbose:
                print("%s TRAJ is loaded from %s with full_size %s: using data size %s steps and average return %s" % \
                    (colored(args.noise_type, p_color), colored(traj_filename, p_color), colored(step_num, p_color), colored(expert_state.shape[0] , p_color), \
                    colored( "%.2f" % (m_return), p_color )))

        expert_masks = np.concatenate(expert_mask_list, axis=0)
        expert_states = np.concatenate(expert_state_list, axis=0)
        expert_actions = np.concatenate(expert_action_list, axis=0)
        # expert_nstates = np.concatenate(expert_nstate_list, axis=0)
        expert_rewards = np.concatenate(expert_reward_list, axis=0)
        expert_ids = np.concatenate(expert_id_list, axis=0)

        self.real_mask_tensor = torch.FloatTensor(expert_masks).to(device_cpu)
        self.real_state_tensor = torch.FloatTensor(expert_states).to(
            device_cpu)
        self.real_action_tensor = torch.FloatTensor(expert_actions).to(
            device_cpu)
        # self.real_nstate_tensor = torch.FloatTensor(expert_nstates).to(device_cpu)
        self.real_id_tensor = torch.LongTensor(expert_ids).to(device_cpu)
        self.data_size = self.real_state_tensor.size(0)

        self.worker_num = torch.unique(self.real_id_tensor).size(0)

        print(self.real_state_tensor.size())
        print(self.real_action_tensor.size())

        if verbose:
            print("Total data pairs: %s, state dim %s, action dim %s" % \
                (colored(self.real_state_tensor.size(0), p_color), \
                colored(self.real_state_tensor.size(1), p_color), colored(self.real_action_tensor.size(1), p_color)
                ))
        return m_return_list

    def compute_grad_pen(self,
                         expert_data,
                         policy_data,
                         lambda_=10,
                         network=None):

        if expert_data.size(0) != policy_data.size(0):
            if expert_data.size(0) < policy_data.size(0):
                idx = np.random.permutation(
                    policy_data.size(0))[:expert_data.size(0)]
                policy_data = policy_data[idx, :]
            else:
                idx = np.random.permutation(
                    expert_data.size(0))[:policy_data.size(0)]
                expert_data = expert_data[idx, :]

        # # DRAGAN
        # alpha = torch.rand(expert_data.size()).to(expert_data.device)
        # mixup_data = alpha * expert_data + ((1 - alpha) * (expert_data + 0.5 * expert_data.std() * torch.rand(expert_data.size()).to(expert_data.device)))

        alpha = torch.rand(expert_data.size(0), 1)

        alpha = alpha.expand_as(expert_data).to(expert_data.device)
        mixup_data = alpha * expert_data + (1 - alpha) * policy_data

        mixup_data.requires_grad = True

        if network is None:
            network = self.trunk

        disc = network(mixup_data)
        ones = torch.ones(disc.size()).to(disc.device)
        grad = autograd.grad(outputs=disc,
                             inputs=mixup_data,
                             grad_outputs=ones,
                             create_graph=True,
                             retain_graph=True,
                             only_inputs=True)[0]

        grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean()
        return grad_pen

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.trunk.eval()

            d = self.trunk.reward(sa_cat(state, action))

            if self.ail_saturate == 1:
                reward = self.adversarial_loss.reward(
                    d * self.label_policy,
                    reduction=False)  # saturate  (positive)
            elif self.ail_saturate == -1:
                reward = -self.adversarial_loss.reward(
                    d * self.label_expert,
                    reduction=False)  # non-saturate (negative)
            elif self.ail_saturate == 0:
                reward = d

            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())

            if self.reward_std:
                return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
            else:
                return reward

    def update(self, rollouts, obsfilt=None):
        self.trunk.train()

        rollouts_size = rollouts.get_batch_size()
        policy_mini_batch_size = self.gail_batch_size if rollouts_size > self.gail_batch_size \
                            else rollouts_size

        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=policy_mini_batch_size)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(self.expert_loader,
                                              policy_data_generator):

            policy_state, policy_action = policy_batch[0], policy_batch[2]
            expert_state, expert_action = expert_batch[0], expert_batch[1]

            # need to normalize the expert data using current policy statistics so that expert and policy data have the same normalization.
            if obsfilt is not None:
                expert_state = obsfilt(expert_state.numpy(), update=False)
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)

            policy_d = self.trunk(sa_cat(policy_state, policy_action))
            expert_d = self.trunk(sa_cat(expert_state, expert_action))
            grad_pen = self.compute_grad_pen(
                sa_cat(expert_state, expert_action),
                sa_cat(policy_state, policy_action), self.gp_lambda)

            policy_loss = self.adversarial_loss(policy_d * self.label_policy)
            expert_loss = self.adversarial_loss(expert_d * self.label_expert)

            gail_loss = expert_loss + policy_loss

            loss += (gail_loss + grad_pen).item()
            n += 1

            self.optimizer.zero_grad()
            (gail_loss + grad_pen).backward()
            self.optimizer.step()

        return loss / n
Ejemplo n.º 27
0
class DRIL:
    def __init__(self, device=None, envs=None, ensemble_policy=None, env_name=None,
        expert_dataset=None, ensemble_size=None, ensemble_quantile_threshold=None,
        dril_bc_model=None, dril_cost_clip=None, num_dril_bc_train_epoch=None,\
        training_data_split=None):

        self.ensemble_quantile_threshold = ensemble_quantile_threshold
        self.dril_cost_clip = dril_cost_clip
        self.device = device
        self.num_dril_bc_train_epoch = num_dril_bc_train_epoch
        self.env_name = env_name
        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())
        self.observation_space = envs.observation_space

        if envs.action_space.__class__.__name__ == "Discrete":
            self.num_actions = envs.action_space.n
        elif envs.action_space.__class__.__name__ == "Box":
            self.num_actions = envs.action_space.shape[0]
        elif envs.action_space.__class__.__name__ == "MultiBinary":
            self.num_actions = envs.action_space.shape[0]

        self.ensemble_size = ensemble_size
        # use full data since we don't use a validation set
        self.trdata = expert_dataset.load_demo_data(
            1.0, 1, self.ensemble_size)['trdata']

        self.ensemble = ensemble_policy
        self.bc = dril_bc_model
        self.bc.num_batches = num_dril_bc_train_epoch
        self.clip_variance = self.policy_variance(envs=envs)

    def policy_variance(self, q=0.98, envs=None):
        q = self.ensemble_quantile_threshold
        obs = None
        acs = None

        variance = defaultdict(lambda: [])
        for batch_idx, batch in enumerate(self.trdata):
            (state, action) = batch
            action = action.float().to(self.device)

            # Image observation
            if len(self.observation_space.shape) == 3:
                state = state.repeat(self.ensemble_size, 1, 1,
                                     1).float().to(self.device)
            # Feature observations
            else:
                state = state.repeat(self.ensemble_size,
                                     1).float().to(self.device)

            if isinstance(envs.action_space, gym.spaces.discrete.Discrete):
                # Note: this is just a placeholder
                action_idx = int(action.item())
                one_hot_action = torch.FloatTensor(
                    np.eye(self.num_actions)[int(action.item())])
                action = one_hot_action
            elif envs.action_space.__class__.__name__ == "MultiBinary":
                # create unique id for each combination
                action_idx = int(
                    "".join(str(int(x)) for x in action[0].tolist()), 2)
            else:
                action_idx = 0

            with torch.no_grad():
                ensemble_action = self.ensemble(state).squeeze()
            if isinstance(envs.action_space, gym.spaces.Box):
                action = torch.clamp(action, envs.action_space.low[0],
                                     envs.action_space.high[0])

                ensemble_action = torch.clamp(ensemble_action,
                                              envs.action_space.low[0],
                                              envs.action_space.high[0])

            cov = np.cov(ensemble_action.T.cpu().numpy())
            action = action.cpu().numpy()

            # If the env has only one action then we need to reshape cov
            if envs.action_space.__class__.__name__ == "Box":
                if envs.action_space.shape[0] == 1:
                    cov = cov.reshape(-1, 1)

            #variance.append(np.matmul(np.matmul(action, cov), action.T).item())
            if isinstance(envs.action_space, gym.spaces.discrete.Discrete):
                for action_idx in range(envs.action_space.n):
                    one_hot_action = torch.FloatTensor(
                        np.eye(self.num_actions)[action_idx])
                    variance[action_idx].append(
                        np.matmul(np.matmul(one_hot_action, cov),
                                  one_hot_action.T).item())
            else:
                variance[action_idx].append(
                    np.matmul(np.matmul(action, cov), action.T).item())

        quantiles = {
            key: np.quantile(np.array(variance[key]), q)
            for key in list(variance.keys())
        }
        if self.dril_cost_clip == '-1_to_1':
            # Bind the per-key quantile as a default argument so each lambda keeps
            # its own threshold (a plain closure would capture only the last key).
            return {
                key: lambda x, q=quantiles[key]: -1 if x > q else 1
                for key in list(variance.keys())
            }
        elif self.dril_cost_clip == 'no_clipping':
            return {key: lambda x: x for key in list(variance.keys())}
        elif self.dril_cost_clip == '-1_to_0':
            return {
                key: lambda x, q=quantiles[key]: -1 if x > q else 0
                for key in list(variance.keys())
            }

    def predict_reward(self, actions, states, envs):
        rewards = []
        for idx in range(actions.shape[0]):

            # Image observation
            if len(self.observation_space.shape) == 3:
                state = states[[idx]].repeat(self.ensemble_size, 1, 1,
                                             1).float().to(self.device)
            # Feature observations
            else:
                state = states[[idx]].repeat(self.ensemble_size,
                                             1).float().to(self.device)

            if isinstance(envs.action_space, gym.spaces.discrete.Discrete):
                one_hot_action = torch.FloatTensor(
                    np.eye(self.num_actions)[int(actions[idx].item())])
                action = one_hot_action
                action_idx = int(actions[idx].item())
            elif isinstance(envs.action_space, gym.spaces.Box):
                action = actions[[idx]]
                action_idx = 0
            elif isinstance(envs.action_space, gym.spaces.MultiBinary):
                raise Exception('Environment should not be MultiBinary')
            else:
                raise Exception("Unknown Action Space")

            with torch.no_grad():
                ensemble_action = self.ensemble(state).squeeze().detach()

            if isinstance(envs.action_space, gym.spaces.Box):
                action = torch.clamp(action, envs.action_space.low[0],
                                     envs.action_space.high[0])
                ensemble_action = torch.clamp(ensemble_action,
                                              envs.action_space.low[0],
                                              envs.action_space.high[0])

            cov = np.cov(ensemble_action.T.cpu().numpy())
            action = action.cpu().numpy()

            # If the env has only one action then we need to reshape cov
            if envs.action_space.__class__.__name__ == "Box":
                if envs.action_space.shape[0] == 1:
                    cov = cov.reshape(-1, 1)

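            # a^T cov a: disagreement of the ensemble along the taken action; large
            # values indicate state-action pairs far from the demonstration data.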
            ensemble_variance = (np.matmul(np.matmul(action, cov),
                                           action.T).item())

            if action_idx in self.clip_variance:
                reward = self.clip_variance[action_idx](ensemble_variance)
            else:
                reward = -1
            rewards.append(reward)
        return torch.FloatTensor(np.array(rewards)[np.newaxis].T)

    def normalize_reward(self,
                         state,
                         action,
                         gamma,
                         masks,
                         reward,
                         update_rms=True):
        if self.returns is None:
            self.returns = reward.clone()

        if update_rms:
            self.returns = self.returns * masks * gamma + reward
            self.ret_rms.update(self.returns.cpu().numpy())

        return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)

    def bc_update(self):
        for dril_epoch in range(self.num_dril_bc_train_epoch):
            dril_train_loss = self.bc.update(update=True,
                                             data_loader_type='train')
Ejemplo n.º 28
0
class MLPBase(NNBase):
    def __init__(self,
                 num_inputs,
                 input_size,
                 action_space,
                 hidden_size=64,
                 recurrent=False,
                 device='cpu'):
        super(MLPBase, self).__init__(recurrent, num_inputs, hidden_size)

        self.device = device

        if recurrent:
            num_inputs = hidden_size

        init__ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init.
                                constant_(x, 0), np.sqrt(2))

        self.trunk = nn.Sequential(
            init__(nn.Linear(num_inputs + action_space.shape[0], hidden_size)),
            nn.Tanh(), init__(nn.Linear(hidden_size, hidden_size)), nn.Tanh(),
            init__(nn.Linear(hidden_size, 1)))

        self.optimizer = torch.optim.Adam(self.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

        self.train()

    def compute_grad_pen(self,
                         expert_state,
                         expert_action,
                         policy_state,
                         policy_action,
                         lambda_=10):
        alpha = torch.rand(expert_state.size(0), 1)
        expert_data = torch.cat([expert_state, expert_action], dim=1)
        policy_data = torch.cat([policy_state, policy_action], dim=1)

        alpha = alpha.expand_as(expert_data).to(expert_data.device)

        mixup_data = alpha * expert_data + (1 - alpha) * policy_data
        mixup_data.requires_grad = True

        disc = self.trunk(mixup_data)
        ones = torch.ones(disc.size()).to(disc.device)
        grad = autograd.grad(outputs=disc,
                             inputs=mixup_data,
                             grad_outputs=ones,
                             create_graph=True,
                             retain_graph=True,
                             only_inputs=True)[0]

        grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean()
        return grad_pen

    def update(self, expert_loader, rollouts, obsfilt=None):
        self.train()

        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader,
                                              policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]

            policy_d = self.trunk(
                torch.cat([policy_state, policy_action], dim=1))

            expert_state, expert_action = expert_batch
            expert_state = obsfilt(expert_state.numpy(), update=False)
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_d = self.trunk(
                torch.cat([expert_state, expert_action], dim=1))

            # expert_loss = F.binary_cross_entropy_with_logits(
            #     expert_d,
            #     torch.ones(expert_d.size()).to(self.device))
            # policy_loss = F.binary_cross_entropy_with_logits(
            #     policy_d,
            #     torch.zeros(policy_d.size()).to(self.device))

            expert_loss = -expert_d.mean()

            policy_loss = policy_d.mean()

            gail_loss = expert_loss + policy_loss

            grad_pen = self.compute_grad_pen(expert_state, expert_action,
                                             policy_state, policy_action)

            loss += (gail_loss + grad_pen).item()

            n += 1
            # before = list(self.parameters())[0].sum().clone()
            self.optimizer.zero_grad()
            (gail_loss + grad_pen).backward()
            self.optimizer.step()
            # after = list(self.parameters())[0].sum().clone()
            # print(after, before)
        return loss / n

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            # 0 for expert-like states, goes to -inf for non-expert-like states
            # compatible with envs with traj cutoffs for good (expert-like) behavior
            # e.g. mountain car, which gets cut off when the car reaches the destination
            # s = torch.sigmoid(d)

            # 0 for non-expert-like states, goes to +inf for expert-like states
            # compatible with envs with traj cutoffs for bad (non-expert-like) behavior
            # e.g. walking simulations that get cut off when the robot falls over
            # s = -(1. - torch.sigmoid(d))

            # reward = s.log() - (1 - s).log()

            reward = d

            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())

            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
Ejemplo n.º 29
0
class PpoOptimizer(object):
    envs = None

    def __init__(self, *, hps, scope, ob_space, ac_space, stochpol,
                 ent_coef, gamma, gamma_ext, lam, nepochs, lr, cliprange,
                 nminibatches,
                 normrew, normadv, use_news, ext_coeff, int_coeff,
                 nsteps_per_seg, nsegs_per_env, dynamics):
        self.dynamics = dynamics
        with tf.variable_scope(scope):
            self.hps = hps
            self.use_recorder = True
            self.n_updates = 0
            self.scope = scope
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.stochpol = stochpol
            self.nepochs = nepochs
            self.lr = lr
            self.cliprange = cliprange
            self.nsteps_per_seg = nsteps_per_seg
            self.nsegs_per_env = nsegs_per_env
            self.nminibatches = nminibatches
            self.gamma = gamma
            self.gamma_ext = gamma_ext
            self.lam = lam
            self.normrew = normrew
            self.normadv = normadv
            self.use_news = use_news
            self.ext_coeff = ext_coeff
            self.int_coeff = int_coeff

            self.ph_adv = tf.placeholder(tf.float32, [None, None])        
     
            self.ph_ret_int = tf.placeholder(tf.float32, [None, None])            
            self.ph_ret_ext = tf.placeholder(tf.float32, [None, None])            
            self.ph_ret = tf.placeholder(tf.float32, [None, None])

            self.ph_rews = tf.placeholder(tf.float32, [None, None])
            self.ph_oldnlp = tf.placeholder(tf.float32, [None, None])
            self.ph_oldvpred = tf.placeholder(tf.float32, [None, None])
            self.ph_lr = tf.placeholder(tf.float32, [])
            self.ph_cliprange = tf.placeholder(tf.float32, [])
            neglogpac = self.stochpol.pd.neglogp(self.stochpol.ph_ac)
            entropy = tf.reduce_mean(self.stochpol.pd.entropy())
            
            vpred = self.stochpol.vpred
            
            if hps['num_vf']==2: 
                # Separate vf_loss for intrinsic and extrinsic rewards
                vf_loss_int = 0.5 * tf.reduce_mean(tf.square(self.stochpol.vpred_int - self.ph_ret_int))
                vf_loss_ext = 0.5 * tf.reduce_mean(tf.square(self.stochpol.vpred_ext - self.ph_ret_ext))
                vf_loss = vf_loss_int + vf_loss_ext
            else:
                vf_loss = 0.5 * tf.reduce_mean((vpred - self.ph_ret) ** 2)

            ratio = tf.exp(self.ph_oldnlp - neglogpac)  # p_new / p_old
            negadv = - self.ph_adv
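            # PPO clipped surrogate: take the more pessimistic of the unclipped and
            # clipped policy-gradient terms (written here as losses on -advantage).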
            pg_losses1 = negadv * ratio
            pg_losses2 = negadv * tf.clip_by_value(ratio, 1.0 - self.ph_cliprange, 1.0 + self.ph_cliprange)
            pg_loss_surr = tf.maximum(pg_losses1, pg_losses2)
            pg_loss = tf.reduce_mean(pg_loss_surr)
            ent_loss = (- ent_coef) * entropy
            approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - self.ph_oldnlp))
            clipfrac = tf.reduce_mean(tf.to_float(tf.abs(pg_losses2 - pg_loss_surr) > 1e-6))

            self.total_loss = pg_loss + ent_loss + vf_loss
            self.to_report = {'tot': self.total_loss, 'pg': pg_loss, 'vf': vf_loss, 'ent': entropy,
                              'approxkl': approxkl, 'clipfrac': clipfrac}

    def start_interaction(self, env_fns, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        if MPI.COMM_WORLD.Get_size() > 1:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        self._train = trainer.apply_gradients(gradsandvars)

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride: (l + 1) * self.lump_stride], spaces=[self.ob_space, self.ac_space]) for
            l in range(self.nlump)]

        self.rollout = Rollout(hps=self.hps, ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_advs_int = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_advs_ext = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets_int = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets_ext = np.zeros((nenvs, self.rollout.nsteps), np.float32)


        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()

    def stop_interaction(self):
        for env in self.envs:
            env.close()


    def update(self):
        # Rewards normalization
        # if self.normrew:
        #     rffs = np.array([self.rff.update(rew) for rew in self.rollout.buf_rews.T])
        #     rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
        #     self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
        #     rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
        
        # Intrinsic Rewards Normalization
        if self.normrew:
            rffs_int = np.array([self.rff.update(rew) for rew in self.rollout.buf_int_rews.T])
            self.rff_rms.update(rffs_int.ravel())        
            int_rews = self.rollout.buf_int_rews / np.sqrt(self.rff_rms.var)
        else:
            int_rews = np.copy(self.rollout.buf_int_rews)
        
        mean_int_rew = np.mean(int_rews)
        max_int_rew = np.max(int_rews)
        
        # Do not normalize extrinsic rewards 
        ext_rews = self.rollout.buf_ext_rews

        nsteps = self.rollout.nsteps

        # If separate value fcn are used
        if self.hps['num_vf']==2:
            #Calculate intrinsic returns and advantages.
            lastgaelam = 0
            for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
                if self.use_news:
                    nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
                else:
                    nextnew = 0 # No dones for intrinsic rewards with self.use_news=False
                nextvals = self.rollout.buf_vpreds_int[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_int_last
                nextnotnew = 1 - nextnew
                delta = int_rews[:, t] + self.gamma * nextvals * nextnotnew - self.rollout.buf_vpreds_int[:, t]
                self.buf_advs_int[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam
            self.buf_rets_int[:] = self.buf_advs_int + self.rollout.buf_vpreds_int

            #Calculate extrinsic returns and advantages.
            lastgaelam = 0

            for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
                nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
                nextvals = self.rollout.buf_vpreds_ext[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_ext_last
                nextnotnew = 1 - nextnew
                delta = ext_rews[:, t] + self.gamma_ext * nextvals * nextnotnew - self.rollout.buf_vpreds_ext[:, t]
                self.buf_advs_ext[:, t] = lastgaelam = delta + self.gamma_ext * self.lam * nextnotnew * lastgaelam
            self.buf_rets_ext[:] = self.buf_advs_ext + self.rollout.buf_vpreds_ext
            
            #Combine the extrinsic and intrinsic advantages.
            self.buf_advs = self.int_coeff*self.buf_advs_int + self.ext_coeff*self.buf_advs_ext
        else:
            #Calculate mixed intrinsic and extrinsic returns and advantages.
            rews = self.rollout.buf_rews = self.rollout.reward_fun(int_rew=int_rews, ext_rew=ext_rews)            
            lastgaelam = 0
            for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
                nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
                nextvals = self.rollout.buf_vpreds[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_last
                nextnotnew = 1 - nextnew
                delta = rews[:, t] + self.gamma * nextvals * nextnotnew - self.rollout.buf_vpreds[:, t]
                self.buf_advs[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam
            self.buf_rets[:] = self.buf_advs + self.rollout.buf_vpreds
        
        info = dict(
            # advmean=self.buf_advs.mean(),
            # advstd=self.buf_advs.std(),  
            recent_best_ext_ret=self.rollout.current_max,
            recent_best_eplen = self.rollout.current_minlen,
            recent_worst_eplen = self.rollout.current_maxlen   
        )

        if self.hps['num_vf'] ==2:
            info['retmean_int']=self.buf_rets_int.mean()
            info['retmean_ext']=self.buf_rets_ext.mean()
            info['retstd_int']=self.buf_rets_int.std()
            info['retstd_ext']=self.buf_rets_ext.std()
            info['vpredmean_int']=self.rollout.buf_vpreds_int.mean()
            info['vpredmean_ext']=self.rollout.buf_vpreds_ext.mean()
            info['vpredstd_int']=self.rollout.buf_vpreds_int.std()
            info['vpredstd_ext']=self.rollout.buf_vpreds_ext.std()
            info['ev_int']=explained_variance(self.rollout.buf_vpreds_int.ravel(), self.buf_rets_int.ravel())            
            info['ev_ext']=explained_variance(self.rollout.buf_vpreds_ext.ravel(), self.buf_rets_ext.ravel())            
            info['rew_int_mean']=mean_int_rew
            info['recent_best_int_rew']=max_int_rew
        else:
            # info['retmean']=self.buf_rets.mean()
            # info['retstd']=self.buf_rets.std()
            # info['vpredmean']=self.rollout.buf_vpreds.mean()
            # info['vpredstd']=self.rollout.buf_vpreds.std()
            info['rew_mean']=np.mean(self.rollout.buf_rews)
            info['eplen_std']=np.std(self.rollout.statlists['eplen'])            
            info['eprew_std']=np.std(self.rollout.statlists['eprew'])
            # info['ev']=explained_variance(self.rollout.buf_vpreds.ravel(), self.buf_rets.ravel())            

        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret
            info['best_eplen'] = self.rollout.best_eplen

        # normalize advantages
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        def resh(x):
            if self.nsegs_per_env == 1:
                return x
            sh = x.shape
            return x.reshape((sh[0] * self.nsegs_per_env, self.nsteps_per_seg) + sh[2:])
        
        #Create feed_dict for optimization.
        ph_buf = [
                (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
                (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
                (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),
                (self.ph_adv, resh(self.buf_advs)),
                ]

        if self.hps['num_vf']==2:
            ph_buf.extend([                
                (self.ph_ret_int, resh(self.buf_rets_int)),
                (self.ph_ret_ext, resh(self.buf_rets_ext)),
            ])       
        else:
            ph_buf.extend([
                (self.ph_rews, resh(self.rollout.buf_rews)),
                (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
                (self.ph_ret, resh(self.buf_rets)),
            ])

        ph_buf.extend([
            (self.dynamics.last_ob,
             self.rollout.buf_obs_last.reshape([self.nenvs * self.nsegs_per_env, 1, *self.ob_space.shape]))
        ])

        #Optimizes on current data for several epochs.
        mblossvals = []

        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}                
                fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})
                mblossvals.append(getsess().run(self._losses + (self._train,), fd)[:-1])

        # Keep only the losses from the first minibatch.
        mblossvals = [mblossvals[0]]
        # info.update(zip(['opt_' + ln for ln in self.loss_names], np.mean([mblossvals[0]], axis=0)))
        # info["rank"] = MPI.COMM_WORLD.Get_rank()
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({dn: (np.mean(dvs) if len(dvs) > 0 else 0) for (dn, dvs) in self.rollout.statlists.items()})
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        # info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        # info['tps'] = MPI.COMM_WORLD.Get_size() * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
        self.t_last_update = tnow

        return info

    def step(self):
        self.rollout.collect_rollout()
        update_info = self.update()
        return {'update': update_info}

    def get_var_values(self):
        return self.stochpol.get_var_values()

    def set_var_values(self, vv):
        self.stochpol.set_var_values(vv)
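The advantage normalization above calls a get_mean_and_std helper that is not included in this snippet. A minimal single-process sketch is shown below; the original helper presumably also aggregates moments across MPI workers, so treat this only as an illustration of the expected interface.

import numpy as np

def get_mean_and_std(arr):
    # Mean and standard deviation over every element of the buffer (no MPI reduction).
    arr = np.asarray(arr, dtype=np.float64)
    return arr.mean(), arr.std()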
Ejemplo n.º 30
0
class CNNBase(NNBase):
    def __init__(self,
                 num_inputs,
                 input_size,
                 action_space,
                 hidden_size=512,
                 recurrent=False,
                 device='cpu'):

        super(CNNBase, self).__init__(recurrent, num_inputs, hidden_size)

        self.device = device
        self.action_space = action_space

        h, w = input_size
        self.conv1 = nn.Conv2d(num_inputs, 32, kernel_size=8, stride=4)
        w_out = conv2d_size_out(w, kernel_size=8, stride=4)
        h_out = conv2d_size_out(h, kernel_size=8, stride=4)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        w_out = conv2d_size_out(w_out, kernel_size=4, stride=2)
        h_out = conv2d_size_out(h_out, kernel_size=4, stride=2)

        self.conv3 = nn.Conv2d(64, 32, kernel_size=3, stride=1)
        w_out = conv2d_size_out(w_out, kernel_size=3, stride=1)
        h_out = conv2d_size_out(h_out, kernel_size=3, stride=1)

        init_cnn_ = lambda m: init(m, nn.init.orthogonal_,
                                   lambda x: nn.init.constant_(x, 0),
                                   nn.init.calculate_gain('relu'))

        self.cnn_trunk = nn.Sequential(
            init_cnn_(self.conv1), nn.ReLU(), init_cnn_(self.conv2), nn.ReLU(),
            init_cnn_(self.conv3), nn.ReLU(), Flatten(),
            init_cnn_(nn.Linear(32 * h_out * w_out, hidden_size)), nn.ReLU())

        init__ = lambda m: init(m, nn.init.orthogonal_,
                                lambda x: nn.init.constant_(x, 0), np.sqrt(2))

        self.trunk = nn.Sequential(
            init__(
                nn.Linear(hidden_size + self.action_space.n,
                          hidden_size // 2)), nn.Tanh(),
            init__(nn.Linear(hidden_size // 2, hidden_size // 2)), nn.Tanh(),
            init__(nn.Linear(hidden_size // 2, 1)))

        self.optimizer = torch.optim.Adam(self.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

    def update(self, expert_loader, rollouts, obsfilt=None):
        self.train()

        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader,
                                              policy_data_generator):

            policy_state, policy_action = policy_batch[0], policy_batch[2]
            policy_state_embedding = self.cnn_trunk(policy_state / 255.0)
            policy_d = self.trunk(
                torch.cat([
                    policy_state_embedding,
                    torch.nn.functional.one_hot(
                        policy_action, self.action_space.n).squeeze(1).float()
                ],
                          dim=1))

            expert_state, expert_action = expert_batch
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_state_embedding = self.cnn_trunk(expert_state / 255.0)
            expert_d = self.trunk(
                torch.cat([expert_state_embedding, expert_action], dim=1))

            expert_loss = F.binary_cross_entropy_with_logits(
                expert_d,
                torch.ones(expert_d.size()).to(self.device))
            policy_loss = F.binary_cross_entropy_with_logits(
                policy_d,
                torch.zeros(policy_d.size()).to(self.device))

            gail_loss = expert_loss + policy_loss
            loss += gail_loss.item()
            n += 1

            self.optimizer.zero_grad()
            gail_loss.backward()
            self.optimizer.step()
        return loss / n

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            state_embedding = self.cnn_trunk(state / 255.)
            d = self.trunk(
                torch.cat([
                    state_embedding,
                    torch.nn.functional.one_hot(
                        action, self.action_space.n).squeeze(1).float()
                ],
                          dim=1))
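            # Convert the discriminator logit d into the GAIL reward
            # log D(s, a) - log(1 - D(s, a)); see the check after this class.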
            s = torch.sigmoid(d)
            reward = s.log() - (1 - s).log()
            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())

            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
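A quick sanity check on predict_reward above: since log(sigmoid(d)) - log(1 - sigmoid(d)) simplifies to d, the unnormalized GAIL reward is exactly the discriminator logit, and the running-return normalization is the only transformation applied. A tiny self-contained check (the tensor name demo_logits is illustrative):

import torch

demo_logits = torch.tensor([-2.0, 0.0, 3.0])
s = torch.sigmoid(demo_logits)
reward = s.log() - (1 - s).log()
# Recovers the original logits up to floating-point error.
print(torch.allclose(reward, demo_logits, atol=1e-5))  # True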
Ejemplo n.º 31
0
class VecNormalize(ABC):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        self.venv = venv
        # Bookkeeping attributes that the base VecEnv would normally provide.
        self.closed = False
        self.viewer = None

        self.num_envs = venv.num_envs
        self.observation_space = venv.observation_space
        self.action_space = venv.action_space

        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)

    def close(self):
        if self.closed:
            return
        if self.viewer is not None:
            self.viewer.close()
        # Close the wrapped vectorized environment.
        self.venv.close()
        self.closed = True

    def step(self, actions):
        """
        Step the environments synchronously.

        This is available for backwards compatibility.
        """
        # Delegate to the wrapped vectorized environment.
        self.venv.step_async(actions)
        return self.step_wait()
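A usage sketch for the wrapper above, assuming the RunningMeanStd class from the earlier examples is in scope and that baselines' DummyVecEnv and Gym's CartPole-v1 are available:

import gym
import numpy as np
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v1')])
venv = VecNormalize(venv, ob=True, ret=True, clipob=10., cliprew=10.)

obs = venv.reset()  # observations are mean/std normalized and clipped
for _ in range(10):
    actions = np.array([venv.action_space.sample()])
    # Rewards are scaled by the running std of the discounted return.
    obs, rews, dones, infos = venv.step(actions)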
class Runner(AbstractEnvRunner):
    """
    We use this object to make a mini batch of experiences
    __init__:
    - Initialize the runner

    run():
    - Make a mini batch
    """
    def __init__(self, *, env, model, nsteps, gamma, lam):
        super().__init__(env=env, model=model, nsteps=nsteps)
        # Lambda used in GAE (General Advantage Estimation)
        self.lam = lam
        # Discount rate
        self.gamma = gamma
        self.clipob = 10.
        self.cliprew = 10.
        self.epsilon = 1e-8
        self.ret = 0
        self.ob_rms = RunningMeanStd(shape=self.env.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())

    def obfilt(self, obs):
        obs = np.clip(
            (obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
            -self.clipob, self.clipob)
        return obs

    def rewfilt(self, rews):
        rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                       -self.cliprew, self.cliprew)
        return rews

    def run(self):
        # Here, we init the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs, mb_means, mb_logstds = [], [], [], [], [], [], [], []
        mb_states = self.states
        epinfos = []
        # For n in range number of steps
        for _ in range(self.nsteps):
            # Given observations, get actions, values and neglogpacs
            # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
            actions, values, self.states, neglogpacs = self.model.step(
                self.obfilt(self.obs), S=self.states, M=self.dones)
            means, logstds = self.model.meanlogstd(self.obfilt(self.obs))
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)
            mb_means.append(means)
            mb_logstds.append(logstds)

            # Take actions in the env and look at the results
            # infos contains a lot of useful information
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            self.ob_rms.update(self.obs)
            self.ret = self.ret * self.gamma + rewards
            self.ret_rms.update(self.ret)
            # Reset the running return at the end of an episode
            # (this truthiness check assumes a single environment).
            if self.dones:
                self.ret = 0

            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo: epinfos.append(maybeepinfo)
            mb_rewards.append(rewards)
        # Batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_means = np.asarray(mb_means)
        mb_logstds = np.asarray(mb_logstds)
        # mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=bool)  # np.bool is removed in newer NumPy
        # last_values = self.model.value(self.obs, S=self.states, M=self.dones)
        # discount/bootstrap off value fn (see the standalone GAE sketch after this class)
        # mb_returns = np.zeros_like(mb_rewards)
        # mb_advs = np.zeros_like(mb_rewards)
        # lastgaelam = 0
        # for t in reversed(range(self.nsteps)):
        #     if t == self.nsteps - 1:
        #         nextnonterminal = 1.0 - self.dones
        #         nextvalues = last_values
        #     else:
        #         nextnonterminal = 1.0 - mb_dones[t+1]
        #         nextvalues = mb_values[t+1]
        #     delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
        #     mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        # mb_returns = mb_advs + mb_values
        return (*map(sf01,
                     (mb_obs, mb_rewards, mb_dones, mb_actions, mb_neglogpacs,
                      mb_means, mb_logstds)), self.obs, self.dones, epinfos)
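The GAE(lambda) computation is commented out in run() above; for reference, here is a standalone sketch of the same recursion. Argument names mirror the mb_* buffers but are otherwise illustrative.

import numpy as np

def compute_gae(rewards, values, dones, last_values, last_dones, gamma, lam):
    # rewards, values, dones: arrays of shape (nsteps, nenvs);
    # last_values, last_dones: value estimates / done flags for the step after the rollout.
    nsteps = rewards.shape[0]
    advs = np.zeros_like(rewards)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        if t == nsteps - 1:
            nextnonterminal = 1.0 - last_dones
            nextvalues = last_values
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalues = values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
    returns = advs + values
    return advs, returns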