Code Example #1
    def learn(self, D, **kwargs):
        logprobs = [torch.cat(traj.get_infos('action_logprob')) for traj in D]
        entropies = [torch.cat(traj.get_infos('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_infos('V')) for traj in D]
        last_Vs = [traj.extra_info['last_info']['V'] for traj in D]
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj.rewards,
                                 last_V, traj.reach_terminal)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj.rewards, V, last_V, traj.reach_terminal)
            for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-4)
        assert all([x.ndim == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        # Loss
        policy_loss = -logprobs * As.detach()
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, Qs, reduction='none')
        loss = (policy_loss
                + self.config['agent.value_coef'] * value_loss
                + self.config['agent.entropy_coef'] * entropy_loss)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)
        self.total_timestep += sum([traj.T for traj in D])

        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -out['entropy_loss']
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(Qs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
Code Example #2
def td0_target(gamma, rewards, Vs, last_V, reach_terminal):
    r"""Calculate TD(0) targets of a batch of episodic transitions. 
    
    Let :math:`r_1, r_2, \dots, r_T` be a list of rewards and let :math:`V(s_0), V(s_1), \dots, V(s_{T-1}), V(s_{T})`
    be a list of state values including a last state value. Let :math:`\gamma` be a discounted factor, 
    the TD(0) targets are calculated as follows
        
    .. math::
        r_t + \gamma V(s_t), \forall t = 1, 2, \dots, T
        
    .. note::

        The state values for terminal states are masked out as zero !
    
    """
    rewards = numpify(rewards, np.float32)
    Vs = numpify(Vs, np.float32)
    last_V = numpify(last_V, np.float32)

    if reach_terminal:
        Vs = np.append(Vs, 0.0)
    else:
        Vs = np.append(Vs, last_V)
    out = rewards + gamma * Vs[1:]
    return out.astype(np.float32)
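As a quick sanity check on the docstring's formula, here is a minimal, self-contained NumPy sketch that mirrors the indexing and terminal masking of td0_target above. The helper name td0_targets_sketch and all numbers (rewards, values, gamma) are made up for illustration and are not part of the project.

import numpy as np

def td0_targets_sketch(gamma, rewards, Vs, last_V, reach_terminal):
    # target_t = r_t + gamma * V(s_t); the bootstrap value is masked to zero at a terminal state
    rewards = np.asarray(rewards, dtype=np.float32)
    Vs = np.asarray(Vs, dtype=np.float32)
    Vs = np.append(Vs, 0.0 if reach_terminal else last_V)
    return rewards + gamma * Vs[1:]

# Hand-checked: r = (1, 2, 3), V(s_0..s_2) = (0.5, 1.0, 1.5), V(s_3) = 2.0, gamma = 0.99
targets = td0_targets_sketch(0.99, [1.0, 2.0, 3.0], [0.5, 1.0, 1.5], 2.0, reach_terminal=False)
assert np.allclose(targets, [1.0 + 0.99 * 1.0, 2.0 + 0.99 * 1.5, 3.0 + 0.99 * 2.0])

# With a terminal last state, the bootstrap value is masked out.
targets = td0_targets_sketch(0.99, [1.0, 2.0, 3.0], [0.5, 1.0, 1.5], 2.0, reach_terminal=True)
assert np.allclose(targets, [1.99, 3.485, 3.0])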
Code Example #3
File: test_metric.py  Project: zuoxingdong/lagom
def test_vtrace(gamma, last_V, reach_terminal, clip_rho, clip_pg_rho):
    behavior_logprobs = [1, 2, 3]
    target_logprobs = [4, 5, 6]
    Rs = [7, 8, 9]
    Vs = [10, 11, 12]

    vs_test, As_test = vtrace(behavior_logprobs, target_logprobs, gamma, Rs,
                              Vs, last_V, reach_terminal, clip_rho,
                              clip_pg_rho)

    # ground truth calculation
    behavior_logprobs = numpify(behavior_logprobs, np.float32)
    target_logprobs = numpify(target_logprobs, np.float32)
    Rs = numpify(Rs, np.float32)
    Vs = numpify(Vs, np.float32)
    last_V = numpify(last_V, np.float32)

    rhos = np.exp(target_logprobs - behavior_logprobs)
    clipped_rhos = np.minimum(clip_rho, rhos)
    cs = np.minimum(1.0, rhos)
    deltas = clipped_rhos * td0_error(gamma, Rs, Vs, last_V, reach_terminal)

    vs = np.array([
        Vs[0] + gamma**0 * 1 * deltas[0] + gamma * cs[0] * deltas[1] +
        gamma**2 * cs[0] * cs[1] * deltas[2],
        Vs[1] + gamma**0 * 1 * deltas[1] + gamma * cs[1] * deltas[2],
        Vs[2] + gamma**0 * 1 * deltas[2]
    ])
    vs_next = np.append(vs[1:], (1. - reach_terminal) * last_V)
    clipped_pg_rhos = np.minimum(clip_pg_rho, rhos)
    As = clipped_pg_rhos * (Rs + gamma * vs_next - Vs)

    assert np.allclose(vs, vs_test)
    assert np.allclose(As, As_test)
Code Example #4
File: agent.py  Project: zuoxingdong/lagom
    def choose_action(self, x, **kwargs):
        obs = tensorify(x.observation, self.device).unsqueeze(0)
        with torch.no_grad():
            if kwargs['mode'] == 'train':
                action = numpify(self.actor(obs).sample(), 'float')
            elif kwargs['mode'] == 'eval':
                action = numpify(torch.tanh(self.actor.mean_forward(obs)), 'float')
        out = {}
        out['raw_action'] = action.squeeze(0)
        return out
Code Example #5
File: agent.py  Project: zuoxingdong/lagom
    def learn_one_update(self, data):
        data = [d.detach().to(self.device) for d in data]
        observations, old_actions, old_logprobs, old_entropies, old_Vs, old_Qs, old_As = data

        action_dist = self.policy(observations)
        logprobs = action_dist.log_prob(old_actions).squeeze()
        entropies = action_dist.entropy().squeeze()
        Vs = self.value(observations).squeeze()
        assert all([x.ndim == 1 for x in [logprobs, entropies, Vs]])

        ratio = torch.exp(logprobs - old_logprobs)
        eps = self.config['agent.clip_range']
        policy_loss = -torch.min(
            ratio * old_As,
            torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * old_As)
        policy_loss = policy_loss.mean(0)

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        policy_grad_norm = nn.utils.clip_grad_norm_(
            self.policy.parameters(), self.config['agent.max_grad_norm'])
        self.policy_optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.policy_lr_scheduler.step(self.total_timestep)

        clipped_Vs = old_Vs + torch.clamp(Vs - old_Vs, -eps, eps)
        value_loss = torch.max(
            F.mse_loss(Vs, old_Qs, reduction='none'),
            F.mse_loss(clipped_Vs, old_Qs, reduction='none'))
        value_loss = value_loss.mean(0)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        value_grad_norm = nn.utils.clip_grad_norm_(
            self.value.parameters(), self.config['agent.max_grad_norm'])
        self.value_optimizer.step()

        out = {}
        out['policy_grad_norm'] = policy_grad_norm
        out['value_grad_norm'] = value_grad_norm
        out['policy_loss'] = policy_loss.item()
        out['policy_entropy'] = entropies.mean().item()
        out['value_loss'] = value_loss.item()
        out['explained_variance'] = ev(y_true=numpify(old_Qs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        out['approx_kl'] = (old_logprobs - logprobs).mean(0).item()
        out['clip_frac'] = ((ratio < 1.0 - eps) |
                            (ratio > 1.0 + eps)).float().mean(0).item()
        return out
Code Example #6
    def choose_action(self, obs, **kwargs):
        obs = tensorify(obs, self.device)
        out = {}
        if kwargs['mode'] == 'train':
            dist = self.actor(obs)
            action = dist.rsample()
            out['action'] = action
            out['action_logprob'] = dist.log_prob(action)
        elif kwargs['mode'] == 'stochastic':
            with torch.no_grad():
                out['action'] = numpify(self.actor(obs).sample(), 'float')
        elif kwargs['mode'] == 'eval':
            with torch.no_grad():
                out['action'] = numpify(torch.tanh(self.actor.mean_forward(obs)), 'float')
        else:
            raise NotImplementedError
        return out
Code Example #7
File: agent.py  Project: zuoxingdong/lagom
    def choose_action(self, x, **kwargs):
        obs = tensorify(x.observation, self.device).unsqueeze(0)
        features = self.feature_network(obs)
        action_dist = self.action_head(features)
        action = action_dist.sample()
        out = {}
        out['raw_action'] = numpify(action, self.env.action_space.dtype).squeeze(0)
        return out
Code Example #8
def test_numpify():
    # tensor
    x = torch.tensor(2.43)
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    x = torch.randn(10)
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    x = torch.randn(10, 20, 30)
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    # ndarray
    x = np.array(2.43)
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    x = np.random.randn(10)
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    x = np.random.randn(10, 20, 30)
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    # raw list
    x = [2.43]
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    x = [1, 2, 3, 4, 5, 6]
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y

    x = [[1, 2], [3, 4], [5, 6]]
    y = numpify(x, np.float32)
    assert np.allclose(x, y)
    del x, y
Code Example #9
File: agent.py  Project: ludwigwinkler/lagom
    def choose_action(self, obs, **kwargs):
        obs = tensorify(obs, self.device)
        with torch.no_grad():
            action = numpify(self.actor(obs), 'float')
        if kwargs['mode'] == 'train':
            eps = np.random.normal(0.0, self.config['agent.action_noise'], size=action.shape)
            action = np.clip(action + eps, self.env.action_space.low, self.env.action_space.high)
        out = {}
        out['action'] = action
        return out
Code Example #10
    def choose_action(self, obs, **kwargs):
        obs = tensorify(obs, self.device)
        out = {}
        features = self.feature_network(obs)

        action_dist = self.action_head(features)
        out['entropy'] = action_dist.entropy()
        action = action_dist.sample()
        out['raw_action'] = numpify(action, 'float')
        return out
Code Example #11
    def learn(self, D, **kwargs):
        replay = kwargs['replay']
        episode_length = kwargs['episode_length']
        out = {}
        out['actor_loss'] = []
        out['critic_loss'] = []
        Q_vals = []
        for i in range(episode_length):
            observations, actions, rewards, next_observations, masks = replay.sample(
                self.config['replay.batch_size'])

            Qs = self.critic(observations, actions).squeeze()
            with torch.no_grad():
                next_Qs = self.critic_target(
                    next_observations,
                    self.actor_target(next_observations)).squeeze()
            targets = rewards + self.config['agent.gamma'] * masks * next_Qs.detach()

            critic_loss = F.mse_loss(Qs, targets)
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_grad_norm = nn.utils.clip_grad_norm_(
                self.critic.parameters(), self.config['agent.max_grad_norm'])
            self.critic_optimizer.step()

            actor_loss = -self.critic(observations,
                                      self.actor(observations)).mean()
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            actor_grad_norm = nn.utils.clip_grad_norm_(
                self.actor.parameters(), self.config['agent.max_grad_norm'])
            self.actor_optimizer.step()

            self.polyak_update_target()

            out['actor_loss'].append(actor_loss)
            out['critic_loss'].append(critic_loss)
            Q_vals.append(Qs)
        out['actor_loss'] = torch.stack(out['actor_loss']).mean().item()
        out['actor_grad_norm'] = actor_grad_norm
        out['critic_loss'] = torch.stack(out['critic_loss']).mean().item()
        out['critic_grad_norm'] = critic_grad_norm
        describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(),
                                         axis=-1, repr_indent=1, repr_prefix='\n')
        out['Q'] = describe_it(Q_vals)
        return out
Code Example #12
File: agent.py  Project: LorinChen/lagom
    def choose_action(self, x, **kwargs):
        obs = tensorify(x.observation, self.device).unsqueeze(0)
        features = self.feature_network(obs)
        action_dist = self.action_head(features)
        V = self.V_head(features)
        action = action_dist.sample()
        out = {}
        out['action_dist'] = action_dist
        out['V'] = V
        out['entropy'] = action_dist.entropy()
        out['action'] = action
        out['raw_action'] = numpify(action, self.env.action_space.dtype).squeeze(0)
        out['action_logprob'] = action_dist.log_prob(action.detach())
        return out
Code Example #13
File: vtrace.py  Project: zuoxingdong/lagom
def vtrace(behavior_logprobs,
           target_logprobs,
           gamma,
           Rs,
           Vs,
           last_V,
           reach_terminal,
           clip_rho=1.0,
           clip_pg_rho=1.0):
    behavior_logprobs = numpify(behavior_logprobs, np.float32)
    target_logprobs = numpify(target_logprobs, np.float32)
    Rs = numpify(Rs, np.float32)
    Vs = numpify(Vs, np.float32)
    last_V = numpify(last_V, np.float32)
    assert all([
        item.ndim == 1
        for item in [behavior_logprobs, target_logprobs, Rs, Vs]
    ])
    assert np.isscalar(gamma)

    # Importance ratios between target and behavior policies, with IMPALA-style clipping
    rhos = np.exp(target_logprobs - behavior_logprobs)
    clipped_rhos = np.minimum(clip_rho, rhos)
    cs = np.minimum(1.0, rhos)
    deltas = clipped_rhos * td0_error(gamma, Rs, Vs, last_V, reach_terminal)

    # Backward recursion: (v_s - V_s) = delta_s + gamma * c_s * (v_{s+1} - V_{s+1})
    vs_minus_V = []
    total = 0.0
    for delta_t, c_t in zip(deltas[::-1], cs[::-1]):
        total = delta_t + gamma * c_t * total
        vs_minus_V.append(total)
    vs_minus_V = np.asarray(vs_minus_V)[::-1]

    vs = vs_minus_V + Vs
    # Policy-gradient advantages, bootstrapped from v_{s+1}
    vs_next = np.append(vs[1:], (1. - reach_terminal) * last_V)
    clipped_pg_rhos = np.minimum(clip_pg_rho, rhos)
    As = clipped_pg_rhos * (Rs + gamma * vs_next - Vs)
    return vs, As
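For reference, the backward recursion above computes v_s - V(x_s) = delta_s + gamma * c_s * (v_{s+1} - V(x_{s+1})). The following self-contained NumPy sketch cross-checks that recursion against the explicit three-step expansion used in the ground-truth test of Code Example #3; the TD(0) error is re-derived inline instead of importing td0_error, and all numbers (rewards, values, importance ratios, gamma) are made up for illustration.

import numpy as np

gamma, last_V, reach_terminal = 0.95, 12.0, False
Rs = np.array([7., 8., 9.], dtype=np.float32)
Vs = np.array([10., 11., 12.], dtype=np.float32)
rhos = np.array([0.8, 1.0, 1.3], dtype=np.float32)  # made-up importance ratios
clipped_rhos = np.minimum(1.0, rhos)  # clip_rho = 1.0
cs = np.minimum(1.0, rhos)

# TD(0) errors: delta_s = clipped_rho_s * (r_s + gamma * V(x_{s+1}) - V(x_s))
Vs_next = np.append(Vs[1:], (1. - reach_terminal) * last_V)
deltas = clipped_rhos * (Rs + gamma * Vs_next - Vs)

# Backward recursion, as in vtrace above
total, acc = 0.0, []
for delta_t, c_t in zip(deltas[::-1], cs[::-1]):
    total = delta_t + gamma * c_t * total
    acc.append(total)
vs = np.asarray(acc)[::-1] + Vs

# Explicit truncated expansion, as in the ground-truth test
vs_explicit = np.array([
    Vs[0] + deltas[0] + gamma * cs[0] * deltas[1] + gamma**2 * cs[0] * cs[1] * deltas[2],
    Vs[1] + deltas[1] + gamma * cs[1] * deltas[2],
    Vs[2] + deltas[2],
])
assert np.allclose(vs, vs_explicit)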
Code Example #14
    def choose_action(self, obs, **kwargs):
        obs = tensorify(obs, self.device)
        out = {}

        action_dist = self.policy(obs)
        out['action_dist'] = action_dist
        out['entropy'] = action_dist.entropy()

        action = action_dist.sample()
        out['action'] = action
        out['raw_action'] = numpify(action, 'float')
        out['action_logprob'] = action_dist.log_prob(action.detach())

        V = self.value(obs)
        out['V'] = V
        return out
Code Example #15
File: agent.py  Project: StanfordVL/lagom
    def choose_action(self, obs, **kwargs):
        obs = tensorify(obs, self.device)
        out = {}
        features = self.feature_network(obs)

        action_dist = self.action_head(features)
        out['action_dist'] = action_dist
        out['entropy'] = action_dist.entropy()

        action = action_dist.sample()
        out['action'] = action
        out['raw_action'] = numpify(action, 'float')
        out['action_logprob'] = action_dist.log_prob(action.detach())

        V = self.V_head(features)
        out['V'] = V
        return out
Code Example #16
File: agent_lstm.py  Project: zuoxingdong/lagom
    def choose_action(self, x, **kwargs):
        if x.first():
            self.state = self.reset(1)
        obs = tensorify(x.observation, self.device).unsqueeze(0)
        obs = obs.unsqueeze(0)  # add seq_dim
        features, [next_state] = self.feature_network(obs, [self.state])
        if 'last_info' not in kwargs:
            self.state = next_state
        features = features.squeeze(0)  # squeeze seq_dim
        action_dist = self.action_head(features)
        V = self.V_head(features)
        action = action_dist.sample()
        out = {}
        out['action_dist'] = action_dist
        out['V'] = V
        out['entropy'] = action_dist.entropy()
        out['action'] = action
        out['raw_action'] = numpify(action, self.env.action_space.dtype).squeeze(0)
        out['action_logprob'] = action_dist.log_prob(action.detach())
        return out
Code Example #17
def bootstrapped_returns(gamma, rewards, last_V, reach_terminal):
    r"""Return (discounted) accumulated returns with bootstrapping for a 
    batch of episodic transitions. 
    
    Formally, suppose we have all rewards :math:`(r_1, \dots, r_T)`, it computes
        
    .. math::
        Q_t = r_t + \gamma r_{t+1} + \dots + \gamma^{T - t} r_T + \gamma^{T - t + 1} V(s_{T+1})
        
    .. note::

        The state values for terminal states are masked out as zero !

    """
    last_V = numpify(last_V, np.float32).item()

    if reach_terminal:
        out = geometric_cumsum(gamma, np.append(rewards, 0.0))
    else:
        out = geometric_cumsum(gamma, np.append(rewards, last_V))
    return out[0, :-1].astype(np.float32)
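The same quantity can also be written as the backward recursion Q_t = r_t + gamma * Q_{t+1}, bootstrapped from V(s_{T+1}) (or from zero at a terminal state). Below is a minimal, self-contained sketch of that recursion with hand-checked numbers; the helper name bootstrapped_returns_sketch and the inputs are illustrative and do not rely on the project's geometric_cumsum.

import numpy as np

def bootstrapped_returns_sketch(gamma, rewards, last_V, reach_terminal):
    # Backward recursion Q_t = r_t + gamma * Q_{t+1}, seeded with the bootstrap value
    Q_next = 0.0 if reach_terminal else float(last_V)
    out = []
    for r in reversed(rewards):
        Q_next = r + gamma * Q_next
        out.append(Q_next)
    return np.asarray(out[::-1], dtype=np.float32)

# Hand-checked: r = (1, 2, 3), gamma = 0.9, V(s_4) = 10, non-terminal
Qs = bootstrapped_returns_sketch(0.9, [1.0, 2.0, 3.0], 10.0, reach_terminal=False)
assert np.allclose(Qs, [12.52, 12.8, 12.0])  # Q_3 = 3 + 0.9*10, Q_2 = 2 + 0.9*Q_3, Q_1 = 1 + 0.9*Q_2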
Code Example #18
    def learn(self, D, **kwargs):
        replay = kwargs['replay']
        episode_length = kwargs['episode_length']
        out = {}
        out['actor_loss'] = []
        out['critic_loss'] = []
        out['alpha_loss'] = []
        Q1_vals = []
        Q2_vals = []
        logprob_vals = []
        for i in range(episode_length):
            observations, actions, rewards, next_observations, masks = replay.sample(
                self.config['replay.batch_size'])

            Qs1, Qs2 = self.critic(observations, actions)
            Qs1, Qs2 = map(lambda x: x.squeeze(-1), [Qs1, Qs2])
            with torch.no_grad():
                out_actor = self.choose_action(next_observations, mode='train')
                next_actions = out_actor['action']
                next_actions_logprob = out_actor['action_logprob']
                next_Qs1, next_Qs2 = self.critic_target(
                    next_observations, next_actions)
                next_Qs = (torch.min(next_Qs1, next_Qs2).squeeze(-1)
                           - self.alpha.detach() * next_actions_logprob)
                Q_targets = rewards + self.config['agent.gamma'] * masks * next_Qs

            critic_loss = F.mse_loss(Qs1, Q_targets.detach()) + F.mse_loss(
                Qs2, Q_targets.detach())
            self.optimizer_zero_grad()
            critic_loss.backward()
            critic_grad_norm = nn.utils.clip_grad_norm_(
                self.critic.parameters(), self.config['agent.max_grad_norm'])
            self.critic_optimizer.step()

            if i % self.config['agent.policy_delay'] == 0:
                out_actor = self.choose_action(observations, mode='train')
                policy_actions = out_actor['action']
                policy_actions_logprob = out_actor['action_logprob']

                actor_Qs1, actor_Qs2 = self.critic(observations,
                                                   policy_actions)
                actor_Qs = torch.min(actor_Qs1, actor_Qs2).squeeze(-1)
                actor_loss = torch.mean(self.alpha.detach() *
                                        policy_actions_logprob - actor_Qs)

                self.optimizer_zero_grad()
                actor_loss.backward()
                actor_grad_norm = nn.utils.clip_grad_norm_(
                    self.actor.parameters(),
                    self.config['agent.max_grad_norm'])
                self.actor_optimizer.step()

                alpha_loss = torch.mean(
                    self.log_alpha *
                    (-policy_actions_logprob - self.target_entropy).detach())

                self.optimizer_zero_grad()
                alpha_loss.backward()
                self.log_alpha_optimizer.step()

                self.polyak_update_target()

                out['actor_loss'].append(actor_loss)
                out['alpha_loss'].append(alpha_loss)
                # only log the policy logprob when the actor is actually updated
                logprob_vals.append(policy_actions_logprob)
            out['critic_loss'].append(critic_loss)
            Q1_vals.append(Qs1)
            Q2_vals.append(Qs2)
        out['actor_loss'] = torch.tensor(out['actor_loss']).mean().item()
        out['actor_grad_norm'] = actor_grad_norm
        out['critic_loss'] = torch.tensor(out['critic_loss']).mean().item()
        out['critic_grad_norm'] = critic_grad_norm
        describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(),
                                         axis=-1, repr_indent=1, repr_prefix='\n')
        out['Q1'] = describe_it(Q1_vals)
        out['Q2'] = describe_it(Q2_vals)
        out['logprob'] = describe_it(logprob_vals)
        out['alpha_loss'] = torch.tensor(out['alpha_loss']).mean().item()
        out['alpha'] = self.alpha.item()
        return out
Code Example #19
File: agent.py  Project: zuoxingdong/lagom
    def learn(self, D, **kwargs):
        # Compute all metrics, D: list of Trajectory
        Ts = [len(traj) for traj in D]
        behavior_logprobs = [
            torch.cat(traj.get_all_info('action_logprob')) for traj in D
        ]
        out_agent = self.choose_action(
            np.concatenate([traj.numpy_observations[:-1] for traj in D], 0))
        logprobs = out_agent['action_logprob'].squeeze()
        entropies = out_agent['entropy'].squeeze()
        Vs = out_agent['V'].squeeze()
        with torch.no_grad():
            last_observations = tensorify(
                np.concatenate([traj.last_observation for traj in D], 0),
                self.device)
            last_Vs = self.V_head(
                self.feature_network(last_observations)).squeeze(-1)

        vs, As = [], []
        for traj, behavior_logprob, logprob, V, last_V in zip(
                D, behavior_logprobs,
                logprobs.detach().cpu().split(Ts),
                Vs.detach().cpu().split(Ts), last_Vs):
            v, A = vtrace(behavior_logprob, logprob, self.gamma, traj.rewards,
                          V, last_V, traj.reach_terminal, self.clip_rho,
                          self.clip_pg_rho)
            vs.append(v)
            As.append(A)

        # Metrics -> Tensor, device
        vs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [vs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-8)

        assert all(
            [x.ndimension() == 1 for x in [logprobs, entropies, Vs, vs, As]])

        # Loss
        policy_loss = -logprobs * As
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, vs, reduction='none')

        loss = (policy_loss
                + self.config['agent.value_coef'] * value_loss
                + self.config['agent.entropy_coef'] * entropy_loss)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)

        self.total_timestep += sum([len(traj) for traj in D])
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -entropy_loss.mean().item()
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(vs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
Code Example #20
File: test_utils.py  Project: zuoxingdong/lagom
def test_numpify():
    # Tensor
    x = torch.randn(5)
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert x.shape == y.shape
    del x, y

    x = torch.randn(5, 4)
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert x.shape == y.shape
    del x, y

    x = torch.randn(5, 4)
    y = numpify(x, dtype=np.float16)
    assert isinstance(y, np.ndarray)
    assert x.shape == y.shape
    assert y.dtype == np.float16
    del x, y

    # Array
    x = np.random.randn(5)
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert x.shape == y.shape
    del x, y

    x = np.random.randn(5, 4)
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert x.shape == y.shape
    del x, y

    x = np.random.randn(5, 4)
    y = numpify(x, dtype=np.float16)
    assert isinstance(y, np.ndarray)
    assert x.shape == y.shape
    assert y.dtype == np.float16
    del x, y

    # List
    x = [1, 2, 3, 4]
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert np.allclose(x, y)
    del x, y

    x = [[1.2, 2.3], [3.4, 4.5]]
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert np.allclose(x, y)
    del x, y

    # Tuple
    x = (1, 2, 3, 4)
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert np.allclose(x, y)
    del x, y

    x = ((1.2, 2.3), (3.4, 4.5))
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    assert np.allclose(x, y)
    del x, y

    # Scalar
    x = 1
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    del x, y

    # Bool
    x = True
    y = numpify(x)
    assert isinstance(y, np.ndarray)
    del x, y
Code Example #21
File: agent.py  Project: StanfordVL/lagom
    def learn(self, D, **kwargs):
        # Compute all metrics, D: list of Trajectory
        logprobs = [
            torch.cat(traj.get_all_info('action_logprob')) for traj in D
        ]
        entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_all_info('V')) for traj in D]

        with torch.no_grad():
            last_observations = tensorify(
                np.concatenate([traj.last_observation for traj in D], 0),
                self.device)
            last_Vs = self.V_head(
                self.feature_network(last_observations)).squeeze(-1)
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj, V, last_V) for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-8)

        assert all(
            [x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        # Loss
        policy_loss = -logprobs * As
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, Qs, reduction='none')

        loss = (policy_loss
                + self.config['agent.value_coef'] * value_loss
                + self.config['agent.entropy_coef'] * entropy_loss)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)

        self.total_timestep += sum([len(traj) for traj in D])
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -entropy_loss.mean().item()
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(Qs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
Code Example #22
File: td3_agent.py  Project: zuoxingdong/lagom
    def learn(self, D, **kwargs):
        replay = kwargs['replay']
        T = kwargs['T']
        list_actor_loss = []
        list_critic_loss = []
        Q1_vals = []
        Q2_vals = []
        for i in range(T):
            observations, actions, rewards, next_observations, masks = replay.sample(
                self.config['replay.batch_size'])

            Qs1, Qs2 = self.critic(observations, actions)
            with torch.no_grad():
                next_actions = self.actor_target(next_observations)
                eps = torch.empty_like(next_actions).normal_(
                    0.0, self.config['agent.target_noise'])
                eps = eps.clamp(-self.config['agent.target_noise_clip'],
                                self.config['agent.target_noise_clip'])
                next_actions = torch.clamp(next_actions + eps,
                                           -self.max_action, self.max_action)
                next_Qs1, next_Qs2 = self.critic_target(
                    next_observations, next_actions)
                next_Qs = torch.min(next_Qs1, next_Qs2)
                targets = rewards + self.config['agent.gamma'] * masks * next_Qs
            critic_loss = F.mse_loss(Qs1, targets.detach()) + F.mse_loss(
                Qs2, targets.detach())
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_grad_norm = nn.utils.clip_grad_norm_(
                self.critic.parameters(), self.config['agent.max_grad_norm'])
            self.critic_optimizer.step()

            if i % self.config['agent.policy_delay'] == 0:
                actor_loss = -self.critic.Q1(observations,
                                             self.actor(observations)).mean()
                self.actor_optimizer.zero_grad()
                self.critic_optimizer.zero_grad()
                actor_loss.backward()
                actor_grad_norm = nn.utils.clip_grad_norm_(
                    self.actor.parameters(),
                    self.config['agent.max_grad_norm'])
                self.actor_optimizer.step()

                self.polyak_update_target()
                list_actor_loss.append(actor_loss)
            list_critic_loss.append(critic_loss)
            Q1_vals.append(Qs1)
            Q2_vals.append(Qs2)
        self.total_timestep += T

        out = {}
        out['actor_loss'] = torch.tensor(list_actor_loss).mean(0).item()
        out['actor_grad_norm'] = actor_grad_norm
        out['critic_loss'] = torch.tensor(list_critic_loss).mean(0).item()
        out['critic_grad_norm'] = critic_grad_norm
        describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(),
                                         axis=-1, repr_indent=1, repr_prefix='\n')
        out['Q1'] = describe_it(Q1_vals)
        out['Q2'] = describe_it(Q2_vals)
        return out
Code Example #23
File: agent.py  Project: zuoxingdong/lagom
    def learn(self, D, **kwargs):
        replay = kwargs['replay']
        T = kwargs['T']
        list_actor_loss = []
        list_critic_loss = []
        list_alpha_loss = []
        Q1_vals = []
        Q2_vals = []
        logprob_vals = []
        for i in range(T):
            observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])
            
            Qs1, Qs2 = self.critic(observations, actions)
            with torch.no_grad():
                action_dist = self.actor(next_observations)
                next_actions = action_dist.rsample()
                next_actions_logprob = action_dist.log_prob(next_actions).unsqueeze(-1)
                next_Qs1, next_Qs2 = self.critic_target(next_observations, next_actions)
                next_Qs = torch.min(next_Qs1, next_Qs2) - self.alpha.detach()*next_actions_logprob
                targets = rewards + self.config['agent.gamma']*masks*next_Qs
            critic_loss = F.mse_loss(Qs1, targets.detach()) + F.mse_loss(Qs2, targets.detach())
            self.optimizer_zero_grad()
            critic_loss.backward()
            critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
            self.critic_optimizer.step()
            
            action_dist = self.actor(observations)
            policy_actions = action_dist.rsample()
            policy_actions_logprob = action_dist.log_prob(policy_actions).unsqueeze(-1)
            actor_Qs1, actor_Qs2 = self.critic(observations, policy_actions)
            actor_Qs = torch.min(actor_Qs1, actor_Qs2)
            actor_loss = torch.mean(self.alpha.detach()*policy_actions_logprob - actor_Qs)
            self.optimizer_zero_grad()
            actor_loss.backward()
            actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
            self.actor_optimizer.step()
            
            alpha_loss = torch.mean(self.log_alpha*(-policy_actions_logprob - self.target_entropy).detach())
            self.optimizer_zero_grad()
            alpha_loss.backward()
            self.log_alpha_optimizer.step()

            self.polyak_update_target()
            list_actor_loss.append(actor_loss)
            list_critic_loss.append(critic_loss)
            list_alpha_loss.append(alpha_loss)
            Q1_vals.append(Qs1)
            Q2_vals.append(Qs2)
            logprob_vals.append(policy_actions_logprob)
        self.total_timestep += T
        
        out = {}
        out['actor_loss'] = torch.tensor(list_actor_loss).mean(0).item()
        out['actor_grad_norm'] = actor_grad_norm
        out['critic_loss'] = torch.tensor(list_critic_loss).mean(0).item()
        out['critic_grad_norm'] = critic_grad_norm
        describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
        out['Q1'] = describe_it(Q1_vals)
        out['Q2'] = describe_it(Q2_vals)
        out['logprob'] = describe_it(logprob_vals)
        out['alpha_loss'] = torch.tensor(list_alpha_loss).mean(0).item()
        out['alpha'] = self.alpha.item()
        return out