Example no. 1
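
# NOTE: the imports below are added so the snippet is self-contained; `actor`,
# `critic`, `Normalizer`, `_get_flat_params_or_grads` and
# `_set_flat_params_or_grads` are assumed to be provided by the project's own
# modules and are not defined here.
import numpy as np
import torch
import torch.nn as nn
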
class AsyncDDPGAgent:
    def __init__(self,
                 observation_space,
                 action_space,
                 discount=0.99,
                 td_lambda=0.95,
                 hidden_size=(128, 64),
                 temp=1.,
                 max_weight=20,
                 action_std=0.4,
                 actor_lr=0.0001,
                 critic_lr=0.01,
                 device='cpu',
                 batch_size=256,
                 pipe=None,
                 optimizer='SGD',
                 activation='relu'):

        self.device = device
        inp_dim = observation_space.shape[0]
        self.actor = actor(inp_dim,
                           action_space.low.shape[0],
                           std=action_std,
                           hidden_size=hidden_size,
                           activation=activation).to(device)
        self.critic = critic(inp_dim,
                             hidden_size=hidden_size,
                             activation=activation).to(device)
        self.normalizer = Normalizer((inp_dim, ),
                                     default_clip_range=5).to(device)
        self.normalizer.count += 1  # start the count at one so the running mean/std stay well-defined before the first update
        self.temp = temp
        self.max_weight = max_weight

        # NOTE: the optimizer is configurable: SGD with momentum when requested, Adam otherwise
        if optimizer == 'SGD':
            self.optim_actor = torch.optim.SGD(self.actor.parameters(),
                                               actor_lr,
                                               momentum=0.9)
            self.optim_critic = torch.optim.SGD(self.critic.parameters(),
                                                critic_lr,
                                                momentum=0.9)
        else:
            self.optim_actor = torch.optim.Adam(self.actor.parameters(),
                                                actor_lr)
            self.optim_critic = torch.optim.Adam(self.critic.parameters(),
                                                 critic_lr)
        self.pipe = pipe
        self.batch_size = batch_size
        self.mse = nn.MSELoss()

        self.discount = discount
        self.td_lambda = td_lambda
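        # 1 / (1 - discount) is the value of an infinite discounted stream of
        # unit rewards; it keeps critic targets roughly O(1)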
        self.val_norm = 1.0 / (1.0 - self.discount)

        self.action_mean = ((action_space.high + action_space.low) /
                            2)[None, :]
        self.action_std = ((action_space.high - action_space.low) / 2)[None, :]

    def set_params(self, params):
        assert isinstance(params, dict)
        _set_flat_params_or_grads(self.actor, params['actor'], mode='params')
        _set_flat_params_or_grads(self.critic, params['critic'], mode='params')

    def get_params(self):
        return {
            'actor': _get_flat_params_or_grads(self.actor, mode='params'),
            'critic': _get_flat_params_or_grads(self.critic, mode='params')
        }

    def sync_grads(self, net, weight=None):
        # optionally reweight the local gradient (to avoid biasing the
        # aggregate), exchange it through the pipe, and install the
        # aggregated gradient that comes back
        grad = _get_flat_params_or_grads(net, mode='grad')
        if weight is not None:
            grad = grad * weight
        self.pipe.send(grad)
        grad = self.pipe.recv()
        _set_flat_params_or_grads(net, grad, mode='grad')

    def update_normalizer(self, obs):
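        # share this worker's sums, squared sums and sample count through the
        # pipe and update the normalizer with the aggregated statistics that
        # come back from the other end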
        data = [obs.sum(axis=0), (obs**2).sum(axis=0), obs.shape[0]]
        self.pipe.send(data)
        s, sq, count = self.pipe.recv()
        self.normalizer.add(
            torch.tensor(s, dtype=torch.float32, device=self.device),
            torch.tensor(sq, dtype=torch.float32, device=self.device),
            torch.tensor(count, dtype=torch.long, device=self.device),
        )

    def tensor(self, x):
        return torch.tensor(x, dtype=torch.float).to(self.device)

    def as_state(self, s):
        return self.normalizer(self.tensor(s))

    def gen(self, sample_idx, steps, batch_size):
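        # yield `steps` shuffled mini-batches of indices, reshuffling the
        # index pool each time it is exhausted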
        while steps > 0:
            np.random.shuffle(sample_idx)
            for i in range(len(sample_idx) // batch_size):
                if steps <= 0:
                    break
                yield sample_idx[i * batch_size:(i + 1) * batch_size]
                steps -= 1

    def update_actor(self,
                     steps,
                     states,
                     normed_actions,
                     normed_advs,
                     sample_idx,
                     process_weight=None):
        # the states passed in are raw (unnormalized) numpy arrays
        # NOTE: uniform sampling over the valid indices is preferable here
        for idx in self.gen(sample_idx, steps, self.batch_size):
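            # advantage-weighted update: exponentiate the normalized
            # advantages (temperature `temp`) and clip the weights at
            # `max_weight` to keep the gradient numerically stable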
            self.optim_actor.zero_grad()
            weights = np.minimum(np.exp(normed_advs[idx] / self.temp),
                                 self.max_weight)
            weights = self.tensor(weights)
            distribution = self.actor(self.as_state(states[idx]))
            neg_logp = -distribution.log_prob(self.tensor(
                normed_actions[idx])).sum(dim=-1)
            assert neg_logp.shape == weights.shape, \
                f"neg_logp size: {neg_logp.shape}, weights size: {weights.shape}"
            actor_loss = (neg_logp * weights).mean()
            actor_loss.backward()

            self.sync_grads(self.actor, process_weight)
            self.optim_actor.step()

    def update_critic(self,
                      steps,
                      states,
                      targets,
                      sample_idx,
                      process_weight=None):
        for idx in self.gen(sample_idx, steps, self.batch_size):
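            # regress the critic toward lambda-return targets divided by
            # val_norm so predictions and targets share the same normalized scale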
            self.optim_critic.zero_grad()

            normed_val = self.critic(self.as_state(states[idx]))
            normed_target = self.tensor(targets[idx]) / self.val_norm
            assert normed_val.shape == normed_target.shape
            critic_loss = self.mse(normed_val, normed_target)
            critic_loss.backward()

            self.sync_grads(self.critic, process_weight)
            self.optim_critic.step()

    def tocpu(self, x):
        return x.detach().cpu().numpy()

    def act(self, obs, mode='sample'):
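        # 'sample' draws a stochastic action, any other mode uses the
        # distribution mean; the result is rescaled from the normalized
        # action range back to the environment's action bounds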
        obs = self.as_state(obs)
        p = self.actor(obs)
        if mode == 'sample':
            a = p.sample()
        else:
            a = p.loc
        return self.tocpu(a) * self.action_std + self.action_mean

    def value(self, obs):
        # the critic predicts a normalized value; multiply by val_norm to recover the unnormalized return
        return self.tocpu(self.critic(self.as_state(obs)) * self.val_norm)

    def update(self,
               buffer,
               critic_steps,
               actor_steps,
               ADV_EPS=1e-5,
               process_weight=None):
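        # overall flow: build lambda-return targets, fit the critic, then
        # recompute advantages with the updated critic and run the
        # advantage-weighted actor update over the non-terminal samples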
        states, actions, rewards, dones = [np.array(i) for i in buffer]
        # normalize action
        actions = (actions - self.action_mean) / self.action_std

        dones = dones.astype(bool)  # the np.bool alias was removed in recent NumPy releases

        valid_mask = ~dones
        sample_idx = np.arange(len(states))[valid_mask]

        values = self.value(states)
        discount_return = self.discount_return(rewards, dones, values,
                                               self.discount, self.td_lambda)
        #print('critic', critic_steps, end='\n\n')

        self.update_critic(critic_steps,
                           states,
                           discount_return,
                           sample_idx,
                           process_weight=process_weight)
        #print('finish critic', end='\n\n')

        # normalize advantages..
        # we need to compute the value again
        values = self.value(states)
        discount_return = self.discount_return(rewards, dones, values,
                                               self.discount, self.td_lambda)
        adv = discount_return - values
        adv_valid = adv[valid_mask]
        adv_norm = (adv - adv_valid.mean()) / (adv_valid.std() + ADV_EPS)

        #print('actor', actor_steps)
        self.update_actor(actor_steps,
                          states,
                          actions,
                          adv_norm,
                          sample_idx,
                          process_weight=process_weight)
        #print('finish actor')

    def discount_return(self, reward, done, value, discount, td_lambda):
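        # backward recursion for the TD(lambda) return:
        #   G_t = r_t + gamma * [(1 - lambda) * V(s_{t+1}) + lambda * G_{t+1}]
        # terminal steps reset the bootstrap and keep a return of zero
        # (those indices are excluded via valid_mask in update())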
        num_step = len(value)
        return_t = np.zeros([num_step])
        nxt = 0
        for t in range(num_step - 1, -1, -1):
            if done[t]:
                #nxt = value[t]
                nxt = 0
            else:
                nxt_return = reward[t] + discount * nxt
                return_t[t] = nxt_return
                nxt = (1.0 - td_lambda) * value[t] + td_lambda * nxt_return
        return return_t

    def save(self, path):
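        # a multiprocessing pipe cannot be pickled, so detach it before
        # serializing the agent and restore it afterwards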
        pipe = self.pipe
        self.pipe = None
        torch.save(self, path)
        self.pipe = pipe
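
# --- Usage sketch (not part of the original source) ---
# Assumptions: a Gym-style `env` exposing observation_space / action_space,
# and `worker_pipe` being this worker's end of a multiprocessing Pipe whose
# other end aggregates gradients and normalizer statistics; `env`,
# `worker_pipe` and `horizon` are illustrative names only.
#
# agent = AsyncDDPGAgent(env.observation_space, env.action_space,
#                        pipe=worker_pipe, optimizer='Adam', device='cpu')
# obs = env.reset()
# states, actions, rewards, dones = [], [], [], []
# for _ in range(horizon):
#     action = agent.act(obs[None, :])[0]
#     next_obs, reward, done, _ = env.step(action)
#     states.append(obs); actions.append(action)
#     rewards.append(reward); dones.append(done)
#     obs = env.reset() if done else next_obs
# agent.update_normalizer(np.array(states))
# agent.update((states, actions, rewards, dones),
#              critic_steps=100, actor_steps=100)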