class AsyncDDPGAgent:
    def __init__(self,
                 observation_space,
                 action_space,
                 discount=0.99,
                 td_lambda=0.95,
                 hidden_size=(128, 64),
                 temp=1.,
                 max_weight=20,
                 action_std=0.4,
                 actor_lr=0.0001,
                 critic_lr=0.01,
                 device='cpu',
                 batch_size=256,
                 pipe=None,
                 optimizer='SGD',
                 activation='relu'):
        self.device = device
        inp_dim = observation_space.shape[0]
        self.actor = actor(inp_dim, action_space.low.shape[0], std=action_std,
                           hidden_size=hidden_size, activation=activation).to(device)
        self.critic = critic(inp_dim, hidden_size=hidden_size,
                             activation=activation).to(device)
        self.normalizer = Normalizer((inp_dim, ), default_clip_range=5).to(device)
        self.normalizer.count += 1  # start the count at 1 to keep the running estimate unbiased

        self.temp = temp
        self.max_weight = max_weight

        # NOTE: the optimizer type is configurable: SGD with momentum, otherwise Adam
        if optimizer == 'SGD':
            self.optim_actor = torch.optim.SGD(self.actor.parameters(), actor_lr, momentum=0.9)
            self.optim_critic = torch.optim.SGD(self.critic.parameters(), critic_lr, momentum=0.9)
        else:
            self.optim_actor = torch.optim.Adam(self.actor.parameters(), actor_lr)
            self.optim_critic = torch.optim.Adam(self.critic.parameters(), critic_lr)

        self.pipe = pipe
        self.batch_size = batch_size
        self.mse = nn.MSELoss()

        self.discount = discount
        self.td_lambda = td_lambda
        self.val_norm = 1.0 / (1.0 - self.discount)

        # map between normalized actions and the env's action range
        self.action_mean = ((action_space.high + action_space.low) / 2)[None, :]
        self.action_std = ((action_space.high - action_space.low) / 2)[None, :]

    def set_params(self, params):
        assert isinstance(params, dict)
        _set_flat_params_or_grads(self.actor, params['actor'], mode='params')
        _set_flat_params_or_grads(self.critic, params['critic'], mode='params')

    def get_params(self):
        return {
            'actor': _get_flat_params_or_grads(self.actor, mode='params'),
            'critic': _get_flat_params_or_grads(self.critic, mode='params')
        }

    def sync_grads(self, net, weight=None):
        # optionally reweight this worker's gradients to avoid bias across processes,
        # then exchange them through the pipe and load the aggregated gradients back
        grad = _get_flat_params_or_grads(net, mode='grad')
        if weight is not None:
            grad = grad * weight
        self.pipe.send(grad)
        grad = self.pipe.recv()
        _set_flat_params_or_grads(net, grad, mode='grad')

    def update_normalizer(self, obs):
        # share the sufficient statistics (sum, sum of squares, count) with the
        # other workers and add the reduced statistics to the local normalizer
        data = [obs.sum(axis=0), (obs**2).sum(axis=0), obs.shape[0]]
        self.pipe.send(data)
        s, sq, count = self.pipe.recv()
        self.normalizer.add(
            torch.tensor(s, dtype=torch.float32, device=self.device),
            torch.tensor(sq, dtype=torch.float32, device=self.device),
            torch.tensor(count, dtype=torch.long, device=self.device),
        )

    def tensor(self, x):
        return torch.tensor(x, dtype=torch.float).to(self.device)

    def as_state(self, s):
        return self.normalizer(self.tensor(s))

    def gen(self, sample_idx, steps, batch_size):
        # yield `steps` random mini-batches of indices, reshuffling every pass
        while steps > 0:
            np.random.shuffle(sample_idx)
            for i in range(len(sample_idx) // batch_size):
                if steps <= 0:
                    break
                yield sample_idx[i * batch_size:(i + 1) * batch_size]
                steps -= 1

    def update_actor(self, steps, states, normed_actions, normed_advs, sample_idx, process_weight=None):
        # we assume the numpy states passed in are not normalized ...
        # NOTE: it's better to have uniform sampling
        for idx in self.gen(sample_idx, steps, self.batch_size):
            self.optim_actor.zero_grad()
            # advantage-weighted regression: exponentiated advantages, clipped at max_weight
            weights = np.minimum(np.exp(normed_advs[idx] / self.temp), self.max_weight)
            weights = self.tensor(weights)
            distribution = self.actor(self.as_state(states[idx]))
            # negative log-likelihood of the taken actions, summed over action dims
            logpi = -distribution.log_prob(self.tensor(normed_actions[idx])).sum(dim=-1)
            assert logpi.shape == weights.shape, \
                f"logpi size: {logpi.shape}, weights size: {weights.shape}"
            actor_loss = (logpi * weights).mean()
            actor_loss.backward()
            self.sync_grads(self.actor, process_weight)
            self.optim_actor.step()

    def update_critic(self, steps, states, targets, sample_idx, process_weight=None):
        for idx in self.gen(sample_idx, steps, self.batch_size):
            self.optim_critic.zero_grad()
            # regress the normalized value prediction towards the normalized return
            normed_val = self.critic(self.as_state(states[idx]))
            normed_target = self.tensor(targets[idx]) / self.val_norm
            assert normed_val.shape == normed_target.shape
            critic_loss = self.mse(normed_val, normed_target)
            critic_loss.backward()
            self.sync_grads(self.critic, process_weight)
            self.optim_critic.step()

    def tocpu(self, x):
        return x.detach().cpu().numpy()

    def act(self, obs, mode='sample'):
        obs = self.as_state(obs)
        p = self.actor(obs)
        if mode == 'sample':
            a = p.sample()
        else:
            a = p.loc
        # rescale the normalized action back to the env's action range
        return self.tocpu(a) * self.action_std + self.action_mean

    def value(self, obs):
        # the critic predicts a normalized value; scale it back to the unnormalized return
        return self.tocpu(self.critic(self.as_state(obs)) * self.val_norm)

    def update(self, buffer, critic_steps, actor_steps, ADV_EPS=1e-5, process_weight=None):
        states, actions, rewards, dones = [np.array(i) for i in buffer]

        # normalize actions into the actor's range
        actions = (actions - self.action_mean) / self.action_std
        dones = dones.astype(bool)
        valid_mask = ~dones
        sample_idx = np.arange(len(states))[valid_mask]

        # fit the critic to the TD(lambda) returns
        values = self.value(states)
        discount_return = self.discount_return(rewards, dones, values, self.discount, self.td_lambda)
        self.update_critic(critic_steps, states, discount_return, sample_idx,
                           process_weight=process_weight)

        # normalize advantages: recompute the values with the updated critic
        values = self.value(states)
        discount_return = self.discount_return(rewards, dones, values, self.discount, self.td_lambda)
        adv = discount_return - values
        adv_valid = adv[valid_mask]
        adv_norm = (adv - adv_valid.mean()) / (adv_valid.std() + ADV_EPS)

        self.update_actor(actor_steps, states, actions, adv_norm, sample_idx,
                          process_weight=process_weight)

    def discount_return(self, reward, done, value, discount, td_lambda):
        # backward pass computing TD(lambda) returns; terminal states reset the bootstrap
        num_step = len(value)
        return_t = np.zeros([num_step])
        nxt = 0
        for t in range(num_step - 1, -1, -1):
            if done[t]:
                nxt = 0
            else:
                nxt_return = reward[t] + discount * nxt
                return_t[t] = nxt_return
                nxt = (1.0 - td_lambda) * value[t] + td_lambda * nxt_return
        return return_t

    def save(self, path):
        # the pipe cannot be pickled, so detach it while saving the whole agent
        pipe = self.pipe
        self.pipe = None
        torch.save(self, path)
        self.pipe = pipe
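

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the training pipeline):
# construct the agent stand-alone and query actions/values for a batch of
# observations. This assumes a gym-style continuous-control env (the env id
# below depends on the installed gym version) and the repo's `actor`,
# `critic` and `Normalizer` modules imported above. Methods that talk to
# other workers (sync_grads, update_normalizer, update) additionally need a
# multiprocessing.Pipe connection passed as `pipe=`.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import gym

    env = gym.make('Pendulum-v1')
    agent = AsyncDDPGAgent(env.observation_space, env.action_space,
                           device='cpu', optimizer='Adam')

    obs = np.stack([env.observation_space.sample() for _ in range(8)])
    print('sampled actions:', agent.act(obs, mode='sample').shape)  # (8, act_dim)
    print('mean actions:   ', agent.act(obs, mode='mean').shape)    # (8, act_dim)
    print('values:         ', agent.value(obs).shape)               # (8,) or (8, 1)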