def update(self, loss):
    self.average_loss += (
        (1 - self.average_loss_decay) *
        (asfloat(loss) - self.average_loss))

    # Compute gradients using thread-specific model
    self.model.cleargrads()
    F.squeeze(loss).backward()
    if self.train_async:
        # Copy the gradients to the globally shared model
        copy_param.copy_grad(
            target_link=self.shared_model, source_link=self.model)
    if self.process_idx == 0:
        xp = self.xp
        norm = sum(xp.sum(xp.square(param.grad))
                   for param in self.optimizer.target.params()
                   if param.grad is not None)
        self.logger.debug('grad norm:%s', norm)
    self.optimizer.update()

    if self.train_async:
        self.sync_parameters()
    if isinstance(self.model, Recurrent):
        self.model.unchain_backward()

def update(self, t_start, t_stop, R, states, actions, rewards, values,
           action_values, action_distribs, action_distribs_mu,
           avg_action_distribs):

    assert np.isscalar(R)

    total_loss = self.compute_loss(
        t_start=t_start,
        t_stop=t_stop,
        R=R,
        states=states,
        actions=actions,
        rewards=rewards,
        values=values,
        action_values=action_values,
        action_distribs=action_distribs,
        action_distribs_mu=action_distribs_mu,
        avg_action_distribs=avg_action_distribs)

    # Compute gradients using thread-specific model
    self.model.zerograds()
    total_loss.backward()
    # Copy the gradients to the globally shared model
    self.shared_model.zerograds()
    copy_param.copy_grad(
        target_link=self.shared_model, source_link=self.model)
    # Update the globally shared model
    if self.process_idx == 0:
        norm = self.optimizer.compute_grads_norm()
        self.logger.debug('grad norm:%s', norm)
    self.optimizer.update()

    self.sync_parameters()
    if isinstance(self.model, Recurrent):
        self.model.unchain_backward()

def update(self, statevar):
    assert self.t_start < self.t

    # Compute the bootstrap value R from the target Q-function
    if statevar is None:
        R = 0
    else:
        with state_kept(self.target_q_function):
            R = float(self.target_q_function(statevar).max.data)

    loss = 0
    for i in reversed(range(self.t_start, self.t)):
        R *= self.gamma
        R += self.past_rewards[i]
        q = F.reshape(self.past_action_values[i], (1, 1))
        # Accumulate gradients of Q-function
        loss += F.sum(F.huber_loss(
            q,
            chainer.Variable(np.asarray([[R]], dtype=np.float32)),
            delta=1.0))

    # Should the loss be normalized by (self.t - self.t_start)?
    # Otherwise its scale differs between rollouts of length self.t_max
    # and rollouts truncated by episode termination.

    # If normalization turns out to be necessary:
    # loss /= self.t - self.t_start

    # Compute gradients using thread-specific model
    self.q_function.zerograds()
    loss.backward()
    # Copy the gradients to the globally shared model
    self.shared_q_function.zerograds()
    copy_param.copy_grad(self.shared_q_function, self.q_function)
    # Update the globally shared model
    self.optimizer.update()

    self.sync_parameters()

    if isinstance(self.q_function, Recurrent):
        self.q_function.unchain_backward()

    self.past_action_values = {}
    self.past_states = {}
    self.past_rewards = {}

    self.t_start = self.t

def update(self, loss):
    self.average_loss += (
        (1 - self.average_loss_decay) *
        (asfloat(loss) - self.average_loss))

    # Compute gradients using thread-specific model
    self.model.zerograds()
    loss.backward()
    if self.train_async:
        # Copy the gradients to the globally shared model
        self.shared_model.zerograds()
        copy_param.copy_grad(
            target_link=self.shared_model, source_link=self.model)
    if self.process_idx == 0:
        norm = self.optimizer.compute_grads_norm()
        self.logger.debug('grad norm:%s', norm)
    self.optimizer.update()

    if self.train_async:
        self.sync_parameters()
    if isinstance(self.model, Recurrent):
        self.model.unchain_backward()

def update(self, statevar):
    assert self.t_start < self.t

    if statevar is None:
        R = 0
    else:
        with state_kept(self.model):
            _, vout, __ = self.model.pi_and_v(statevar)
        #######################
        R = F.cast(vout.data, 'float32')
        # R = float(vout.data)
        #######################

    pi_loss = 0
    v_loss = 0
    for i in reversed(range(self.t_start, self.t)):
        R *= self.gamma
        R += self.past_rewards[i]
        if self.use_average_reward:
            R -= self.average_reward
        v = self.past_values[i]
        advantage = R - v
        if self.use_average_reward:
            self.average_reward += self.average_reward_tau * \
                float(advantage.data)
        # Accumulate gradients of policy
        log_prob = self.past_action_log_prob[i]
        entropy = self.past_action_entropy[i]

        # Log probability is increased proportionally to advantage
        ##############################
        pi_loss -= log_prob * F.cast(advantage.data, 'float32')
        # pi_loss -= log_prob * float(advantage.data)
        ##############################
        # Entropy is maximized
        pi_loss -= self.beta * entropy
        # Accumulate gradients of value function
        v_loss += (v - R) ** 2 / 2

    if self.pi_loss_coef != 1.0:
        pi_loss *= self.pi_loss_coef

    if self.v_loss_coef != 1.0:
        v_loss *= self.v_loss_coef

    # Normalize the loss of sequences truncated by terminal states
    if self.keep_loss_scale_same and \
            self.t - self.t_start < self.t_max:
        factor = self.t_max / (self.t - self.t_start)
        pi_loss *= factor
        v_loss *= factor

    if self.normalize_grad_by_t_max:
        pi_loss /= self.t - self.t_start
        v_loss /= self.t - self.t_start

    if self.process_idx == 0:
        logger.debug('pi_loss:%s v_loss:%s', pi_loss.data, v_loss.data)

    ##########################
    # total_loss = pi_loss + F.reshape(v_loss, pi_loss.data.shape)
    total_loss = F.mean(pi_loss + F.reshape(v_loss, pi_loss.data.shape))
    ##########################

    # Compute gradients using thread-specific model
    self.model.zerograds()
    total_loss.backward()
    # Copy the gradients to the globally shared model
    self.shared_model.zerograds()
    copy_param.copy_grad(
        target_link=self.shared_model, source_link=self.model)
    # Update the globally shared model
    if self.process_idx == 0:
        norm = sum(np.sum(np.square(param.grad))
                   for param in self.optimizer.target.params())
        logger.debug('grad norm:%s', norm)
    self.optimizer.update()
    if self.process_idx == 0:
        logger.debug('update')

    self.sync_parameters()
    if isinstance(self.model, Recurrent):
        self.model.unchain_backward()

    self.past_action_log_prob = {}
    self.past_action_entropy = {}
    self.past_states = {}
    self.past_rewards = {}
    self.past_values = {}

    self.t_start = self.t

def update(self, t_start, t_stop, R, states, actions, rewards, values,
           action_values, action_log_probs, action_distribs,
           avg_action_distribs, rho=None, rho_all=None):

    pi_loss = 0
    Q_loss = 0
    Q_ret = R
    del R
    for i in reversed(range(t_start, t_stop)):
        r = rewards[i]
        v = values[i]
        log_prob = action_log_probs[i]
        assert isinstance(log_prob, chainer.Variable),\
            "log_prob must be backprop-able"
        action_distrib = action_distribs[i]
        avg_action_distrib = avg_action_distribs[i]
        ba = np.expand_dims(actions[i], 0)
        action_value = action_values[i]

        Q_ret = r + self.gamma * Q_ret

        with chainer.no_backprop_mode():
            advantage = Q_ret - v

        pi_loss += self.compute_one_step_pi_loss(
            advantage=advantage,
            action_distrib=action_distrib,
            log_prob=log_prob,
            rho=rho[i] if rho else None,
            rho_all=rho_all[i] if rho_all else None,
            action_value=action_value,
            v=v,
            avg_action_distrib=avg_action_distrib)

        # Accumulate gradients of value function
        Q = action_value.evaluate_actions(ba)
        assert isinstance(Q, chainer.Variable), "Q must be backprop-able"
        Q_loss += (Q_ret - Q) ** 2 / 2

        if self.process_idx == 0:
            logger.debug('t:%s s:%s v:%s Q:%s Q_ret:%s',
                         i, states[i].sum(), v, float(Q.data), Q_ret)

        if rho is not None:
            Q_ret = min(1, rho[i]) * (Q_ret - float(Q.data)) + v
        else:
            Q_ret = Q_ret - float(Q.data) + v

    pi_loss *= self.pi_loss_coef
    Q_loss *= self.Q_loss_coef

    if self.normalize_loss_by_steps:
        pi_loss /= t_stop - t_start
        Q_loss /= t_stop - t_start

    if self.process_idx == 0:
        logger.debug('pi_loss:%s Q_loss:%s', pi_loss.data, Q_loss.data)

    total_loss = pi_loss + F.reshape(Q_loss, pi_loss.data.shape)

    # Compute gradients using thread-specific model
    self.model.zerograds()
    total_loss.backward()
    # Copy the gradients to the globally shared model
    self.shared_model.zerograds()
    copy_param.copy_grad(
        target_link=self.shared_model, source_link=self.model)
    # Update the globally shared model
    if self.process_idx == 0:
        norm = self.optimizer.compute_grads_norm()
        logger.debug('grad norm:%s', norm)
    self.optimizer.update()

    self.sync_parameters()
    if isinstance(self.model, Recurrent):
        self.model.unchain_backward()

def test_copy_grad(self):

    def set_random_grad(link):
        link.cleargrads()
        x = np.random.normal(size=(1, 1)).astype(np.float32)
        y = link(x) * np.random.normal()
        F.sum(y).backward()

    # When source is not None and target is None
    a = L.Linear(1, 5)
    b = L.Linear(1, 5)
    set_random_grad(a)
    b.cleargrads()
    assert a.W.grad is not None
    assert a.b.grad is not None
    assert b.W.grad is None
    assert b.b.grad is None
    copy_param.copy_grad(target_link=b, source_link=a)
    np.testing.assert_almost_equal(a.W.grad, b.W.grad)
    np.testing.assert_almost_equal(a.b.grad, b.b.grad)
    assert a.W.grad is not b.W.grad
    assert a.b.grad is not b.b.grad

    # When both are not None
    a = L.Linear(1, 5)
    b = L.Linear(1, 5)
    set_random_grad(a)
    set_random_grad(b)
    assert a.W.grad is not None
    assert a.b.grad is not None
    assert b.W.grad is not None
    assert b.b.grad is not None
    copy_param.copy_grad(target_link=b, source_link=a)
    np.testing.assert_almost_equal(a.W.grad, b.W.grad)
    np.testing.assert_almost_equal(a.b.grad, b.b.grad)
    assert a.W.grad is not b.W.grad
    assert a.b.grad is not b.b.grad

    # When source is None and target is not None
    a = L.Linear(1, 5)
    b = L.Linear(1, 5)
    a.cleargrads()
    set_random_grad(b)
    assert a.W.grad is None
    assert a.b.grad is None
    assert b.W.grad is not None
    assert b.b.grad is not None
    copy_param.copy_grad(target_link=b, source_link=a)
    assert a.W.grad is None
    assert a.b.grad is None
    assert b.W.grad is None
    assert b.b.grad is None

    # When both are None
    a = L.Linear(1, 5)
    b = L.Linear(1, 5)
    a.cleargrads()
    b.cleargrads()
    assert a.W.grad is None
    assert a.b.grad is None
    assert b.W.grad is None
    assert b.b.grad is None
    copy_param.copy_grad(target_link=b, source_link=a)
    assert a.W.grad is None
    assert a.b.grad is None
    assert b.W.grad is None
    assert b.b.grad is None

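# For reference, the behaviour exercised by test_copy_grad above can be
# summarised by a minimal sketch of a grad-copying helper. This is an
# illustrative assumption, not chainerrl's actual copy_param.copy_grad
# implementation: gradients are copied by value (so the target never
# aliases the source's arrays), and a None gradient on the source clears
# the corresponding gradient on the target.
def copy_grad_sketch(target_link, source_link):
    target_params = dict(target_link.namedparams())
    for name, source_param in source_link.namedparams():
        target_param = target_params[name]
        if source_param.grad is None:
            # A missing gradient on the source clears the target's gradient
            target_param.grad = None
        else:
            # Copy the gradient array so the two links do not share memory
            target_param.grad = source_param.grad.copy()
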
def __update(self):
    """Update the generator and the discriminator at the end of drawing."""
    if self.process_idx == 0:
        logger.debug('Accumulate grads')

    pi_loss = 0
    v_loss = 0

    for n in reversed(range(self.rollout_n)):
        R = self.lambda_R * self.past_R[n]  # probability assigned by the discriminator
        for t in reversed(range(self.max_episode_steps)):
            R *= self.gamma  # discount factor
            R += self.past_reward[n, t]
            v = self.past_values[n, t]
            advantage = R - v
            log_prob = self.past_action_log_prob[n, t]
            entropy = self.past_action_entropy[n, t]
            pi_loss -= log_prob * float(advantage.data)
            pi_loss -= self.beta * entropy
            v_loss += (v - R) ** 2 / 2

    if self.pi_loss_coef != 1.0:
        pi_loss *= self.pi_loss_coef
    if self.v_loss_coef != 1.0:
        v_loss *= self.v_loss_coef

    # normalize by the number of steps
    pi_loss /= self.max_episode_steps * self.rollout_n
    v_loss /= self.max_episode_steps * self.rollout_n

    total_loss = pi_loss + F.reshape(v_loss, pi_loss.data.shape)

    if self.process_idx == 0:
        logger.debug('pi_loss:%s v_loss:%s', pi_loss.data, v_loss.data)

    # compute gradients of the generator
    self.generator.zerograds()
    total_loss.backward()

    # copy the gradients of the local generator to the globally shared model
    self.shared_generator.zerograds()
    copy_param.copy_grad(
        target_link=self.shared_generator, source_link=self.generator)

    # update the globally shared model
    if self.process_idx == 0:
        norm = sum(np.sum(np.square(param.grad))
                   for param in self.gen_optimizer.target.params())
        logger.debug('grad_norm of generator: %s', norm)
    self.gen_optimizer.update()

    # update the local discriminator
    if self.reward_mode in ('dcgan', 'wgangp'):
        x_fake = F.concat(self.fake_data.values(), axis=0)
        x_real = F.concat(self.real_data.values(), axis=0)
        y_fake = F.concat(self.y_fake.values())

        if self.conditional:
            y_real = self.discriminator(x_real, x_real)
        else:
            y_real = self.discriminator(x_real)

        self.__compute_discriminator_grad(x_real, x_fake, y_real, y_fake)

        # copy the gradients of the local discriminator to the globally shared model
        self.shared_discriminator.zerograds()
        copy_param.copy_grad(
            target_link=self.shared_discriminator,
            source_link=self.discriminator)

        # Perform asynchronous update
        self.dis_optimizer.update()

    self.sync_parameters()
    self.generator.unchain_backward()

    # update statistics
    self.stat_pi_loss = float(pi_loss.data)
    self.stat_v_loss = float(v_loss.data)
    self.stat_R = np.array(list(self.past_R.values())).mean()
    self.stat_reward_min = self.past_reward.min()
    self.stat_reward_max = self.past_reward.max()
    self.stat_reward_mean = self.past_reward.mean()
    self.stat_reward_std = self.past_reward.std()

    # update counter
    self.update_n += 1