Example No. 1
    def update(self, loss):

        self.average_loss += ((1 - self.average_loss_decay) *
                              (asfloat(loss) - self.average_loss))

        # Compute gradients using thread-specific model
        self.model.cleargrads()
        F.squeeze(loss).backward()
        if self.train_async:
            # Copy the gradients to the globally shared model
            copy_param.copy_grad(target_link=self.shared_model,
                                 source_link=self.model)
            if self.process_idx == 0:
                xp = self.xp
                norm = sum(
                    xp.sum(xp.square(param.grad))
                    for param in self.optimizer.target.params()
                    if param.grad is not None)
                self.logger.debug('grad norm:%s', norm)
        self.optimizer.update()

        if self.train_async:
            self.sync_parameters()
        if isinstance(self.model, Recurrent):
            self.model.unchain_backward()
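
Examples 1, 2, and 4 share the same asynchronous (A3C-style) update: gradients are computed on a thread-local model, copied into the globally shared model, applied there by the optimizer, and the updated parameters are then synced back. Below is a minimal sketch of that pattern using only plain Chainer calls; the linear model, the fake batch, and the use of cleargrads plus addgrads in place of copy_param.copy_grad are illustrative assumptions, not the library's implementation.

    import numpy as np
    import chainer.functions as F
    import chainer.links as L
    from chainer import optimizers

    # Hypothetical shared and thread-local models with identical structure.
    shared_model = L.Linear(4, 2)
    local_model = L.Linear(4, 2)
    local_model.copyparams(shared_model)          # start from the shared weights

    optimizer = optimizers.Adam()
    optimizer.setup(shared_model)                 # the optimizer updates the shared model

    x = np.random.randn(8, 4).astype(np.float32)  # made-up batch
    t = np.random.randint(0, 2, size=8).astype(np.int32)

    # 1) Compute gradients on the thread-local model.
    local_model.cleargrads()
    loss = F.softmax_cross_entropy(local_model(x), t)
    loss.backward()

    # 2) Copy the gradients into the shared model
    #    (addgrads onto freshly cleared gradients acts as a copy).
    shared_model.cleargrads()
    shared_model.addgrads(local_model)

    # 3) Update the shared model, then pull its new parameters back.
    optimizer.update()
    local_model.copyparams(shared_model)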
Example No. 2
    def update(self, t_start, t_stop, R, states, actions, rewards, values,
               action_values, action_distribs, action_distribs_mu,
               avg_action_distribs):

        assert np.isscalar(R)

        total_loss = self.compute_loss(t_start=t_start,
                                       t_stop=t_stop,
                                       R=R,
                                       states=states,
                                       actions=actions,
                                       rewards=rewards,
                                       values=values,
                                       action_values=action_values,
                                       action_distribs=action_distribs,
                                       action_distribs_mu=action_distribs_mu,
                                       avg_action_distribs=avg_action_distribs)

        # Compute gradients using thread-specific model
        self.model.zerograds()
        total_loss.backward()
        # Copy the gradients to the globally shared model
        self.shared_model.zerograds()
        copy_param.copy_grad(target_link=self.shared_model,
                             source_link=self.model)
        # Update the globally shared model
        if self.process_idx == 0:
            norm = self.optimizer.compute_grads_norm()
            self.logger.debug('grad norm:%s', norm)
        self.optimizer.update()

        self.sync_parameters()
        if isinstance(self.model, Recurrent):
            self.model.unchain_backward()
Example No. 3
    def update(self, statevar):
        assert self.t_start < self.t

        # Update
        if statevar is None:
            R = 0
        else:
            with state_kept(self.target_q_function):
                R = float(self.target_q_function(statevar).max.data)

        loss = 0
        for i in reversed(range(self.t_start, self.t)):
            R *= self.gamma
            R += self.past_rewards[i]
            q = F.reshape(self.past_action_values[i], (1, 1))
            # Accumulate gradients of Q-function
            loss += F.sum(
                F.huber_loss(q,
                             chainer.Variable(
                                 np.asarray([[R]], dtype=np.float32)),
                             delta=1.0))

        # Should losses be normalized by (self.t - self.t_start)?
        # Otherwise the loss scale differs between updates truncated at
        # self.t_max and updates triggered by termination.

        # If normalization is needed, it would be:
        # loss /= self.t - self.t_start

        # Compute gradients using thread-specific model
        self.q_function.zerograds()
        loss.backward()
        # Copy the gradients to the globally shared model
        self.shared_q_function.zerograds()
        copy_param.copy_grad(self.shared_q_function, self.q_function)
        # Update the globally shared model
        self.optimizer.update()

        self.sync_parameters()
        if isinstance(self.q_function, Recurrent):
            self.q_function.unchain_backward()

        self.past_action_values = {}
        self.past_states = {}
        self.past_rewards = {}

        self.t_start = self.t
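
The loop in Example 3 turns the stored rewards into n-step targets: starting from the bootstrap value R of the state reached after the last step (or 0 at a terminal state), it repeatedly discounts and adds each reward, and regresses the recorded Q-values against the resulting targets with a Huber loss. A minimal sketch of just the target recursion, with made-up rewards and bootstrap value:

    gamma = 0.99
    rewards = [1.0, 0.0, 2.0]   # stands in for past_rewards[t_start:t]
    R = 5.0                     # bootstrap: max_a Q_target(s, a), or 0 after a terminal state

    targets = {}
    for i in reversed(range(len(rewards))):
        R = rewards[i] + gamma * R   # same as R *= gamma; R += past_rewards[i]
        targets[i] = R               # regression target for past_action_values[i]

    print(targets)  # the last step gets a 1-step target, the first an n-step target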
Example No. 4
    def update(self, loss):

        self.average_loss += ((1 - self.average_loss_decay) *
                              (asfloat(loss) - self.average_loss))

        # Compute gradients using thread-specific model
        self.model.zerograds()
        loss.backward()
        if self.train_async:
            # Copy the gradients to the globally shared model
            self.shared_model.zerograds()
            copy_param.copy_grad(target_link=self.shared_model,
                                 source_link=self.model)
            if self.process_idx == 0:
                norm = self.optimizer.compute_grads_norm()
                self.logger.debug('grad norm:%s', norm)
        self.optimizer.update()

        if self.train_async:
            self.sync_parameters()
        if isinstance(self.model, Recurrent):
            self.model.unchain_backward()
Example No. 5
    def update(self, statevar):
        assert self.t_start < self.t

        if statevar is None:
            R = 0
        else:
            with state_kept(self.model):
                _, vout, __ = self.model.pi_and_v(statevar)
            # Cast the bootstrap value instead of converting it to a Python float:
            R = F.cast(vout.data, 'float32')
            # R = float(vout.data)

        pi_loss = 0
        v_loss = 0
        for i in reversed(range(self.t_start, self.t)):
            R *= self.gamma
            R += self.past_rewards[i]
            if self.use_average_reward:
                R -= self.average_reward
            v = self.past_values[i]
            advantage = R - v
            if self.use_average_reward:
                self.average_reward += self.average_reward_tau * \
                    float(advantage.data)
            # Accumulate gradients of policy
            log_prob = self.past_action_log_prob[i]
            entropy = self.past_action_entropy[i]

            # Log probability is increased proportionally to advantage
            # Use only the advantage's data so the policy loss does not
            # backpropagate into the value function:
            pi_loss -= log_prob * F.cast(advantage.data, 'float32')
            # pi_loss -= log_prob * float(advantage.data)
            # Entropy is maximized
            pi_loss -= self.beta * entropy
            # Accumulate gradients of value function
            v_loss += (v - R) ** 2 / 2

        if self.pi_loss_coef != 1.0:
            pi_loss *= self.pi_loss_coef

        if self.v_loss_coef != 1.0:
            v_loss *= self.v_loss_coef

        # Normalize the loss of sequences truncated by terminal states
        if self.keep_loss_scale_same and \
                self.t - self.t_start < self.t_max:
            factor = self.t_max / (self.t - self.t_start)
            pi_loss *= factor
            v_loss *= factor

        if self.normalize_grad_by_t_max:
            pi_loss /= self.t - self.t_start
            v_loss /= self.t - self.t_start

        if self.process_idx == 0:
            logger.debug('pi_loss:%s v_loss:%s', pi_loss.data, v_loss.data)

        # Reduce the combined loss to a scalar before backprop:
        total_loss = F.mean(pi_loss + F.reshape(v_loss, pi_loss.data.shape))
        # total_loss = pi_loss + F.reshape(v_loss, pi_loss.data.shape)

        # Compute gradients using thread-specific model
        self.model.zerograds()
        total_loss.backward()
        # Copy the gradients to the globally shared model
        self.shared_model.zerograds()
        copy_param.copy_grad(
            target_link=self.shared_model, source_link=self.model)
        # Update the globally shared model
        if self.process_idx == 0:
            norm = sum(np.sum(np.square(param.grad))
                       for param in self.optimizer.target.params())
            logger.debug('grad norm:%s', norm)
        self.optimizer.update()
        if self.process_idx == 0:
            logger.debug('update')

        self.sync_parameters()
        if isinstance(self.model, Recurrent):
            self.model.unchain_backward()

        self.past_action_log_prob = {}
        self.past_action_entropy = {}
        self.past_states = {}
        self.past_rewards = {}
        self.past_values = {}

        self.t_start = self.t
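
Example 5 is the classic A3C loss: for each step the policy loss is -log pi(a|s) * advantage - beta * entropy and the value loss is (v - R)^2 / 2, with the advantage detached so that the policy gradient does not flow into the value head. A one-step sketch of those two terms, where the policy probabilities, the value estimate, and the return are made-up stand-ins:

    import numpy as np
    import chainer
    import chainer.functions as F

    beta = 0.01
    R = 1.5                                                              # made-up n-step return
    probs = chainer.Variable(np.array([[0.7, 0.3]], dtype=np.float32))   # stand-in policy output
    v = chainer.Variable(np.array([[1.0]], dtype=np.float32))            # stand-in value output
    action = 0

    log_prob = F.log(probs[:, action])
    entropy = -F.sum(probs * F.log(probs), axis=1)
    advantage = R - v

    # Only advantage.data is used, so the policy loss does not backpropagate
    # into the value estimate.
    pi_loss = -log_prob * float(advantage.data) - beta * entropy
    v_loss = (v - R) ** 2 / 2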
Example No. 6
    def update(self,
               t_start,
               t_stop,
               R,
               states,
               actions,
               rewards,
               values,
               action_values,
               action_log_probs,
               action_distribs,
               avg_action_distribs,
               rho=None,
               rho_all=None):

        pi_loss = 0
        Q_loss = 0
        Q_ret = R
        del R
        for i in reversed(range(t_start, t_stop)):
            r = rewards[i]
            v = values[i]
            log_prob = action_log_probs[i]
            assert isinstance(log_prob, chainer.Variable),\
                "log_prob must be backprop-able"
            action_distrib = action_distribs[i]
            avg_action_distrib = avg_action_distribs[i]
            ba = np.expand_dims(actions[i], 0)
            action_value = action_values[i]

            Q_ret = r + self.gamma * Q_ret

            with chainer.no_backprop_mode():
                advantage = Q_ret - v

            pi_loss += self.compute_one_step_pi_loss(
                advantage=advantage,
                action_distrib=action_distrib,
                log_prob=log_prob,
                rho=rho[i] if rho else None,
                rho_all=rho_all[i] if rho_all else None,
                action_value=action_value,
                v=v,
                avg_action_distrib=avg_action_distrib)

            # Accumulate gradients of value function
            Q = action_value.evaluate_actions(ba)
            assert isinstance(Q, chainer.Variable), "Q must be backprop-able"
            Q_loss += (Q_ret - Q)**2 / 2

            if self.process_idx == 0:
                logger.debug('t:%s s:%s v:%s Q:%s Q_ret:%s', i,
                             states[i].sum(), v, float(Q.data), Q_ret)

            if rho is not None:
                Q_ret = min(1, rho[i]) * (Q_ret - float(Q.data)) + v
            else:
                Q_ret = Q_ret - float(Q.data) + v

        pi_loss *= self.pi_loss_coef
        Q_loss *= self.Q_loss_coef

        if self.normalize_loss_by_steps:
            pi_loss /= t_stop - t_start
            Q_loss /= t_stop - t_start

        if self.process_idx == 0:
            logger.debug('pi_loss:%s Q_loss:%s', pi_loss.data, Q_loss.data)

        total_loss = pi_loss + F.reshape(Q_loss, pi_loss.data.shape)

        # Compute gradients using thread-specific model
        self.model.zerograds()
        total_loss.backward()
        # Copy the gradients to the globally shared model
        self.shared_model.zerograds()
        copy_param.copy_grad(target_link=self.shared_model,
                             source_link=self.model)
        # Update the globally shared model
        if self.process_idx == 0:
            norm = self.optimizer.compute_grads_norm()
            logger.debug('grad norm:%s', norm)
        self.optimizer.update()

        self.sync_parameters()
        if isinstance(self.model, Recurrent):
            self.model.unchain_backward()
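
Example 6 follows the ACER recipe: the Retrace target Q_ret is propagated backwards through the trajectory and, after each step's Q-loss is accumulated, it is corrected with a truncated importance weight, Q_ret <- min(1, rho_i) * (Q_ret - Q(s_i, a_i)) + V(s_i). A small numerical sketch of just that recursion, with all inputs made up:

    gamma = 0.99
    rewards = [0.0, 1.0, 0.5]   # made-up per-step rewards
    q_taken = [0.8, 1.2, 0.9]   # Q(s_i, a_i) under the current network
    values  = [0.7, 1.0, 0.8]   # V(s_i)
    rho     = [1.5, 0.4, 0.9]   # importance weights pi/mu of the taken actions

    Q_ret = 0.0                 # bootstrap value after the last step (terminal here)
    targets = {}
    for i in reversed(range(len(rewards))):
        Q_ret = rewards[i] + gamma * Q_ret
        targets[i] = Q_ret      # regression target for Q(s_i, a_i)
        # Truncating the importance weight limits how far off-policy
        # corrections propagate backwards.
        Q_ret = min(1.0, rho[i]) * (Q_ret - q_taken[i]) + values[i]

    print(targets)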
Example No. 7
    def test_copy_grad(self):
        def set_random_grad(link):
            link.cleargrads()
            x = np.random.normal(size=(1, 1)).astype(np.float32)
            y = link(x) * np.random.normal()
            F.sum(y).backward()

        # When source is not None and target is None
        a = L.Linear(1, 5)
        b = L.Linear(1, 5)
        set_random_grad(a)
        b.cleargrads()
        assert a.W.grad is not None
        assert a.b.grad is not None
        assert b.W.grad is None
        assert b.b.grad is None
        copy_param.copy_grad(target_link=b, source_link=a)
        np.testing.assert_almost_equal(a.W.grad, b.W.grad)
        np.testing.assert_almost_equal(a.b.grad, b.b.grad)
        assert a.W.grad is not b.W.grad
        assert a.b.grad is not b.b.grad

        # When both are not None
        a = L.Linear(1, 5)
        b = L.Linear(1, 5)
        set_random_grad(a)
        set_random_grad(b)
        assert a.W.grad is not None
        assert a.b.grad is not None
        assert b.W.grad is not None
        assert b.b.grad is not None
        copy_param.copy_grad(target_link=b, source_link=a)
        np.testing.assert_almost_equal(a.W.grad, b.W.grad)
        np.testing.assert_almost_equal(a.b.grad, b.b.grad)
        assert a.W.grad is not b.W.grad
        assert a.b.grad is not b.b.grad

        # When source is None and target is not None
        a = L.Linear(1, 5)
        b = L.Linear(1, 5)
        a.cleargrads()
        set_random_grad(b)
        assert a.W.grad is None
        assert a.b.grad is None
        assert b.W.grad is not None
        assert b.b.grad is not None
        copy_param.copy_grad(target_link=b, source_link=a)
        assert a.W.grad is None
        assert a.b.grad is None
        assert b.W.grad is None
        assert b.b.grad is None

        # When both are None
        a = L.Linear(1, 5)
        b = L.Linear(1, 5)
        a.cleargrads()
        b.cleargrads()
        assert a.W.grad is None
        assert a.b.grad is None
        assert b.W.grad is None
        assert b.b.grad is None
        copy_param.copy_grad(target_link=b, source_link=a)
        assert a.W.grad is None
        assert a.b.grad is None
        assert b.W.grad is None
        assert b.b.grad is None
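
Example 7 checks copy_param.copy_grad for the four combinations of present and absent gradients: gradients are copied by value (the arrays end up equal but distinct), and a missing source gradient clears the target's gradient. A hypothetical re-implementation with the same observable behaviour, matching parameters by name; this is a sketch for illustration, not the library's actual code.

    def copy_grad_sketch(target_link, source_link):
        """Copy .grad arrays from source_link to target_link, matched by parameter name."""
        target_params = dict(target_link.namedparams())
        for name, src in source_link.namedparams():
            dst = target_params[name]
            # Copy by value so the two links never share a gradient array;
            # a missing source gradient clears the target's gradient.
            dst.grad = None if src.grad is None else src.grad.copy()

    # e.g. copy_grad_sketch(target_link=b, source_link=a) should leave b.W.grad
    # equal to, but not the same array as, a.W.grad.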
Example No. 8
    def __update(self):
        """ update generator and discriminator at the end of drawing """
        if self.process_idx == 0:
            logger.debug('Accumulate grads')

        pi_loss = 0
        v_loss = 0

        for n in reversed(range(self.rollout_n)):
            R = self.lambda_R * self.past_R[n]  # probability output by the discriminator

            for t in reversed(range(self.max_episode_steps)):
                R *= self.gamma  # discount factor
                R += self.past_reward[n, t]
                v = self.past_values[n, t]
                advantage = R - v

                log_prob = self.past_action_log_prob[n, t]
                entropy = self.past_action_entropy[n, t]

                pi_loss -= log_prob * float(advantage.data)
                pi_loss -= self.beta * entropy

                v_loss += (v - R)**2 / 2

        if self.pi_loss_coef != 1.0:
            pi_loss *= self.pi_loss_coef
        if self.v_loss_coef != 1.0:
            v_loss *= self.v_loss_coef

        # normalize by each step
        pi_loss /= self.max_episode_steps * self.rollout_n
        v_loss /= self.max_episode_steps * self.rollout_n

        total_loss = pi_loss + F.reshape(v_loss, pi_loss.data.shape)

        if self.process_idx == 0:
            logger.debug('pi_loss:%s v_loss:%s', pi_loss.data, v_loss.data)

        # compute gradients of the generator
        self.generator.zerograds()
        total_loss.backward()

        # copy the gradients of the local generator to the globally shared model
        self.shared_generator.zerograds()
        copy_param.copy_grad(target_link=self.shared_generator,
                             source_link=self.generator)

        # update the globally shared model
        if self.process_idx == 0:
            norm = sum(
                np.sum(np.square(param.grad))
                for param in self.gen_optimizer.target.params())
            logger.debug('grad_norm of generator: %s', norm)
        self.gen_optimizer.update()

        # update the local discriminator
        if self.reward_mode in ('dcgan', 'wgangp'):
            x_fake = F.concat(self.fake_data.values(), axis=0)
            x_real = F.concat(self.real_data.values(), axis=0)
            y_fake = F.concat(self.y_fake.values())

            if self.conditional:
                y_real = self.discriminator(x_real, x_real)
            else:
                y_real = self.discriminator(x_real)

            self.__compute_discriminator_grad(x_real, x_fake, y_real, y_fake)

            # copy the gradients of the local discriminator to the globally shared model
            self.shared_discriminator.zerograds()
            copy_param.copy_grad(target_link=self.shared_discriminator,
                                 source_link=self.discriminator)

            # Perform asynchronous update
            self.dis_optimizer.update()

        self.sync_parameters()
        self.generator.unchain_backward()

        # update statistics
        self.stat_pi_loss = float(pi_loss.data)
        self.stat_v_loss = float(v_loss.data)
        self.stat_R = np.array(list(self.past_R.values())).mean()
        self.stat_reward_min = self.past_reward.min()
        self.stat_reward_max = self.past_reward.max()
        self.stat_reward_mean = self.past_reward.mean()
        self.stat_reward_std = self.past_reward.std()

        # update counter
        self.update_n += 1
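
Example 8 reuses the A3C-style update inside a GAN-driven drawing agent: each rollout's return is seeded with lambda_R times the discriminator's score for the finished drawing and then discounted backwards through the episode together with the per-step rewards. A short sketch of that seeded return recursion, with made-up numbers:

    import numpy as np

    gamma = 0.95
    lambda_R = 1.0
    rollout_n, max_episode_steps = 2, 3

    past_R = [0.8, 0.3]                                      # made-up discriminator scores
    past_reward = np.zeros((rollout_n, max_episode_steps))   # made-up per-step rewards

    returns = np.zeros_like(past_reward)
    for n in reversed(range(rollout_n)):
        R = lambda_R * past_R[n]            # seed the return with the discriminator score
        for t in reversed(range(max_episode_steps)):
            R = past_reward[n, t] + gamma * R
            returns[n, t] = R

    print(returns)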