Example 1
def test_minimum_inconsistent_shapes(self):
    x1_data = numpy.random.uniform(-1, 1, (3, 2)).astype(self.dtype)
    x2_data = numpy.random.uniform(-1, 1, (2, 3)).astype(self.dtype)
    x1 = chainer.Variable(x1_data)
    x2 = chainer.Variable(x2_data)
    with self.assertRaises(type_check.InvalidType):
        functions.minimum(x1, x2)
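For reference, a minimal sketch of the behavior this test exercises: functions.minimum computes the elementwise minimum of two variables of identical shape and dtype, and raises type_check.InvalidType when the shapes disagree.

import numpy
import chainer
from chainer import functions

x1 = chainer.Variable(numpy.array([1.0, 5.0, -2.0], dtype=numpy.float32))
x2 = chainer.Variable(numpy.array([3.0, 4.0, -7.0], dtype=numpy.float32))
y = functions.minimum(x1, x2)
print(y.array)  # [ 1.  4. -7.]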
Example 3
def intersection(bbox0, bbox1):
    # bboxes are given as (center_x, center_y, width, height)
    x0, y0, w0, h0 = bbox0
    x1, y1, w1, h1 = bbox1

    w = F.relu(F.minimum(x0 + w0 / 2, x1 + w1 / 2) - F.maximum(x0 - w0 / 2, x1 - w1 / 2))
    h = F.relu(F.minimum(y0 + h0 / 2, y1 + h1 / 2) - F.maximum(y0 - h0 / 2, y1 - h1 / 2))

    return w * h
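A possible usage note: since intersection returns a differentiable overlap area, it can be combined with the box areas to form an IoU term. The helper below is a hypothetical sketch, not part of the original project.

def iou(bbox0, bbox1):
    # hypothetical helper built on intersection() above
    area = intersection(bbox0, bbox1)
    union = bbox0[2] * bbox0[3] + bbox1[2] * bbox1[3] - area
    return area / (union + 1e-8)  # epsilon guards against empty boxes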
Example 4
    def _perform_gradient_step(self, iterator):
        batch = iterator.next()
        s_current, action, r, s_next, non_terminal = \
            concat_examples(batch, device=self._device)
        r = F.reshape(r, shape=(*r.shape, 1))
        non_terminal = F.reshape(non_terminal, shape=(*non_terminal.shape, 1))

        with chainer.using_config('enable_backprop',
                                  False), chainer.using_config('train', False):
            a_next, log_pi = self._pi.action_with_log_pi(s_next)
            log_pi = F.reshape(log_pi, shape=(*log_pi.shape, 1))

            # update q
            target_q1 = self._target_q1(s_next, a_next)
            target_q2 = self._target_q2(s_next, a_next)

            min_q = F.minimum(target_q1, target_q2)

            q_target = r + self._gamma * non_terminal * (
                min_q - self._alpha.exp() * log_pi)

        q1 = self._q1(s_current, action)
        q2 = self._q2(s_current, action)
        q1_loss = 0.5 * F.mean_squared_error(q_target, q1)
        q2_loss = 0.5 * F.mean_squared_error(q_target, q2)
        q_loss = q1_loss + q2_loss

        self._q1_optimizer.target.cleargrads()
        self._q2_optimizer.target.cleargrads()
        q_loss.backward()
        q_loss.unchain_backward()
        self._q1_optimizer.update()
        self._q2_optimizer.update()

        # update pi
        pi_action, log_pi = self._pi.action_with_log_pi(s_current)
        log_pi = F.reshape(log_pi, shape=(*log_pi.shape, 1))

        q1 = self._q1(s_current, pi_action)
        q2 = self._q2(s_current, pi_action)
        min_q = F.minimum(q1, q2)

        pi_loss = F.mean(self._alpha.exp() * log_pi - min_q)
        self._pi_optimizer.target.cleargrads()
        pi_loss.backward()
        pi_loss.unchain_backward()
        self._pi_optimizer.update()

        # update temperature
        alpha_loss = -self._alpha.exp() * F.mean(log_pi +
                                                 self._temperature_target)
        self._alpha_optimizer.target.cleargrads()
        alpha_loss.backward()
        alpha_loss.unchain_backward()
        self._alpha_optimizer.update()

        self._update_target_network(self._target_q1, self._q1, self._tau)
        self._update_target_network(self._target_q2, self._q2, self._tau)
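_update_target_network is not shown in this example; under the usual soft-update assumption it performs Polyak averaging of the parameters. A minimal sketch of that assumption:

def _update_target_network(self, target, source, tau):
    # soft update: target <- (1 - tau) * target + tau * source
    for target_param, param in zip(target.params(), source.params()):
        target_param.data = (1.0 - tau) * target_param.data + tau * param.data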
Example 5
def loss_func_dsgan(x, z, theta, tau=10):
    if x.shape[1] == 4:
        x = x[:, :3]
    loss_ds_1 = F.batch_l2_norm_squared(x[::2] - x[1::2]) / (F.batch_l2_norm_squared(z[::2] - z[1::2]) + 1e-8)
    loss_ds_2 = F.batch_l2_norm_squared(x[::2] - x[1::2]) / (F.absolute(theta[::2] - theta[1::2]) + 1e-8) / 1000
    xp = chainer.cuda.get_array_module(x.array)
    loss_ds_1 = F.minimum(F.sqrt(loss_ds_1), xp.full_like(loss_ds_1.array, tau))
    loss_ds_2 = F.minimum(F.sqrt(loss_ds_2), xp.full_like(loss_ds_2.array, tau))
    print(loss_ds_1.array.mean(), loss_ds_2.array.mean())
    return -F.mean(loss_ds_1) - F.mean(loss_ds_2)
Example 6
File: ppo.py Project: natsumeS/ML
    def update_model(self):
        # start minibatch learning
        for t in range(self.num_train_per_episode):
            # get learning data
            with self.lock:
                states, actions, advantages = self.get_data_from_train_buffer()
            # get policy and value
            policies, values = self.model(states)
            old_policies, _ = self.old_model(states)

            # calculate loss
            loss_v = F.squared_error(values,
                                     np.array(advantages).astype(np.float32))
            loss_ent = -policies.entropy()

            r = (policies.get_prob(actions) +
                 1.0e-10) / (old_policies.get_prob(actions) + 1.0e-10)
            loss_clip = (advantages - values.data) * F.minimum(
                r, F.clip(r, 1.0 - self.eps, 1.0 + self.eps))

            loss = F.mean(-loss_clip + loss_v * 0.2 + 0.01 * loss_ent)

            self.model.cleargrads()
            loss.backward()
            self.optimizer.update()
        # update old model
        self.old_model = self.copy_model()
        self.clear_buffer()
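For comparison, the textbook PPO clipped surrogate (a sketch of the standard formulation, which differs slightly from this project's loss_clip above) clips the probability ratio and takes the elementwise minimum of the two ratio-advantage products:

import chainer.functions as F

def clipped_surrogate(ratio, advantage, eps=0.2):
    # L_CLIP = E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)]
    surr1 = ratio * advantage
    surr2 = F.clip(ratio, 1.0 - eps, 1.0 + eps) * advantage
    return F.mean(F.minimum(surr1, surr2))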
Example 7
    def __call__(self, noise, tag, depth, alpha):
        h = F.reshape(noise, (len(noise), -1, 1, 1))
        h = F.leaky_relu(self.c0(h, False))
        h = F.leaky_relu(self.c1(h))

        for i in range(depth - 1):
            h = getattr(self, "b%d" % i)(h)

        if 0 < depth and alpha < 1:
            h2 = getattr(self, "b%d" % (depth - 1))(h, True)
            if depth == 1:
                #h = F.tanh(self.to_RGB(h))
                h = self.to_RGB(h, False)
            else:
                h = getattr(self, "b%d" % (depth - 2)).to_RGB(h, False)
            h = F.unpooling_2d(h,
                               2,
                               2,
                               0,
                               outsize=(2 * h.shape[2], 2 * h.shape[3]))

            h = h * (1.0 - alpha) + h2 * alpha
        elif depth == 0:
            #h = F.tanh(self.to_RGB(h))
            h = self.to_RGB(h, False)
        else:
            h = getattr(self, "b%d" % (depth - 1))(h, True)

        # clamp outputs to at most 1; derive xp so this works on CPU and GPU
        xp = chainer.backend.get_array_module(h.array)
        h = F.minimum(h, xp.ones(h.shape).astype(np.float32))
        #h = F.maximum(h, -1 * xp.ones(h.shape).astype(np.float32))

        return h
Example 8
def optimize_surrogate_loss(iterator, model, optimizer, alpha, args):
    optimizer.target.cleargrads()

    batch = iterator.next()
    s_current, action, _, _, log_likelihood, v_target, advantage = concat_examples(
        batch, device=args.gpu)

    log_pi_theta = model.compute_log_likelihood(s_current, action)
    log_pi_theta_old = log_likelihood
    # print('log_pi_theta: ', log_pi_theta, ' shape: ', log_pi_theta.shape)
    # print('log_pi_theta_old: ', log_pi_theta_old, ' shape: ', log_pi_theta_old.shape)
    # division of probability is exponential of difference between log probability
    probability_ratio = F.exp(log_pi_theta - log_pi_theta_old)
    clipped_ratio = F.clip(probability_ratio, 1 - args.epsilon * alpha,
                           1 + args.epsilon * alpha)
    lower_bounds = F.minimum(probability_ratio * advantage,
                             clipped_ratio * advantage)
    clip_loss = F.mean(lower_bounds)

    value = model.value(s_current)
    xp = chainer.backend.get_array_module(v_target)
    v_target = xp.reshape(v_target, newshape=value.shape)
    # print('value: ', value, ' shape: ', value.shape)
    # print('v_target: ', v_target, ' shape: ', v_target.shape)
    value_loss = F.mean_squared_error(value, v_target)

    entropy = model.compute_entropy(s_current)
    entropy_loss = F.mean(entropy)

    loss = -clip_loss + args.vf_coeff * value_loss - args.entropy_coeff * entropy_loss

    # Update parameter
    loss.backward()
    optimizer.update()
    loss.unchain_backward()
Example 9
    def update_policy_and_temperature(self, batch):
        """Compute loss for actor."""

        batch_state = batch['state']

        action_distrib = self.policy(batch_state)
        actions, log_prob = action_distrib.sample_with_log_prob()
        q1 = self.q_func1(batch_state, actions)
        q2 = self.q_func2(batch_state, actions)
        q = F.minimum(q1, q2)

        entropy_term = self.temperature * log_prob[..., None]
        assert q.shape == entropy_term.shape
        loss = F.mean(entropy_term - q)

        self.policy_optimizer.update(lambda: loss)

        if self.entropy_target is not None:
            self.update_temperature(log_prob.array)

        # Record entropy
        with chainer.no_backprop_mode():
            try:
                self.entropy_record.extend(
                    cuda.to_cpu(action_distrib.entropy.array))
            except NotImplementedError:
                # Record - log p(x) instead
                self.entropy_record.extend(cuda.to_cpu(-log_prob.array))
Example 10
    def _lossfun(self, entropy, vs_pred, log_probs, vs_pred_old, log_probs_old,
                 advs, vs_teacher):

        prob_ratio = F.exp(log_probs - log_probs_old)

        loss_policy = -F.mean(
            F.minimum(
                prob_ratio * advs,
                F.clip(prob_ratio, 1 - self.clip_eps, 1 + self.clip_eps) *
                advs))

        if self.clip_eps_vf is None:
            loss_value_func = F.mean_squared_error(vs_pred, vs_teacher)
        else:
            loss_value_func = F.mean(
                F.maximum(
                    F.square(vs_pred - vs_teacher),
                    F.square(
                        _elementwise_clip(vs_pred, vs_pred_old -
                                          self.clip_eps_vf, vs_pred_old +
                                          self.clip_eps_vf) - vs_teacher)))
        loss_entropy = -F.mean(entropy)

        self.value_loss_record.append(float(loss_value_func.array))
        self.policy_loss_record.append(float(loss_policy.array))

        loss = (loss_policy + self.value_func_coef * loss_value_func +
                self.entropy_coef * loss_entropy)

        return loss
Example 11
def check_forward(self, x1_data, x2_data, y_expected):
    x1 = chainer.Variable(x1_data)
    x2 = chainer.Variable(x2_data)
    y = functions.minimum(x1, x2)
    self.assertEqual(y.data.dtype, self.dtype)
    testing.assert_allclose(y_expected, y.data,
                            **self.check_forward_options)
Example 12
    def update_q_func(self, batch):
        """Compute loss for a given Q-function."""

        batch_next_state = batch['next_state']
        batch_rewards = batch['reward']
        batch_terminal = batch['is_state_terminal']
        batch_state = batch['state']
        batch_actions = batch['action']
        batch_discount = batch['discount']

        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            next_actions = self.target_policy_smoothing_func(
                self.target_policy(batch_next_state).sample().array)
            next_q1 = self.target_q_func1(batch_next_state, next_actions)
            next_q2 = self.target_q_func2(batch_next_state, next_actions)
            next_q = F.minimum(next_q1, next_q2)

            target_q = batch_rewards + batch_discount * \
                (1.0 - batch_terminal) * F.flatten(next_q)

        predict_q1 = F.flatten(self.q_func1(batch_state, batch_actions))
        predict_q2 = F.flatten(self.q_func2(batch_state, batch_actions))

        loss1 = F.mean_squared_error(target_q, predict_q1)
        loss2 = F.mean_squared_error(target_q, predict_q2)

        # Update stats
        self.q1_record.extend(cuda.to_cpu(predict_q1.array))
        self.q2_record.extend(cuda.to_cpu(predict_q2.array))
        self.q_func1_loss_record.append(float(loss1.array))
        self.q_func2_loss_record.append(float(loss2.array))

        self.q_func1_optimizer.update(lambda: loss1)
        self.q_func2_optimizer.update(lambda: loss2)
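The pattern above is clipped double Q-learning: taking the minimum of the two target critics keeps overestimation in either network from leaking into the target. A minimal sketch of the target computation, assuming flat reward, terminal, and discount arrays:

import chainer.functions as F

def td_target(reward, terminal, discount, next_q1, next_q2):
    # y = r + gamma * (1 - done) * min(Q1'(s', a'), Q2'(s', a'))
    next_q = F.minimum(next_q1, next_q2)
    return reward + discount * (1.0 - terminal) * F.flatten(next_q)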
Example 13
def __call__(self, x):
    x = F.log_softmax(x)
    h = x + x * F.broadcast_to(self.W, x.shape) + F.broadcast_to(self.b, x.shape)
    # numerically stable log(exp(h) + exp(self.lb)) via the log-sum-exp trick
    mx = F.maximum(h, F.broadcast_to(self.lb, x.shape))
    mn = F.minimum(h, F.broadcast_to(self.lb, x.shape))
    y = mx + F.log(1.0 + F.exp(mn - mx))
    return y
Example 14
def check_forward(self, x1_data, x2_data, y_expected):
    x1 = chainer.Variable(x1_data)
    x2 = chainer.Variable(x2_data)
    y = functions.minimum(x1, x2)
    self.assertEqual(y.data.dtype, self.dtype)
    testing.assert_allclose(
        y_expected, y.data, **self.check_forward_options)
Example 15
def minimum(self, a, b):
    assert a.dtype == b.dtype
    if a.dtype.name.startswith('float'):
        x = F.minimum(a, b)
    else:
        x = Variable(np.minimum(a.data, b.data))
    return x
Example 16
    def train(self, replay_buffer, iterations, d, clip_value, gamma, tau):
        if not self._initialized:
            self._initialize_target_networks()
        iterator = self._prepare_iterator(replay_buffer)
        for i in range(iterations):
            batch = iterator.next()
            s_current, action, r, s_next, non_terminal = concat_examples(
                batch, device=self._device)

            # TD3 target policy smoothing: the target action comes from the
            # next state, perturbed with clipped noise
            epsilon = F.clip(
                self._sample_action_noise(shape=(self._batch_size,)),
                -clip_value, clip_value)
            target_pi = self._target_pi(s_next)
            assert target_pi.shape == epsilon.shape
            a_tilde = target_pi + epsilon

            target_q1 = self._target_q1(s_next, a_tilde)
            target_q2 = self._target_q2(s_next, a_tilde)

            r = F.reshape(r, shape=(*r.shape, 1))
            non_terminal = F.reshape(non_terminal,
                                     shape=(*non_terminal.shape, 1))
            min_q = F.minimum(target_q1, target_q2)
            # print('r shape: ', r.shape)
            # print('done shape: ', non_terminal.shape)
            # print('min q shape: ', min_q.shape)

            y = r + gamma * non_terminal * min_q
            # print('y shape: ', y.shape)
            # Remove reference to avoid unexpected gradient update
            y.unchain()

            q1 = self._q1(s_current, action)
            q1_loss = F.mean_squared_error(y, q1)
            q2 = self._q2(s_current, action)
            q2_loss = F.mean_squared_error(y, q2)
            critic_loss = q1_loss + q2_loss

            self._q1_optimizer.target.cleargrads()
            self._q2_optimizer.target.cleargrads()
            critic_loss.backward()
            critic_loss.unchain_backward()
            self._q1_optimizer.update()
            self._q2_optimizer.update()

            if i % d == 0:
                a = self._pi(s_current)
                q1 = self._q1(s_current, a)

                pi_loss = -F.mean(q1)

                self._pi_optimizer.target.cleargrads()
                pi_loss.backward()
                pi_loss.unchain_backward()
                self._pi_optimizer.update()

                self._update_target_network(self._target_q1, self._q1, tau)
                self._update_target_network(self._target_q2, self._q2, tau)
                self._update_target_network(self._target_pi, self._pi, tau)
Example 17
def multi_overlap(x1, len1, x2, len2):
    len1_half = len1/2
    len2_half = len2/2

    left = F.maximum(x1 - len1_half, x2 - len2_half)
    right = F.minimum(x1 + len1_half, x2 + len2_half)

    return right - left
Example 18
def multi_overlap(x1, len1, x2, len2):
    len1_half = len1 / 2
    len2_half = len2 / 2

    left = F.maximum(x1 - len1_half, x2 - len2_half)
    right = F.minimum(x1 + len1_half, x2 + len2_half)

    return right - left
Example 19
    def calc_height_loss(self, height):
        # penalize bboxes that are not high enough to contain text (10 pixels)
        shifted_height = height - 10
        thresholded_height = F.minimum(shifted_height,
                                       self.xp.zeros_like(shifted_height))
        thresholded_height *= -1

        return F.average(thresholded_height)
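Equivalently, the minimum-then-negate pair collapses to a single ReLU hinge on the height deficit; a sketch assuming the same 10-pixel threshold:

import chainer.functions as F

def calc_height_loss(height, min_height=10):
    # hinge penalty grows linearly as height drops below min_height
    return F.average(F.relu(min_height - height))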
Example 20
def occupancy_grid_3d(points, *, pitch, origin, dims, threshold=1):
    d_IP, d_JP, d_KP = OccupancyGrid3D(pitch=pitch, origin=origin,
                                       dims=dims)(points)
    d_IJKP = F.sqrt(d_IP**2 + d_JP**2 + d_KP**2)
    d_IJK = F.min(d_IJKP, axis=3)
    m_IJK = F.relu(threshold - d_IJK)
    m_IJK = F.minimum(m_IJK, m_IJK.array * 0 + 1)  # cap at 1 (relu already enforces >= 0)
    return m_IJK
Example 21
def greedy_actions(self):
    a = self.mu
    if self.min_action is not None:
        a = F.maximum(
            self.xp.broadcast_to(self.min_action, a.data.shape), a)
    if self.max_action is not None:
        a = F.minimum(
            self.xp.broadcast_to(self.max_action, a.data.shape), a)
    return a
Example 22
def occupancy_grid_2d(points, *, pitch, origin, dimension, threshold=1):
    d_IK, d_JK = OccupancyGrid2D(pitch=pitch,
                                 origin=origin,
                                 dimension=dimension)(points)
    d_IJK = F.sqrt(d_IK**2 + d_JK**2)
    m_IJK = F.relu(threshold - F.absolute(d_IJK))
    m_IJK = F.minimum(m_IJK, m_IJK.array * 0 + 1)  # cap at 1 (relu already enforces >= 0)
    m = F.max(m_IJK, axis=2)
    return m
Example 23
    def calc_bboxes(self, predicted_bboxes, image_size, out_size):
        predicted_bboxes = (predicted_bboxes + 1) / 2
        x_points = predicted_bboxes[:, 0, ...] * image_size.width
        y_points = predicted_bboxes[:, 1, ...] * image_size.height
        top_left_x = F.get_item(x_points, [..., 0, 0])
        top_left_y = F.get_item(y_points, [..., 0, 0])
        bottom_right_x = F.get_item(x_points, [..., out_size.height - 1, out_size.width - 1])
        bottom_right_y = F.get_item(y_points, [..., out_size.height - 1, out_size.width - 1])

        bboxes = F.stack(
            [
                F.minimum(top_left_x, bottom_right_x),
                F.minimum(top_left_y, bottom_right_y),
                F.maximum(top_left_x, bottom_right_x),
                F.maximum(top_left_y, bottom_right_y),
            ],
            axis=1
        )
        return bboxes
Example 24
def greedy_actions(self):
    with chainer.force_backprop_mode():
        a = self.mu
        if self.min_action is not None:
            a = F.maximum(
                self.xp.broadcast_to(self.min_action, a.array.shape), a)
        if self.max_action is not None:
            a = F.minimum(
                self.xp.broadcast_to(self.max_action, a.array.shape), a)
        return a
Example 25
def _compute_ppo_loss(self, obs, acts, at, vt, old_params):
    params = self._pi_f(obs)
    cv = F.flatten(self._vf_f(obs))
    ratio = F.exp(self._logp(params, acts) - self._logp(old_params, acts))
    surr1 = ratio * at
    surr2 = F.clip(ratio, 1 - self._ppo_clipparam,
                   1 + self._ppo_clipparam) * at
    ppo_surr_loss = (
        -sym_mean(F.minimum(surr1, surr2)) +
        self._ppo_klcoeff * sym_mean(self.kl(old_params, params)) +
        sym_mean(F.square(cv - vt)))
    return ppo_surr_loss
Example 26
def batched_triangle_intersect_(p0, p1, p2, eps, fn, id, ro, rd, t0, t1):
    xp = chainer.backend.get_array_module(ro)
    BB = p0.shape[0]
    EB = eps.shape[0]  # eps carries its own batch dimension
    _, _, H, W = ro.shape[:4]

    p0 = F.broadcast_to(p0.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    p1 = F.broadcast_to(p1.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    p2 = F.broadcast_to(p2.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    fn = F.broadcast_to(fn.reshape((BB, 3, 1, 1)), (BB, 3, H, W))
    id = F.broadcast_to(id.reshape((BB, 1, 1, 1)), (BB, 1, H, W))
    eps = F.broadcast_to(eps.reshape((EB, 1, 1, 1)), (BB, 1, H, W))
    ro = F.broadcast_to(ro.reshape((1, 3, H, W)), (BB, 3, H, W))
    rd = F.broadcast_to(rd.reshape((1, 3, H, W)), (BB, 3, H, W))
    t0 = F.broadcast_to(t0.reshape((1, 1, H, W)), (BB, 1, H, W))
    t1 = F.broadcast_to(t1.reshape((1, 1, H, W)), (BB, 1, H, W))

    aa = p0 - ro

    A = vdot(aa, fn)
    B = vdot(rd, fn)
    B = F.where(xp.abs(B.data) < eps.data, eps, B)

    #tx = F.where((xp.abs(A.data) < 1e-6)&(xp.abs(B.data) < 1e-6), t1, A / B)

    tx = F.maximum(t0, F.minimum(A / B, t1))
    p = ro + tx * rd

    e0 = p0.data - p.data
    e1 = p1.data - p.data
    e2 = p2.data - p.data
    n01 = vcross_(e0, e1, xp)
    n12 = vcross_(e1, e2, xp)
    n20 = vcross_(e2, e0, xp)

    MASK_P = is_positive_(vdot_(n01, n12, xp))
    MASK_Q = is_positive_(vdot_(n12, n20, xp))
    MASK_R = is_positive_(vdot_(n20, n01, xp))

    MASK_B = is_positive_(xp.abs(B.data))

    #MASK_TN = is_positive(tx)
    MASK_T0 = is_positive_(tx.data - t0.data)
    MASK_T1 = is_positive_(t1.data - tx.data)

    b = MASK_P & MASK_Q & MASK_R & MASK_B & MASK_T0 & MASK_T1

    t = F.where(b, tx, t1)
    p = ro + t * rd

    n = -xp.sign(vdot_(rd.data, fn.data, xp)) * fn

    return b, t, p, n, id
Example 27
def get_aabb_corners(grids, image_size):
    _, _, height, width = grids.shape
    grids = (grids + 1) / 2
    x_points = grids[:, 0, ...] * image_size.width
    y_points = grids[:, 1, ...] * image_size.height
    x_points = F.clip(x_points, 0., float(image_size.width))
    y_points = F.clip(y_points, 0., float(image_size.height))
    top_left_x = F.get_item(x_points, [..., 0, 0])
    top_left_y = F.get_item(y_points, [..., 0, 0])
    top_right_x = F.get_item(x_points, [..., 0, width - 1])
    top_right_y = F.get_item(y_points, [..., 0, width - 1])
    bottom_right_x = F.get_item(x_points, [..., height - 1, width - 1])
    bottom_right_y = F.get_item(y_points, [..., height - 1, width - 1])
    bottom_left_x = F.get_item(x_points, [..., height - 1, 0])
    bottom_left_y = F.get_item(y_points, [..., height - 1, 0])

    top_left_x_aabb = F.minimum(top_left_x, bottom_left_x)
    top_left_y_aabb = F.minimum(top_left_y, top_right_y)
    bottom_right_x_aabb = F.maximum(top_right_x, bottom_right_x)
    bottom_right_y_aabb = F.maximum(bottom_left_y, bottom_right_y)

    return top_left_y_aabb, top_left_x_aabb, bottom_right_y_aabb, bottom_right_x_aabb
Example 28
    def _perform_gradient_step(self, iterator):
        batch = iterator.next()
        s_current, action, r, s_next, non_terminal = \
            concat_examples(batch, device=self._device)
        r = F.reshape(r, shape=(*r.shape, 1))
        non_terminal = F.reshape(non_terminal, shape=(*non_terminal.shape, 1))

        pi_action, log_pi = self._pi.action_with_log_pi(s_current)
        log_pi = F.reshape(log_pi, shape=(*log_pi.shape, 1))

        q1 = self._q1(s_current, pi_action)
        q2 = self._q2(s_current, pi_action)
        min_q = F.minimum(q1, q2)

        v_target = min_q - log_pi
        v_target.unchain()
        v = self._v(s_current)

        # update v
        v_loss = 0.5 * F.mean_squared_error(v_target, v)
        self._v_optimizer.target.cleargrads()
        v_loss.backward()
        self._v_optimizer.update()

        # update pi
        # Original implementation uses q1 as target instead of min_q
        pi_loss = F.mean(log_pi - q1)
        self._pi_optimizer.target.cleargrads()
        pi_loss.backward()
        self._pi_optimizer.update()

        v_loss.unchain_backward()
        pi_loss.unchain_backward()

        # update q functions
        q_target = r + self._gamma * non_terminal * self._v_target(s_next)
        q_target.unchain()
        q1 = self._q1(s_current, action)
        q2 = self._q2(s_current, action)
        q1_loss = 0.5 * F.mean_squared_error(q_target, q1)
        q2_loss = 0.5 * F.mean_squared_error(q_target, q2)
        q_loss = q1_loss + q2_loss

        self._q1_optimizer.target.cleargrads()
        self._q2_optimizer.target.cleargrads()
        q_loss.backward()
        q_loss.unchain_backward()
        self._q1_optimizer.update()
        self._q2_optimizer.update()

        self._update_target_network(self._v_target, self._v, self._tau)
Example 29
File: sac.py Project: toy101/DSAC
    def update_q_func(self, batch):
        """Compute loss for a given Q-function."""

        batch_next_state = batch['next_state']
        batch_rewards = batch['reward']
        batch_terminal = batch['is_state_terminal']
        batch_state = batch['state']
        batch_actions = batch['action']
        batch_discount = batch['discount']

        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            next_action_distrib = self.policy(batch_next_state)
            next_actions, next_log_prob =\
                next_action_distrib.sample_with_log_prob()
            entropy_term = self.temperature * next_log_prob
            if self.is_discrete:
                next_q1 = F.select_item(self.target_q_func1(batch_next_state),
                                        next_actions)
                next_q2 = F.select_item(self.target_q_func2(batch_next_state),
                                        next_actions)
            else:
                next_q1 = self.target_q_func1(batch_next_state, next_actions)
                next_q2 = self.target_q_func2(batch_next_state, next_actions)
                entropy_term = entropy_term[..., None]
            next_q = F.minimum(next_q1, next_q2)
            assert next_q.shape == entropy_term.shape

            target_q = batch_rewards + batch_discount * \
                (1.0 - batch_terminal) * F.flatten(next_q - entropy_term)

        if self.is_discrete:
            predict_q1 = F.flatten(
                F.select_item(self.q_func1(batch_state), batch_actions))
            predict_q2 = F.flatten(
                F.select_item(self.q_func2(batch_state), batch_actions))
        else:
            predict_q1 = F.flatten(self.q_func1(batch_state, batch_actions))
            predict_q2 = F.flatten(self.q_func2(batch_state, batch_actions))

        loss1 = 0.5 * F.mean_squared_error(target_q, predict_q1)
        loss2 = 0.5 * F.mean_squared_error(target_q, predict_q2)

        # Update stats
        self.q1_record.extend(cuda.to_cpu(predict_q1.array))
        self.q2_record.extend(cuda.to_cpu(predict_q2.array))
        self.q_func1_loss_record.append(float(loss1.array))
        self.q_func2_loss_record.append(float(loss2.array))

        self.q_func1_optimizer.update(lambda: loss1)
        self.q_func2_optimizer.update(lambda: loss2)
Example 30
    def update_q_func(self, batch):
        """Compute loss for a given Q-function."""

        batch_next_state = batch['next_state']
        batch_rewards = batch['reward']
        batch_terminal = batch['is_state_terminal']
        batch_state = batch['state']
        batch_actions = batch['action']
        batch_discount = batch['discount']

        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            next_action_distrib = self.policy(batch_next_state)
            next_actions, next_log_prob =\
                next_action_distrib.sample_with_log_prob()
            next_q1 = self.target_q_func1(batch_next_state, next_actions)
            next_q2 = self.target_q_func2(batch_next_state, next_actions)
            next_q = F.minimum(next_q1, next_q2)
            entropy_term = self.temperature * next_log_prob[..., None]
            assert next_q.shape == entropy_term.shape

            target_q = batch_rewards + batch_discount * \
                (1.0 - batch_terminal) * F.flatten(next_q - entropy_term)

        predict_q1 = F.flatten(self.q_func1(batch_state, batch_actions))
        predict_q2 = F.flatten(self.q_func2(batch_state, batch_actions))

        loss1 = 0.5 * F.mean_squared_error(target_q, predict_q1)
        loss2 = 0.5 * F.mean_squared_error(target_q, predict_q2)

        if self.use_mutual_learning:
            for idx, agent in enumerate(self.all_agents):
                if idx != self.assigned_idx:
                    #self.logger.info('Mutual learn Q')
                    other_predict_q1 = F.flatten(
                        agent.q_func1(batch_state, batch_actions))
                    other_predict_q2 = F.flatten(
                        agent.q_func2(batch_state, batch_actions))
                    loss1 += 0.5 * F.mean_squared_error(
                        predict_q1, other_predict_q1)
                    loss2 += 0.5 * F.mean_squared_error(
                        predict_q2, other_predict_q2)

        # Update stats
        self.q1_record.extend(cuda.to_cpu(predict_q1.array))
        self.q2_record.extend(cuda.to_cpu(predict_q2.array))
        self.q_func1_loss_record.append(float(loss1.array))
        self.q_func2_loss_record.append(float(loss2.array))

        self.q_func1_optimizer.update(lambda: loss1)
        self.q_func2_optimizer.update(lambda: loss2)
Example 31
def clamp(x, lower, upper):
    """Naive clamping as in [2] A:
    [[ x ]]_b = min(max(x, b_lower), b_upper)
    :param x:
    :param lower:
    :param upper:
    :return:
    """
    # Not None
    assert x.shape == lower.shape
    assert x.shape == upper.shape
    assert (lower.array <= upper.array).all(), \
        "lower is larger than upper; lower: " + str(lower) + " upper: " + str(upper)
    return F.minimum(F.maximum(x, lower), upper)
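A quick usage sketch, assuming float32 numpy inputs wrapped as variables:

import numpy as np
import chainer

x = chainer.Variable(np.array([-2.0, 0.5, 3.0], dtype=np.float32))
lo = chainer.Variable(np.zeros(3, dtype=np.float32))
hi = chainer.Variable(np.ones(3, dtype=np.float32))
print(clamp(x, lo, hi).array)  # [0.  0.5 1. ]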
Example 32
    def calc_loss(self, grids, image_size):
        top_left_x, top_right_x, bottom_left_x, top_left_y, top_right_y, bottom_left_y = self.get_corners(grids, image_size, scale_to_image_size=False)
        # determine whether a point is out of the image, image range is [-1, 1]
        # everything outside of this increases the loss!
        bbox = F.concat([top_left_x, top_left_y, top_right_x, bottom_left_y], axis=0)
        top_loss = bbox + 1
        bottom_loss = bbox - 1

        # do not penalize anything inside the image
        top_loss = F.absolute(F.minimum(top_loss, self.xp.zeros_like(top_loss)))
        bottom_loss = F.maximum(bottom_loss, self.xp.zeros_like(bottom_loss))

        loss = F.sum(F.concat([top_loss, bottom_loss], axis=0))
        return loss
Example 33
    def __call__(self, t, condition):
        # t(timesteps): 1-T

        distribution = chainer.distributions.Normal(
            self.xp.array(0, dtype='f'), self.xp.array(1, dtype='f'))
        z = distribution.sample(t.shape)
        # z(timesteps): 1-T

        condition = self.encoder(condition)
        # condition(timesteps): 1-T

        s_means, s_scales = self.student(z, condition)
        s_clipped_scales = F.maximum(
            s_scales, self.scalar_to_tensor(s_scales, -7))
        # s_means, s_scales(timesteps): 2-(T+1)

        x = z[:, :, 1:] * F.exp(s_scales[:, :, :-1]) + s_means[:, :, :-1]
        # x(timesteps): 2-T

        with chainer.using_config('train', False):
            y = self.teacher(x, condition[:, :, 1:])
        t_means, t_scales = y[:, 1:2], y[:, 2:3]
        t_clipped_scales = F.maximum(
            t_scales, self.scalar_to_tensor(t_scales, -7))
        # t_means, t_scales(timesteps): 3-(T+1)

        s_distribution = chainer.distributions.Normal(
            s_means[:, :, 1:], log_scale=s_clipped_scales[:, :, 1:])
        t_distribution = chainer.distributions.Normal(
            t_means, log_scale=t_clipped_scales)
        # s_distribution, t_distribution(timesteps): 3-(T+1)

        kl = chainer.kl_divergence(s_distribution, t_distribution)
        kl = F.minimum(
            kl, self.scalar_to_tensor(kl, 100))
        kl = F.average(kl)

        regularization = F.mean_squared_error(
            t_scales, s_scales[:, :, 1:])

        spectrogram_frame_loss = F.mean_squared_error(
            self.stft.magnitude(t[:, :, 1:]), self.stft.magnitude(x))

        loss = kl + self.lmd * regularization + spectrogram_frame_loss
        chainer.reporter.report({
            'loss': loss, 'kl_divergence': kl,
            'regularization': regularization,
            'spectrogram_frame_loss': spectrogram_frame_loss}, self)
        return loss
Example 34
def forward(self, inputs, device):
    x1, x2 = inputs
    return functions.minimum(x1, x2),