Example #1
def inplace_function_test_helper(inputs, func, func_args=[], func_kwargs={}, ctx=None, rng=None):
    if rng is None:
        rng = np.random.RandomState(313)
    if ctx is None:
        ctx = nn.Context()
    with nn.context_scope(ctx):
        a_s = [inp * 1.0 for inp in inputs]
        y = func(*(a_s + list(func_args)), inplace=False, **func_kwargs)
        l = F.sum(y)
        a_s_i = [inp * 1.0 for inp in inputs]
        y_i = func(*(a_s_i + list(func_args)), inplace=True, **func_kwargs)
        l_i = F.sum(y_i)
    data = [(rng.randn(*inp.shape), rng.randn(*inp.shape)) for inp in inputs]
    for i in range(len(data)):
        inputs[i].d = data[i][0]
        inputs[i].g = data[i][1]
    l.forward()
    l.backward()
    grads = [inp.g.copy() for inp in inputs]
    for i in range(len(data)):
        inputs[i].d = data[i][0]
        inputs[i].g = data[i][1]
    l_i.forward()
    l_i.backward()
    grads_i = [inp.g.copy() for inp in inputs]
    for g, g_i in zip(grads, grads_i):
        assert np.allclose(g, g_i)
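A minimal way to exercise this helper, assuming `F.leaky_relu` accepts an `inplace` flag (used here purely as an illustrative candidate):

import nnabla as nn
import nnabla.functions as F

# Hypothetical check: compare in-place and out-of-place gradients of leaky_relu.
x = nn.Variable((2, 3), need_grad=True)
inplace_function_test_helper([x], F.leaky_relu)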
Example #2
def double_backward_for_global(g_dx0, g_db0, g_dg0,
                               dy, x0, b0, g0, rm, rv,
                               axes, decay_rate, eps):
    # Prerequisite
    # Axes to reduce over and the denominator term
    axes0 = [a for a in range(x0.ndim)]
    axes = list(set(axes0) - set(axes))
    # (variance + eps) ** (-1/2)
    v_eps_rsqrt1 = (rv + eps) ** (-1.0 / 2.0)

    # wrt. x
    g_x0 = g_dg0 * dy * v_eps_rsqrt1

    # wrt. beta
    # zero, do nothing

    # wrt. gamma
    g_g0 = F.sum(g_dx0 * dy * v_eps_rsqrt1, axes, True)

    # no backward wrt. rm and rv

    # wrt. dy
    g_dy = g_dx0 * g0 * v_eps_rsqrt1 \
        + g_dg0 * (x0 - rm) * v_eps_rsqrt1 + g_db0

    return g_dy, g_x0, None, g_g0
Example #3
    def build_train_graph(self, batch):
        self.solver = S.Adam(self.learning_rate)

        obs, action, reward, terminal, newobs = batch
        # Create input variables
        s = nn.Variable(obs.shape)
        a = nn.Variable(action.shape)
        r = nn.Variable(reward.shape)
        t = nn.Variable(terminal.shape)
        snext = nn.Variable(newobs.shape)
        with nn.parameter_scope(self.name_q):
            q = self.q_builder(s, self.num_actions, test=False)
            self.solver.set_parameters(nn.get_parameters())
        with nn.parameter_scope(self.name_qnext):
            qnext = self.q_builder(snext, self.num_actions, test=True)
        qnext.need_grad = False
        clipped_r = F.minimum_scalar(F.maximum_scalar(r, -self.clip_reward),
                                     self.clip_reward)
        q_a = F.sum(q * F.one_hot(F.reshape(a, (-1, 1), inplace=False),
                                  (q.shape[1], )),
                    axis=1)
        target = clipped_r + self.gamma * (1 - t) * F.max(qnext, axis=1)
        loss = F.mean(F.huber_loss(q_a, target))
        Variables = namedtuple('Variables',
                               ['s', 'a', 'r', 't', 'snext', 'q', 'loss'])
        self.v = Variables(s, a, r, t, snext, q, loss)
        self.sync_models()
        self.built = True
Example #4
def norm_normalization_backward(inputs, p=None, axes=None, eps=1e-12):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]

    if p is None:
        p = 2.0
    axes = list(range(x0.ndim)) if axes is None else force_list(axes)

    x_abs = F.abs(x0)
    x_pow = F.pow_scalar(x_abs, p)
    x_sum = F.sum(x_pow, axes, keepdims=True)
    # x_norm = x_sum ** (1./p)

    # Div2 backward
    dx = dy * x_sum**(-1. / p)
    dx_norm = -dy * x0 * x_sum**(-2. / p)
    dx_norm = sum_for_arithmetics(dx_norm, x_sum)

    # Norm backward
    x_sign = no_grad(F.sign(x0))
    dx += dx_norm * x_sum**(1. / p - 1.) * x_abs**(p - 1.) * x_sign

    return dx
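A calling sketch with made-up shapes, assuming the module-level helpers used above (`force_list`, `sum_for_arithmetics`, `no_grad`) are in scope:

import nnabla as nn

dy = nn.Variable((4, 8))  # incoming gradient w.r.t. the normalized output
x0 = nn.Variable((4, 8))  # input of the forward norm_normalization
dx = norm_normalization_backward([dy, x0], p=2.0, axes=1)
print(dx.shape)  # (4, 8)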
Example #5
def get_model(args,
              num_classes,
              test=False,
              channel_last=False,
              with_error=True):
    """
    Create computation graph and variables.
    """
    nn_in_size = 224
    if channel_last:
        image = nn.Variable([args.batch_size, nn_in_size, nn_in_size, 4])
    else:
        image = nn.Variable([args.batch_size, 4, nn_in_size, nn_in_size])
    label = nn.Variable([args.batch_size, 1])
    pred, hidden = model_resnet_nhwc.resnet_imagenet(image,
                                                     num_classes,
                                                     args.num_layers,
                                                     args.shortcut_type,
                                                     test=test,
                                                     tiny=False,
                                                     channel_last=channel_last)
    pred.persistent = True
    loss = F.mean(loss_function(pred, label, args.label_smoothing))
    error = F.sum(F.top_n_error(pred, label, n=1))
    Model = namedtuple('Model',
                       ['image', 'label', 'pred', 'loss', 'error', 'hidden'])
    return Model(image, label, pred, loss, error, hidden)
Example #6
def affine_backward(inputs, base_axis=1):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]
    w0 = inputs[2]

    base_axis += inputs[0].ndim * (base_axis < 0)

    ctx = nn.get_current_context()
    dfx = AffineDataGrad(ctx, base_axis)
    dfw = AffineFilterGrad(ctx, base_axis)
    dfx.xshape = x0.shape
    dfw.wshape = w0.shape

    dx0 = dfx(dy, w0)
    dw0 = dfw(dy, x0)

    if len(inputs) == 4:
        axes = [i for i in range(0, base_axis)]
        db0 = F.sum(dy, axes, keepdims=False)
        return dx0, dw0, db0
    else:
        return dx0, dw0
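A calling sketch with made-up shapes for `y = x0 w0 + b0`, assuming `AffineDataGrad`/`AffineFilterGrad` are importable as in the original module:

import nnabla as nn

dy = nn.Variable((16, 10))  # gradient w.r.t. the affine output
x0 = nn.Variable((16, 32))  # forward input
w0 = nn.Variable((32, 10))  # weight
b0 = nn.Variable((10,))     # bias (only its presence matters here)
dx0, dw0, db0 = affine_backward([dy, x0, w0, b0], base_axis=1)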
Example #7
def sigma_regularization(ctx, log_var, one):
    with nn.context_scope(ctx):
        h = F.exp(log_var)
        h = F.pow_scalar(h, 0.5)
        b = log_var.shape[0]
        r = F.sum(F.squared_error(h, one)) / b
    return r
Example #8
def disparityregression(x, maxdisp):
    disp = nn.Variable((x.shape), need_grad=False)
    for i in range(0, maxdisp):
        disp.d[:, :, i, :, :] = i
    dispx = F.mul2(disp, x)
    out = F.sum(dispx, axis=2)
    return out
Example #9
def norm_backward(inputs, p=None, axes=None, keep_dims=False):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]

    if p is None:
        p = 2.0
    axes = list(range(x0.ndim)) if axes is None else force_list(axes)

    x_abs = F.abs(x0)
    x_pow = F.pow_scalar(x_abs, p)
    x_sum = F.sum(x_pow, axes, keepdims=True)

    # Add axis for mul2
    if not keep_dims:
        shape = list(x0.shape)
        for a in axes:
            shape[a] = 1
        dy = dy.reshape(shape)

    x_sign = no_grad(F.sign(x0))
    dx = dy * x_sum**(1. / p - 1.) * x_abs**(p - 1.) * x_sign

    return dx
Example #10
def softmax_cross_entropy_with_label_smoothing(pred,
                                               label,
                                               label_smoothing=0.1):
    '''
    Defines softmax activation followed by cross entropy loss with label smoothing.

    The label smoothing loss is mixed in with the following weighting:
    `(1 - label_smoothing) * xent_loss + label_smoothing * label_smoothing_loss`

    Args:
        pred (Variable): Logits with a shape of `(batch_size, num_classes)`.
        label (Variable):
            A class index for each example if a shape of `(batch_size, 1)` is
            given, and a one-hot or probability over classes if
            `(batch_size, num_classes)`.
        label_smoothing (float):
            Coefficient of label smoothing loss. If 0, it omits label
            smoothing.


    '''
    logp = None
    if label.shape[1] > 1:
        # If mixup is enabled, the label shape is assumed to be (batch_size, num_classes)
        logp = F.log_softmax(pred)
        l = F.sum(-label * logp, axis=1, keepdims=True)
    else:
        l = F.softmax_cross_entropy(pred, label)
    return apply_label_smoothing(l, pred, label_smoothing, logp)
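Since `apply_label_smoothing` is not shown here, the weighting from the docstring can be sketched directly with nnabla primitives, assuming a uniform-distribution smoothing term:

import nnabla as nn
import nnabla.functions as F

def smoothed_xent_sketch(pred, label, label_smoothing=0.1):
    # (1 - ls) * cross entropy + ls * uniform smoothing term (assumption).
    xent = F.softmax_cross_entropy(pred, label)                    # (batch_size, 1)
    smooth = -F.mean(F.log_softmax(pred), axis=1, keepdims=True)   # (batch_size, 1)
    return (1 - label_smoothing) * xent + label_smoothing * smooth

pred = nn.Variable((8, 10))   # logits
label = nn.Variable((8, 1))   # class indices
loss = F.mean(smoothed_xent_sketch(pred, label))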
Example #11
def sigma_regularization(ctx, log_var, one):
    with nn.context_scope(ctx):
        h = F.exp(log_var)
        h = F.pow_scalar(h, 0.5)
        b = log_var.shape[0]
        r = F.sum(F.squared_error(h, one)) / b
    return r
Example #12
    def backward_impl(self, inputs, outputs, prop_down, accum):
        # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
        # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

        # Args
        axes = self.forward_func.info.args["axes"]
        keep_dims = self.forward_func.info.args["keep_dims"]

        # Inputs
        x0 = inputs[0].data
        dy = inputs[1].data
        # Outputs
        dx0 = outputs[0].data
        # Grads of inputs
        g_x0 = inputs[0].grad
        g_dy = inputs[1].grad
        # Grads of outputs
        g_dx0 = outputs[0].grad

        # Computation
        if prop_down[1]:
            g_dy_ = F.sum(g_dx0, axes, keep_dims)
            if accum[1]:
                g_dy += g_dy_
            else:
                g_dy.copy_from(g_dy_)
Example #13
    def backward_impl(self, inputs, outputs, prop_down, accum):
        # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
        # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

        # Args
        axes = self.forward_func.info.args["axes"]
        # Inputs
        x0 = inputs[0].data
        dy = inputs[1].data
        # Outputs
        dx0 = outputs[0].data
        # Grads of inputs
        g_x0 = inputs[0].grad
        g_dy = inputs[1].grad
        # Grads of outputs
        g_dx0 = outputs[0].grad

        # Compute
        # TODO: Optimize by creating max_pooling with indices
        if prop_down[1]:
            # dx0 is not accumulated on the backward graph
            mask = F.not_equal_scalar(dx0, 0.0)
            g_dy_ = F.sum(g_dx0 * mask, axes)
            if accum[1]:
                g_dy += g_dy_
            else:
                g_dy.copy_from(g_dy_)
Example #14
    def jacobian(self, coordinates):
        new_coordinates = self.warp_coordinates(coordinates)
        new_coordinates_x = F.slice(new_coordinates,
                                    start=(0, 0, 0),
                                    stop=new_coordinates.shape[:2] + (1, ))
        grad_x = nn.grad([F.sum(new_coordinates_x)], [coordinates])
        new_coordinates_y = F.slice(new_coordinates,
                                    start=(0, 0, 1),
                                    stop=new_coordinates.shape[:2] + (2, ))
        grad_y = nn.grad([F.sum(new_coordinates_y)], [coordinates])
        gx = F.reshape(grad_x[0],
                       grad_x[0].shape[:-1] + (1, ) + grad_x[0].shape[-1:])
        gy = F.reshape(grad_y[0],
                       grad_y[0].shape[:-1] + (1, ) + grad_y[0].shape[-1:])
        jacobian = F.concatenate(gx, gy, axis=gy.ndim - 2)
        return jacobian
Example #15
def kl_divergence(ctx, pred, label, log_var):
    with nn.context_scope(ctx):
        s = F.pow_scalar(F.exp(log_var), 0.5)
        elms = softmax_with_temperature(ctx, label, s) \
               * F.log(F.softmax(pred, axis=1))
        loss = -F.mean(F.sum(elms, axis=1))
    return loss
Example #16
def siamese_loss(e0, e1, t, margin=1.0, eps=1e-4):
    dist = F.sum(F.squared_error(e0, e1), axis=1)  # Squared distance
    # Contrastive loss
    sim_cost = t * dist
    dissim_cost = (1 - t) * (F.maximum_scalar(margin -
                                              (dist + eps)**(0.5), 0)**2)
    return F.mean(sim_cost + dissim_cost)
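A rough usage sketch; the batch size and embedding size below are arbitrary:

import nnabla as nn

e0 = nn.Variable((32, 64))  # embeddings from the first branch
e1 = nn.Variable((32, 64))  # embeddings from the second branch
t = nn.Variable((32,))      # 1 for similar pairs, 0 for dissimilar pairs
loss = siamese_loss(e0, e1, t, margin=1.0)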
Example #17
def build_model():
    x = nn.Variable((batch_size, sentence_length_source))
    input_mask = F.sign(
        F.reshape(F.slice(x), (batch_size, sentence_length_source, 1)))
    y = nn.Variable((batch_size, sentence_length_target))

    enc_input = time_distributed(PF.embed)(x,
                                           vocab_size_source,
                                           embedding_size,
                                           name='enc_embeddings')  #*input_mask
    # -> (batch_size, sentence_length_source, embedding_size)
    dec_input = time_distributed(PF.embed)(y,
                                           vocab_size_target,
                                           embedding_size,
                                           name='dec_embeddings')
    # -> (batch_size, sentence_length_target, embedding_size)

    # encoder
    with nn.parameter_scope('encoder'):
        output, c, h = LSTMEncoder(enc_input,
                                   hidden,
                                   return_sequences=True,
                                   return_state=True)
        # -> (batch_size, sentence_length_source, hidden), (batch_size, hidden), (batch_size, hidden)

    # decoder
    output = LSTMAttentionDecoder(dec_input,
                                  output,
                                  initial_state=(c, h),
                                  return_sequences=True,
                                  name='decoder')
    # -> (batch_size, sentence_length_target, hidden)
    output = time_distributed(PF.affine)(output,
                                         vocab_size_target,
                                         name='output')
    # -> (batch_size, sentence_length_target, vocab_size_target)

    t = F.reshape(F.slice(y), (batch_size, sentence_length_target, 1))

    entropy = time_distributed_softmax_cross_entropy(output, t)

    mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
    count = F.sum(mask, axis=1)

    entropy *= mask
    loss = F.mean(F.sum(entropy, axis=1) / count)
    return x, y, loss
Example #18
def mnist_lenet_siamese(x0, x1, test=False):
    """"""
    h0 = mnist_lenet_feature(x0, test)
    h1 = mnist_lenet_feature(x1, test)  # share weights
    # h = (h0 - h1) ** 2 # equivalent
    h = F.squared_error(h0, h1)
    p = F.sum(h, axis=1)
    return p
Example #19
def mnist_lenet_siamese(x0, x1, test=False):
    """"""
    h0 = mnist_lenet_feature(x0, test)
    h1 = mnist_lenet_feature(x1, test)  # share weights
    # h = (h0 - h1) ** 2 # equivalent
    h = F.squared_error(h0, h1)
    p = F.sum(h, axis=1)
    return p
Example #20
def deconvolution_backward(inputs,
                           base_axis=1,
                           pad=None,
                           stride=None,
                           dilation=None,
                           group=1,
                           channel_last=False,
                           output_padding=None):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]
    w0 = inputs[2]

    base_axis += x0.ndim * (base_axis < 0)
    # base_axis += inputs[0].ndim*(base_axis < 0)

    ctx = nn.get_current_context()
    dfx = DeconvolutionDataGrad(ctx, base_axis, pad, stride, dilation, group,
                                channel_last, output_padding)
    dfw = DeconvolutionFilterGrad(ctx, base_axis, pad, stride, dilation, group,
                                  channel_last, output_padding)
    dfx.xshape = x0.shape
    dfw.wshape = w0.shape

    dx0 = dfx(dy, w0)
    dw0 = dfw(dy, x0)

    if len(inputs) == 4:
        if channel_last:
            axes = [i for i in range(dy.ndim - 1)]
        else:
            axes = [i for i in range(0, base_axis)] + \
                [i for i in range(base_axis + 1, dy.ndim)]
        db0 = F.sum(dy, axes, keepdims=False)
        return dx0, dw0, db0
    else:
        return dx0, dw0
Example #21
def sum(xs, shape=None, axis=None):
    assert (len(xs) > 0 or shape is not None)
    if len(xs) == 0:
        x = nn.Variable(shape)
        x.data.data = 0
        return x
    else:
        return F.sum(stack(xs), axis=axis)
Example #22
    def backward_impl(self, inputs, outputs, prop_down, accum):
        # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
        # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

        axis = self.forward_func.info.args["axis"]

        # Inputs
        x0 = inputs[0].data
        y0 = inputs[1].data
        dy = inputs[2].data
        # Outputs
        dx0 = outputs[0].data
        # Grads of inputs
        g_x0 = inputs[0].grad
        g_y0 = inputs[1].grad
        g_dy = inputs[2].grad
        # Grads of outputs
        g_dx0 = outputs[0].grad

        # w.r.t. x0
        if prop_down[0]:
            gdx_y = g_dx0 * y0
            gdx_dy_y = gdx_y * dy
            dy_y = dy * y0
            gdx_y_sum = F.sum(gdx_y, axis, True)
            dy_y_sum = F.sum(dy_y, axis, True)
            gdx_dy_y_sum = F.sum(gdx_dy_y, axis, True)

            t1 = gdx_dy_y
            t2 = y0 * gdx_dy_y_sum
            t3 = gdx_y_sum * (y0 * dy_y_sum - dy_y)
            t4 = dy_y_sum * (y0 * gdx_y_sum - gdx_y)

            g_x0_ = t1 - t2 + t3 + t4
            if accum[0]:
                g_x0 += g_x0_
            else:
                g_x0.copy_from(g_x0_)

        # w.r.t. dy
        if prop_down[2]:
            si = nn.Variable(x0.shape).apply(data=x0,
                                             grad=g_dy,
                                             need_grad=True)
            so = nn.Variable(dx0.shape).apply(data=y0, grad=g_dx0)
            self.forward_func.backward([si], [so], accum=[accum[2]])
Example #23
    def backward_impl(self, inputs, outputs, prop_down, accum):
        # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
        # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

        # Args
        axis = self.forward_func.info.args["axis"]
        # Inputs
        x0 = inputs[0].data  # logits
        t0 = inputs[1].data  # labels
        dz = inputs[2].data  # grad_input
        # Outputs
        dx0 = outputs[0].data
        dt0 = outputs[1].data
        # Grads of inputs
        g_x0 = inputs[0].grad
        g_t0 = inputs[1].grad
        g_dz = inputs[2].grad
        # Grads of outputs
        g_dx0 = outputs[0].grad
        g_dt0 = outputs[1].grad

        # Computation
        ## w.r.t. x0
        if prop_down[0]:
            # The gradient is the backward of softmax with (g_dx0 * dz) as the incoming gradient
            si = nn.Variable(x0.shape).apply(data=x0, need_grad=True)
            si.grad.fill(0.0)
            so = F.softmax(si, axis)
            if not nn.get_auto_forward():
                so.forward()
            so.backward(g_dx0 * dz, clear_buffer=False)
            g_x0_ = si.grad
            if accum[0]:
                g_x0 += g_x0_
            else:
                g_x0.copy_from(g_x0_)

        ## w.r.t. t0 is not required

        ## w.r.t. dz
        if prop_down[2]:
            # Unstable implementation since it uses `/ dz`
            ## g_dz_ = g_dx0 * dx0 / dz
            ## g_dz_ = F.sum(g_dz_, axis)

            shape = dz.shape if dz.shape != () else (1,)
            si = nn.Variable(x0.shape).apply(data=x0, need_grad=True)
            ti = nn.Variable(t0.shape).apply(data=t0)
            o = nn.Variable(shape)
            o.grad.fill(1.0)
            self.forward_func.backward([si, ti], [o], [False, False])

            # Sum g_dx0_i * (y_hat_i - y_i) over i
            g_dz_ = F.sum(g_dx0 * si.grad, axis)
            if accum[2]:
                g_dz += g_dz_
            else:
                g_dz.copy_from(g_dz_)
Example #24
def detect_keypoint(x, block_expansion, num_kp, num_channels, max_features,
                    num_blocks, temperature, estimate_jacobian=False, scale_factor=1,
                    single_jacobian_map=False, pad=0,
                    test=False, comm=None):

    if scale_factor != 1:
        x = anti_alias_interpolate(x, num_channels, scale_factor)

    with nn.parameter_scope("hourglass"):
        feature_map = hourglass(x, block_expansion, num_blocks=num_blocks,
                                max_features=max_features, test=test, comm=comm)

    with nn.parameter_scope("keypoint_detector"):
        inmaps, outmaps = feature_map.shape[1], num_kp
        k_w = I.calc_normal_std_he_forward(
            inmaps, outmaps, kernel=(7, 7)) / np.sqrt(2.)
        k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
        w_init = I.UniformInitializer((-k_w, k_w))
        b_init = I.UniformInitializer((-k_b, k_b))
        prediction = PF.convolution(feature_map, outmaps=num_kp,
                                    kernel=(7, 7), pad=(pad, pad),
                                    w_init=w_init, b_init=b_init)

    final_shape = prediction.shape

    heatmap = F.reshape(prediction, (final_shape[0], final_shape[1], -1))
    heatmap = F.softmax(heatmap / temperature, axis=2)
    heatmap = F.reshape(heatmap, final_shape, inplace=False)

    out = gaussian2kp(heatmap)  # {"value": value}, keypoint positions.

    if estimate_jacobian:
        if single_jacobian_map:
            num_jacobian_maps = 1
        else:
            num_jacobian_maps = num_kp

        with nn.parameter_scope("jacobian_estimator"):
            jacobian_map = PF.convolution(feature_map,
                                          outmaps=4*num_jacobian_maps,
                                          kernel=(7, 7), pad=(pad, pad),
                                          w_init=I.ConstantInitializer(0),
                                          b_init=np.array([1, 0, 0, 1]*num_jacobian_maps))

        jacobian_map = F.reshape(
            jacobian_map, (final_shape[0], num_jacobian_maps, 4, final_shape[2], final_shape[3]))
        heatmap = F.reshape(
            heatmap, heatmap.shape[:2] + (1,) + heatmap.shape[2:], inplace=False)

        jacobian = heatmap * jacobian_map
        jacobian = F.sum(jacobian, axis=(3, 4))
        jacobian = F.reshape(
            jacobian, (jacobian.shape[0], jacobian.shape[1], 2, 2), inplace=False)
        out['jacobian'] = jacobian  # jacobian near each keypoint.

    # out is a dictionary containing {"value": value, "jacobian": jacobian}

    return out
Example #25
def sr_loss_with_uncertainty(ctx, pred0, pred1, log_var0, log_var1):
    #TODO: squared error/absolute error
    s0 = F.exp(log_var0)
    s1 = F.exp(log_var1)
    squared_error = F.squared_error(pred0, pred1)
    b = pred0.shape[0]
    with nn.context_scope(ctx):
        loss_sr = F.sum(squared_error * (1 / s0 + 1 / s1) + (s0 / s1 + s1 / s0)) * 0.5 / b
    return loss_sr
Example #26
def er_loss(ctx, pred):
    with nn.context_scope(ctx):
        bs = pred.shape[0]
        d = np.prod(pred.shape[1:])
        denominator = bs * d
        pred_normalized = F.softmax(pred)
        pred_log_normalized = F.log(F.softmax(pred))
        loss_er = -F.sum(pred_normalized * pred_log_normalized) / denominator
    return loss_er
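A rough usage sketch (the shape and context are assumptions); the loss node can be attached to any logit variable:

import nnabla as nn

ctx = nn.Context()            # default CPU context; swap in a cuDNN context on GPU
pred = nn.Variable((16, 10))  # unnormalized logits
loss_er = er_loss(ctx, pred)  # scalar graph node; run loss_er.forward() after setting pred.d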
Example #27
def sum_grad_norm(params):
    norm = nn.NdArray()
    norm.zero()

    for p in params:
        assert isinstance(p, nn.Variable) and not p.grad.clear_called
        norm += F.sum(p.grad**2)

    return np.sqrt(norm.data)
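A sketch of reading the global gradient norm after a backward pass; the toy graph and parameter names below are made up:

import numpy as np
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF

x = nn.Variable((4, 8))
loss = F.mean(PF.affine(x, 1, name='fc') ** 2)
x.d = np.random.randn(*x.shape)
params = nn.get_parameters()
for p in params.values():
    p.grad.zero()  # start from zero so the accumulated gradients are well defined
loss.forward()
loss.backward()
print(sum_grad_norm(params.values()))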
Example #28
def er_loss(ctx, pred):
    with nn.context_scope(ctx):
        bs = pred.shape[0]
        d = np.prod(pred.shape[1:])
        denominator = bs * d
        pred_normalized = F.softmax(pred)
        pred_log_normalized = F.log(F.softmax(pred))
        loss_er = - F.sum(pred_normalized * pred_log_normalized) / denominator
    return loss_er
Example #29
    def _build(self):
        # infer variable
        self.infer_obs_t = nn.Variable((1, 4, 84, 84))
        # inference output
        self.infer_q_t = self.q_function(self.infer_obs_t,
                                         self.num_actions,
                                         scope='q_func')

        # train variables
        self.obss_t = nn.Variable((self.batch_size, 4, 84, 84))
        self.acts_t = nn.Variable((self.batch_size, 1))
        self.rews_tp1 = nn.Variable((self.batch_size, 1))
        self.obss_tp1 = nn.Variable((self.batch_size, 4, 84, 84))
        self.ters_tp1 = nn.Variable((self.batch_size, 1))
        self.weights = nn.Variable((self.batch_size, 1))

        # training output
        q_t = self.q_function(self.obss_t, self.num_actions, scope='q_func')
        q_tp1 = self.q_function(self.obss_tp1,
                                self.num_actions,
                                scope='target_q_func')

        # select the Q-value of the taken action via a one-hot mask
        a_t_one_hot = F.one_hot(self.acts_t, (self.num_actions, ))
        q_t_selected = F.sum(q_t * a_t_one_hot, axis=1, keepdims=True)
        q_tp1_best = F.max(q_tp1, axis=1, keepdims=True)

        # loss calculation
        y = self.rews_tp1 + self.gamma * q_tp1_best * (1.0 - self.ters_tp1)
        self.td = q_t_selected - y
        self.loss = F.sum(F.huber_loss(q_t_selected, y) * self.weights)
        self.loss_sink = F.sink(self.td, self.loss)

        # optimizer
        self.solver = S.RMSprop(self.lr, 0.95, 1e-2)

        # weights and biases
        with nn.parameter_scope('q_func'):
            self.params = nn.get_parameters()
        with nn.parameter_scope('target_q_func'):
            self.target_params = nn.get_parameters()

        # set q function parameters to solver
        self.solver.set_parameters(self.params)
Example #30
def _spectral_norm_outer_most_dim_backward(dw_sn, w, u, itr=1, eps=1e-12):
    # Forward recomputation

    w_shape = w.shape
    d0 = np.prod(w.shape[0:-1])  # In
    d1 = w.shape[-1]             # Out
    w = F.reshape(w, [d0, d1])
    u = F.reshape(u, [d1, 1])
    # Power method
    for _ in range(itr):
        # v
        v = F.affine(w, u)
        v = v / ((F.sum(v ** 2.0, keepdims=True) + eps) ** 0.5)
        v = F.reshape(v, [1, d0])
        # u
        u = F.affine(v, w)
        u = u / ((F.sum(u ** 2.0, keepdims=True) + eps) ** 0.5)
        u = F.reshape(u, [d1, 1])
    # No grad
    u = no_grad(u)
    v = no_grad(v)
    # Spectral normalization
    vw = F.affine(v, w)
    sigma = F.affine(vw, u)
    w_sn = w / sigma
    # The following step is not necessary for the gradient calculation
    # w_sn = F.reshape(w_sn, w_shape)

    # Backward for spectral norm
    dw_sn = dw_sn.reshape(w.shape)
    # Sum for broadcast backward
    S = sum_for_arithmetics(dw_sn * w_sn, sigma)
    # Add batch axis
    S = S.reshape((1,) + S.shape)
    u = u.reshape((1,) + u.shape)
    v = v.reshape((1,) + v.shape)
    m = F.batch_matmul(v, S, transpose_a=True)
    m = F.batch_matmul(m, u, transpose_b=True)
    # Remove batch axis
    m = m.reshape((m.shape[1], m.shape[2]))
    dw = (dw_sn - m) / sigma
    dw = dw.reshape(w_shape)

    return dw, None
Example #31
    def mask_weight(a, b):
        # Quite different from the definition in the paper
        merged_mask = F.concatenate(a, b, axis=1)
        summed_mask = F.sum((merged_mask + 1) / 2, axis=1, keepdims=True)
        clipped = F.clip_by_value(summed_mask,
                                  F.constant(0, shape=summed_mask.shape),
                                  F.constant(1, shape=summed_mask.shape))
        z = clipped * 2 - 1
        mask = (1 - z) / 2
        return mask
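Assuming `mask_weight` is reachable at module scope and both inputs are single-channel masks taking values in [-1, 1], it returns 1 where neither mask is active and 0 where at least one is:

import nnabla as nn

a = nn.Variable((4, 1, 64, 64))  # first mask in [-1, 1]
b = nn.Variable((4, 1, 64, 64))  # second mask in [-1, 1]
w = mask_weight(a, b)            # same shape, values in [0, 1]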
Example #32
    def call(self, input):
        if self._mode == 'full':
            out = F.stack(*[op(input) for op in self._ops], axis=0)
            out = F.mul2(out, F.softmax(self._alpha, axis=0))
            return F.sum(out, axis=0)

        # update active index
        self._update_active_index()

        return self._ops[self._active](input)
Example #33
def _calc_gradient_penalty(real, fake, discriminator):
    alpha = F.rand(shape=(1, 1, 1, 1))
    interpolates = alpha * real + (1.0 - alpha) * fake
    interpolates.need_grad = True

    disc_interpolates = discriminator(x=interpolates)

    grads = nn.grad([disc_interpolates], [interpolates])
    norms = [F.sum(g ** 2.0, axis=1) ** 0.5 for g in grads]
    return sum([F.mean((norm - 1.0) ** 2.0) for norm in norms])
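A usage sketch with a made-up stand-in discriminator (names and shapes are illustrative only):

import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF

def toy_discriminator(x):
    # Minimal stand-in for a real discriminator network.
    h = F.relu(PF.convolution(x, 8, (3, 3), pad=(1, 1), name='conv'))
    return PF.affine(h, 1, name='fc')

real = nn.Variable((4, 3, 32, 32))
fake = nn.Variable((4, 3, 32, 32))
gp = _calc_gradient_penalty(real, fake, toy_discriminator)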
Example #34
def gaussian2kp(heatmap):
    shape = heatmap.shape
    heatmap = F.reshape(heatmap, shape + (1, ), inplace=False)
    grid = make_coordinate_grid(shape[2:])
    grid = F.reshape(grid, (1, 1) + grid.shape)
    value = F.sum(heatmap * grid, axis=(2, 3))

    kp = {'value': value}

    return kp
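Shape sketch, assuming `make_coordinate_grid` from the same module returns an (H, W, 2) grid and the heatmap is spatially normalized:

import nnabla as nn

heatmap = nn.Variable((8, 10, 58, 58))  # (batch, num_kp, H, W), assumed to sum to 1 over H*W
kp = gaussian2kp(heatmap)
print(kp['value'].shape)                # (8, 10, 2): expected (x, y) per keypoint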
Example #35
        def callback(f):
            # Register this function the first time it is seen.
            self._add_key(f, self.key_to_stat_bwd)

            # Check whether the gradients of this function's inputs contain NaN values.
            nan = []
            if self.trace_nan:
                nan = [F.sum(F.isnan(i.grad)) for i in f.inputs]

            inf = []
            if self.trace_inf:
                inf = [F.sum(F.isinf(i.grad)) for i in f.inputs]

            self.key_to_stat_bwd[f].update({
                "inf": inf,
                "nan": nan,
                # rank might change between iterations.
                "rank": f.rank,
            })
Example #36
def kl_divergence(ctx, pred, label):
    with nn.context_scope(ctx):
        elms = F.softmax(label, axis=1) * F.log(F.softmax(pred, axis=1))
        loss = -F.mean(F.sum(elms, axis=1))
    return loss
Example #37
def ce_loss_soft(ctx, pred, target):
    with nn.context_scope(ctx):
        # TODO: divide or not
        loss = - F.mean(F.sum(F.softmax(target) * F.log(F.softmax(pred)), axis=1))
    return loss
Example #38
def main():

    # Get arguments
    args = get_args()
    data_file = "https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt"
    model_file = args.work_dir + "model.h5"

    # Load Dataset
    itow, wtoi, dataset = load_ptbset(data_file)

    # Computation environment settings
    from nnabla.contrib.context import extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # Create data provider
    n_word = len(wtoi)
    n_dim = args.embed_dim
    batchsize = args.batchsize
    half_window = args.half_window_length
    n_negative = args.n_negative_sample

    di = DataIteratorForEmbeddingLearning(
        batchsize=batchsize,
        half_window=half_window,
        n_negative=n_negative,
        dataset=dataset)

    # Create model
    # - Real batch size including context samples and negative samples
    size = batchsize * (1 + n_negative) * (2 * (half_window - 1))

    # Model for learning
    # - input variables
    xl = nn.Variable((size,))  # variable for word
    yl = nn.Variable((size,))  # variable for context

    # Embed layers for word embedding function
    # - f_embed: maps a word index x to y, an n_dim feature vector,
    # --  for each sample in a minibatch
    hx = PF.embed(xl, n_word, n_dim, name="e1")  # feature vector for word
    hy = PF.embed(yl, n_word, n_dim, name="e1")  # feature vector for context
    hl = F.sum(hx * hy, axis=1)

    # -- Approximated likelihood of context prediction
    # pos: word context, neg: negative samples
    tl = nn.Variable([size, ], need_grad=False)
    loss = F.sigmoid_cross_entropy(hl, tl)
    loss = F.mean(loss)

    # Model for test of searching similar words
    xr = nn.Variable((1,), need_grad=False)
    hr = PF.embed(xr, n_word, n_dim, name="e1")  # feature vector for test

    # Create solver
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    monitor = M.Monitor(args.work_dir)
    monitor_loss = M.MonitorSeries(
        "Training loss", monitor, interval=args.monitor_interval)
    monitor_time = M.MonitorTimeElapsed(
        "Training time", monitor, interval=args.monitor_interval)

    # Do training
    max_epoch = args.max_epoch
    for epoch in range(max_epoch):

        # iteration per epoch
        for i in range(di.n_batch):

            # get minibatch
            xi, yi, ti = di.next()

            # learn
            solver.zero_grad()
            xl.d, yl.d, tl.d = xi, yi, ti
            loss.forward(clear_no_need_grad=True)
            loss.backward(clear_buffer=True)
            solver.update()

            # monitor
            itr = epoch * di.n_batch + i
            monitor_loss.add(itr, loss.d)
            monitor_time.add(itr)

    # Save model
    nn.save_parameters(model_file)

    # Evaluate by similarity
    max_check_words = args.max_check_words
    for i in range(max_check_words):

        # prediction
        xr.d = i
        hr.forward(clear_buffer=True)
        h = hr.d

        # similarity calculation
        w = nn.get_parameters()['e1/embed/W'].d
        s = np.sqrt((w * w).sum(1))
        w /= s.reshape((s.shape[0], 1))
        similarity = w.dot(h[0]) / s[i]

        # for understanding
        output_similar_words(itow, i, similarity)