Example #1
def min_backward(inputs,
                 axes=None,
                 keep_dims=False,
                 with_index=False,
                 only_index=False):
    """
    Args:
      inputs (list of nn.Variable): Incomming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]
    y0 = get_output(x0, "Min")
    # Broadcast the reduced output and the incoming gradient back to the input shape.
    if keep_dims:
        y0 = F.broadcast(y0, x0.shape)
        dy = F.broadcast(dy, x0.shape)
    else:
        axes = list(range(x0.ndim)) if axes is None else force_list(axes)
        shape = [1 if i in axes else s for i, s in enumerate(x0.shape)]
        y0 = F.broadcast(F.reshape(y0, shape, inplace=False), x0.shape)
        dy = F.broadcast(F.reshape(dy, shape, inplace=False), x0.shape)
    # Route the gradient only to the positions that attain the minimum.
    m0 = F.equal(x0, y0)
    m0 = no_grad(m0)
    dx0 = dy * m0
    if not with_index and not only_index:
        return dx0
    elif with_index:
        return dx0, None
    elif only_index:
        return None
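
A minimal, self-contained sketch (assuming only that nnabla and numpy are installed) of the masking idea this backward relies on: the gradient of a min reduction reaches only the positions that attain the minimum, which is exactly what the F.equal mask encodes.

import numpy as np
import nnabla as nn
import nnabla.functions as F

x = nn.Variable.from_numpy_array(
    np.array([[3.0, 1.0, 2.0],
              [4.0, 6.0, 5.0]], dtype=np.float32))
x.need_grad = True
y = F.min(x, axis=1)
y.forward()
x.grad.zero()
y.backward()
print(x.g)  # nonzero only at the row-wise minima: positions (0, 1) and (1, 0)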
Example #2
def transformer(train=True, dropout_ratio=0.1):
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    with nn.parameter_scope('embedding_layer'):
        # h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
        h = token_embedding(x, vocab_size, embedding_size)
    h = position_encoding(h)

    if train:
        h = F.dropout(h, p=dropout_ratio)

    for i in range(hopping_num):
        with nn.parameter_scope(f'encoder_hopping_{i}'):
            h = residual_normalization_wrapper(multihead_self_attention)(
                h,
                head_num,
                mask=mask,
                train=train,
                dropout_ratio=dropout_ratio)
            h = residual_normalization_wrapper(positionwise_feed_forward)(
                h, train=train, dropout_ratio=dropout_ratio)

    with nn.parameter_scope('output_layer'):
        # Classify from the hidden state at the first position of the sequence.
        y = F.sigmoid(PF.affine(h[:, 0, :], 1))

    accuracy = F.mean(F.equal(F.round(y), t))
    loss = F.mean(F.binary_cross_entropy(y, t))

    return x, y, t, accuracy, loss
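
The get_mask helper used here (and in the examples below) is not shown. A plausible minimal version, assuming x carries token ids with 0 as padding, could look like the sketch below; this is an assumption about the helper, not the original code.

def get_mask(x):
    # Hypothetical sketch: 1.0 for real tokens, 0.0 for padding (token id 0),
    # shaped (batch_size, max_len, 1) so it broadcasts over the embedding axis.
    batch_size, max_len = x.shape
    return F.reshape(F.greater_scalar(x, 0), (batch_size, max_len, 1))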
Example #3
def build_self_attention_model(train=True):
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    attention_mask = (F.constant(1, shape=mask.shape) - mask) * F.constant(
        np.finfo(np.float32).min, shape=mask.shape)
    with nn.parameter_scope('embedding'):
        h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
    with nn.parameter_scope('forward'):
        h_f = lstm(h,
                   hidden_size,
                   mask=mask,
                   return_sequences=True,
                   return_state=False)
    with nn.parameter_scope('backward'):
        h_b = lstm(h[:, ::-1],
                   hidden_size,
                   mask=mask,
                   return_sequences=True,
                   return_state=False)[:, ::-1]
    h = F.concatenate(h_f, h_b, axis=2)
    if train:
        h = F.dropout(h, p=dropout_ratio)
    with nn.parameter_scope('da'):
        a = F.tanh(time_distributed(PF.affine)(h, da))
        if train:
            a = F.dropout(a, p=dropout_ratio)
    with nn.parameter_scope('r'):
        a = time_distributed(PF.affine)(a, r)
        if train:
            a = F.dropout(a, p=dropout_ratio)
        a = F.softmax(a + attention_mask, axis=1)
    # Attention-weighted combination of the BiLSTM states (one row per attention hop).
    m = F.batch_matmul(a, h, transpose_a=True)
    with nn.parameter_scope('output_mlp'):
        output = F.relu(PF.affine(m, output_mlp_size))
        if train:
            output = F.dropout(output, p=dropout_ratio)
    with nn.parameter_scope('output'):
        y = F.sigmoid(PF.affine(output, 1))

    accuracy = F.mean(F.equal(F.round(y), t))
    # Binary cross-entropy plus a penalty that discourages the attention hops
    # from overlapping (Frobenius norm of A^T A - I).
    loss = F.mean(F.binary_cross_entropy(
        y, t)) + attention_penalty_coef * frobenius(
            F.batch_matmul(a, a, transpose_a=True) - batch_eye(batch_size, r))
    return x, t, accuracy, loss
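
The frobenius and batch_eye helpers used in the penalty term are also defined elsewhere. The sketch below shows plausible implementations, assuming a has shape (batch_size, max_len, r) and the usual numpy/nnabla imports; it should not be read as the original code.

def batch_eye(batch_size, size):
    # Hypothetical: a (batch_size, size, size) stack of identity matrices.
    eye = nn.Variable.from_numpy_array(np.eye(size, dtype=np.float32))
    return F.broadcast(F.reshape(eye, (1, size, size)),
                       (batch_size, size, size))


def frobenius(matrices):
    # Hypothetical: mean Frobenius norm over a batch of matrices.
    return F.mean(F.sum(matrices ** 2, axis=[1, 2]) ** 0.5)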
Example #4
def _nms(heat, kernel=3):
    # Keep only local maxima of the heatmap: stride-1 max pooling spreads each
    # neighbourhood's maximum, and F.equal keeps positions whose value equals it.
    pad = (kernel - 1) // 2
    hmax = F.max_pooling(heat, (kernel, kernel), stride=(1, 1), pad=(pad, pad))
    keep = F.equal(hmax, heat)
    return heat * keep
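
A small runnable check of the effect (assuming the usual imports of numpy as np, nnabla as nn and nnabla.functions as F): only heatmap values that equal their local 3x3 maximum survive the mask.

heat_np = np.zeros((1, 1, 5, 5), dtype=np.float32)
heat_np[0, 0, 2, 2] = 1.0   # peak
heat_np[0, 0, 2, 3] = 0.4   # weaker neighbour, suppressed by the mask
heat = nn.Variable.from_numpy_array(heat_np)
peaks = _nms(heat, kernel=3)
peaks.forward()
print(peaks.d[0, 0])        # only the value at (2, 2) remains nonzero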
Example #5
def global_average_pooling_1d(x, mask):
    # Average over the time axis, counting only non-padded positions.
    count = F.sum(mask, axis=1)
    global_average_pooled = F.sum(x, axis=1) / count
    return global_average_pooled


x = nn.Variable((batch_size, max_len))
t = nn.Variable((batch_size, 1))
mask = get_mask(x)
with nn.parameter_scope('embedding'):
    h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
h = global_average_pooling_1d(h, mask)
with nn.parameter_scope('output'):
    y = F.sigmoid(PF.affine(h, 1))

accuracy = F.mean(F.equal(F.round(y), t))
loss = F.mean(F.binary_cross_entropy(y, t))

# Create solver.
solver = S.Adam()
solver.set_parameters(nn.get_parameters())

trainer = Trainer(inputs=[x, t],
                  loss=loss,
                  metrics={
                      'cross entropy': loss,
                      'accuracy': accuracy
                  },
                  solver=solver)
trainer.run(train_data_iter, dev_data_iter, epochs=5, verbose=1)
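
If the custom Trainer helper is not available, the run it wraps corresponds roughly to the plain nnabla loop below (a sketch that assumes train_data_iter is an nnabla data iterator yielding (x, t) batches; evaluation on dev_data_iter is omitted).

for epoch in range(5):
    for _ in range(train_data_iter.size // batch_size):
        # Feed the next batch into the input variables and take one Adam step.
        x.d, t.d = train_data_iter.next()
        loss.forward()
        solver.zero_grad()
        loss.backward()
        solver.update()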