Example #1
def bilinear_kernel(in_channels, out_channels, kernel_size):
    # Build an (in_channels, out_channels, k, k) weight array whose spatial slice
    # is a bilinear interpolation kernel, used to initialize transposed convolutions
    factor = (kernel_size + 1) // 2
    if kernel_size % 2 == 1:
        center = factor - 1
    else:
        center = factor - 0.5
    og = (np.arange(kernel_size).reshape(-1, 1),
          np.arange(kernel_size).reshape(1, -1))
    filt = (1 - np.abs(og[0] - center) / factor) * \
           (1 - np.abs(og[1] - center) / factor)
    weight = np.zeros((in_channels, out_channels, kernel_size, kernel_size))
    weight[range(in_channels), range(out_channels), :, :] = filt
    return np.array(weight)
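A minimal usage sketch (not part of the original snippet; it assumes MXNet Gluon with `npx.set_np()` enabled, as in d2l): the kernel above is typically used to initialize a transposed convolution that performs bilinear upsampling.

from mxnet import init
from mxnet.gluon import nn

# Double the spatial resolution of a 3-channel input via bilinear upsampling
conv_trans = nn.Conv2DTranspose(3, kernel_size=4, padding=1, strides=2)
conv_trans.initialize(init.Constant(bilinear_kernel(3, 3, 4)))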
Example #2
def forward(self, X):
    X = self.dense(X)
    # Use the constant (non-trainable) random weights, add a bias of 1, apply ReLU
    X = npx.relu(np.dot(X, self.rand_weight.data()) + 1)
    # Reuse the same dense layer, i.e. share its parameters
    X = self.dense(X)
    # Control flow: halve X until its l1 norm is at most 1
    while np.abs(X).sum() > 1:
        X /= 2
    return X.sum()
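For context, a sketch of the kind of Gluon Block this method usually belongs to (hedged; modeled on d2l's FixedHiddenMLP, not taken from the original snippet):

from mxnet import np, npx
from mxnet.gluon import nn
npx.set_np()

class FixedHiddenMLP(nn.Block):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Constant (non-trainable) random weights consumed by forward() above
        self.rand_weight = self.params.get_constant(
            'rand_weight', np.random.uniform(size=(20, 20)))
        self.dense = nn.Dense(20, activation='relu')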
Example #3
def relative_position_bucket(relative_position,
                             bidirectional: bool = True,
                             num_buckets: int = 32,
                             max_distance: int = 128):
    """Map the relative position to buckets. The implementation is consistent with that
    in [mesh_tensorflow](https://github.com/tensorflow/mesh/blob/c59988047e49b4d2af05603e3170724cdbadc467/mesh_tensorflow/transformer/transformer_layers.py#L595-L637)
    where relative position is defined as `mem_i - query_j`. Thus, a positive value indicates 
    that the memory slot is in a later timestamp than the query slot. 

    After handling the bidirectional case (see below), the implementation uses the first half
    of the buckets to store exact differences and the second half to store differences after
    a logarithmic transformation.

    Parameters
    ----------
    relative_position
        Shape (...,)
    bidirectional
        Whether we are dealing with bidirectional attention.
        If it's bidirectional, positive shifts are mapped to [0, num_buckets // 2),
        and negative shifts are mapped to [num_buckets // 2, num_buckets).
    num_buckets
        The number of buckets.
    max_distance
        Maximum distance. Positions that fall beyond `max_distance` are clipped to the last bucket.

    Returns
    -------
    buckets
        Shape (...,).
        It has the same shape as the `relative_position`. It will have int32 type.
    """
    ret = 0
    relative_position = -relative_position
    if bidirectional:
        assert num_buckets % 2 == 0, 'When bidirectional is True, the number of buckets must be ' \
                                     'divisible by 2.'
        num_buckets //= 2
        ret = ret + (relative_position < 0).astype(np.int32) * num_buckets
        relative_position = np.abs(relative_position)
    else:
        # Clip all the negative values to 0
        relative_position = np.clip(relative_position, a_min=0, a_max=None)
    # Now, the relative_position is in the range [0, inf)

    # Half of the buckets deal with the exact increments,
    # i.e., 0, 1, 2, ..., max_exact - 1, where max_exact = num_buckets // 2
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact

    # The other half of the buckets are for logarithmically bigger bins in positions up to
    # max_distance
    val_if_large = max_exact + (
        np.log(relative_position.astype(np.float32) / max_exact) /
        math.log(max_distance / max_exact) *
        (num_buckets - max_exact)).astype(np.int32)
    val_if_large = np.minimum(val_if_large, num_buckets - 1)
    ret = ret + np.where(is_small, relative_position, val_if_large)
    return ret
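A small usage sketch (not from the original source; it assumes the same `math` and MXNet `np` imports the function itself relies on):

rel_pos = np.arange(-8, 9, dtype=np.int32)  # relative positions mem_i - query_j
buckets = relative_position_bucket(rel_pos, bidirectional=True,
                                   num_buckets=32, max_distance=128)
assert buckets.shape == rel_pos.shape  # one bucket id per relative position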
Example #4
def test_abs():
    # INT_OVERFLOW is a large-tensor dimension constant defined by the surrounding test suite
    A = np.ones((INT_OVERFLOW, 2))
    A[0][0] = -1
    A.attach_grad()
    with mx.autograd.record():
        B = np.abs(A)
    assert B.shape == (INT_OVERFLOW, 2)
    assert B[0][0] == 1
    B.backward()
    assert A.grad.shape == (INT_OVERFLOW, 2)
    assert A.grad[0][0] == -1
Example #5
def test_abs():
    # abs, absolute, and fabs are aliases for the same operator
    inp = np.zeros((INT_OVERFLOW, 2))
    inp[-1, -1] = -1
    inp.attach_grad()
    with mx.autograd.record():
        out = np.abs(inp)
        out.backward()
    assert out.shape == (INT_OVERFLOW, 2)
    assert out[-1, -1] == 1
    assert inp.grad.shape == (INT_OVERFLOW, 2)
    assert inp.grad[-1, -1] == -1
Example #6
def test_contrib_intgemm_maxabsolute(shape):
    if "intgemm_maxabsolute" not in dir(mx.nd.contrib):
        # Skip when MXNet was built without intgemm support
        return
    # mx.nd API
    m = mx.nd.random_uniform(low=-100.0, high=100.0, shape=shape)
    fast = mx.nd.contrib.intgemm_maxabsolute(m)
    slow = mx.nd.max(mx.nd.abs(m))
    assert same(fast, slow)
    # np API
    m = np.random.uniform(low=-100.0, high=100.0, size=shape)
    fast = npx.intgemm_maxabsolute(m).reshape(())
    slow = np.max(np.abs(m))
    assert same(fast, slow)
Example #7
def train(lambd):
    # w and b already have gradients attached (see init_params)
    w, b = init_params()
    net, loss = lambda X: d2l_dx.linreg(X, w, b), d2l_dx.squared_loss
    num_epochs, lr = 100, 0.003

    for epoch in range(1, num_epochs + 1):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y) + lambd * l2_penalty(w)
            l.backward()
            d2l_dx.sgd([w, b], lr, batch_size)
        if epoch % 5 == 0:
            train_loss = d2l_dx.evaluate_loss(net, data_iter=train_iter, loss=loss)
            test_loss = d2l_dx.evaluate_loss(net, data_iter=test_iter, loss=loss)
            print("epochs: {}, training loss: {}, test loss: {}".format(epoch, train_loss, test_loss))
    print('L1 norm of w:', np.abs(w).sum())
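The `l2_penalty` helper called above is not shown in this snippet; a minimal sketch following the usual d2l definition:

def l2_penalty(w):
    return (w ** 2).sum() / 2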
Example #8
def train_gluon(wd):
    net = gluon.nn.Sequential()
    net.add(gluon.nn.Dense(1))
    net.initialize(init.Normal(sigma=1))
    loss = gluon.loss.L2Loss()
    num_epochs, lr = 100, 0.003

    # Apply weight decay to the weights only; the bias is updated without decay
    trainer_w = gluon.Trainer(net.collect_params('.*weight'), 'sgd', {'learning_rate': lr, 'wd': wd})
    trainer_b = gluon.Trainer(net.collect_params('.*bias'), 'sgd', {'learning_rate': lr})

    for epoch in range(1, num_epochs + 1):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer_w.step(batch_size)
            trainer_b.step(batch_size)
        if epoch % 5 == 0:
            train_loss = d2l_dx.evaluate_loss(net, data_iter=train_iter, loss=loss)
            test_loss = d2l_dx.evaluate_loss(net, data_iter=test_iter, loss=loss)
            print("epochs: {}, training loss: {}, test loss: {}".format(epoch, train_loss, test_loss))
    print('L1 norm of w: ', np.abs(net[0].weight.data()).sum())
Example #9
def norms():
    u = np.array([3, -4])
    # l_2 (Euclidean) norm: sqrt(3**2 + (-4)**2) = 5
    np.linalg.norm(u)
    # l_1 norm: |3| + |-4| = 7
    np.abs(u).sum()
Example #10
def bbox_eval(bbox_preds, bbox_labels, bbox_masks):
    # Sum of absolute errors of the predicted box offsets, with masked-out anchors zeroed
    return float((np.abs((bbox_labels - bbox_preds) * bbox_masks)).sum())
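A tiny usage sketch (hypothetical shapes, not from the original; box offsets are flattened to (batch, num_anchors * 4) as in the d2l detection pipeline):

bbox_preds = np.zeros((2, 8))
bbox_labels = np.ones((2, 8))
bbox_masks = np.ones((2, 8))  # 1 keeps an anchor, 0 ignores it
print(bbox_eval(bbox_preds, bbox_labels, bbox_masks))  # 16.0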
Example #11
def corr2d_multi_in_out_1x1(X, K):
    c_i, h, w = X.shape
    c_o = K.shape[0]
    X = X.reshape((c_i, h * w))
    K = K.reshape((c_o, c_i))
    Y = np.dot(K, X)  # Matrix multiplication in the fully-connected layer
    return Y.reshape((c_o, h, w))

X = np.random.normal(0, 1, (3, 3, 3))
K = np.random.normal(0, 1, (2, 3, 1, 1))


Y1 = corr2d_multi_in_out_1x1(X, K)
Y2 = corr2d_multi_in_out(X, K)
assert float(np.abs(Y1 - Y2).sum()) < 1e-6


# Exercise 6
def corr2d_multi_in_out_2x2(X, K):
    c_i, h, w = X.shape
    c_o, c_ii, kh, kw = K.shape
    assert c_ii == c_i, "Kernel channel dimensions don't match input"
    X = X.reshape((c_i, h * w))
    K = K.reshape((c_o, kh * kw, c_i))
    Y = np.dot(K, X)  # Shape (c_o, kh * kw, h * w)
    Y = Y.sum(axis=2)  # Sum over the flattened spatial dimension
    return Y.reshape((c_o, kh, kw))


K = np.random.normal(0, 1, (2, 3, 2, 2))
Example #12
# we can begin to understand matrix-vector products
# (A and x come from earlier in the section, e.g. A = np.arange(20).reshape(5, 4) and x = np.arange(4))
A.shape, x.shape, np.dot(A, x)



############### 2.3.9. Matrix-Matrix Multiplication ###############
# if you have gotten the hang of dot products and matrix-vector products, then matrix-matrix multiplication should be straightforward.
B = np.ones(shape=(4, 3))
np.dot(A, B)



############### 2.3.10. Norms ###############
u = np.array([3, -4])
np.linalg.norm(u)  # l_2 norm = 5
np.abs(u).sum()  # l_1 norm = 7
np.linalg.norm(np.ones((4, 9)))  # Frobenius norm = sqrt(4 * 9) = 6



############### 2.3.10.1. Norms and Objectives ###############
# while we do not want to get too far ahead of ourselves, we can already plant some intuition about why these concepts are useful
# in deep learning, we are often trying to solve optimization problems



############### 2.3.11. More on Linear Algebra ###############
# in just this section, we have taught you all the linear algebra that you will need to understand a remarkable chunk of modern deep learning


Example #13
def tv_loss(Y_hat):
    # Total variation loss: penalizes differences between neighboring pixels (NCHW layout)
    return 0.5 * (np.abs(Y_hat[:, :, 1:, :] - Y_hat[:, :, :-1, :]).mean() +
                  np.abs(Y_hat[:, :, :, 1:] - Y_hat[:, :, :, :-1]).mean())
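A brief usage sketch (assumed values, not from the original): `tv_loss` expects images in (batch, channel, height, width) layout.

from mxnet import np, npx
npx.set_np()

Y_hat = np.random.normal(0, 1, (1, 3, 64, 64))  # a synthesized image batch
print(tv_loss(Y_hat))  # scalar total-variation penalty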