def bilinear_kernel(in_channels, out_channels, kernel_size):
    factor = (kernel_size + 1) // 2
    if kernel_size % 2 == 1:
        center = factor - 1
    else:
        center = factor - 0.5
    og = (np.arange(kernel_size).reshape(-1, 1),
          np.arange(kernel_size).reshape(1, -1))
    filt = (1 - np.abs(og[0] - center) / factor) * \
           (1 - np.abs(og[1] - center) / factor)
    weight = np.zeros((in_channels, out_channels, kernel_size, kernel_size))
    weight[range(in_channels), range(out_channels), :, :] = filt
    return np.array(weight)
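# A common way to use bilinear_kernel is to initialize a transposed convolution
# so that it performs bilinear upsampling. The layer sizes below are assumptions
# chosen for illustration; only the kernel itself comes from the function above.
conv_trans = gluon.nn.Conv2DTranspose(3, kernel_size=4, padding=1, strides=2)
conv_trans.initialize(init.Constant(bilinear_kernel(3, 3, 4)))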
def forward(self, X):
    X = self.dense(X)
    # Use the constant parameters, as well as the relu and dot functions
    X = npx.relu(np.dot(X, self.rand_weight.data()) + 1)
    # Reuse the fully-connected layer; this is equivalent to sharing parameters
    # between two fully-connected layers
    X = self.dense(X)
    # Control flow: halve the output until its L1 norm drops to 1 or below
    while np.abs(X).sum() > 1:
        X /= 2
    return X.sum()
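# The forward method above belongs to a Block that defines a `dense` sublayer
# and a constant `rand_weight` parameter elsewhere. A minimal sketch of such an
# enclosing class, following the common Gluon pattern for constant parameters
# (the class name and the layer sizes are assumptions, not taken from this file):
class FixedHiddenMLP(nn.Block):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Random weights created here are treated as a constant and are not
        # updated during training
        self.rand_weight = self.params.get_constant(
            'rand_weight', np.random.uniform(size=(20, 20)))
        self.dense = nn.Dense(20, activation='relu')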
def relative_position_bucket(relative_position,
                             bidirectional: bool = True,
                             num_buckets: int = 32,
                             max_distance: int = 128):
    """Map the relative position to buckets.

    The implementation is consistent with that in
    [mesh_tensorflow](https://github.com/tensorflow/mesh/blob/c59988047e49b4d2af05603e3170724cdbadc467/mesh_tensorflow/transformer/transformer_layers.py#L595-L637)
    where relative position is defined as `mem_i - query_j`. Thus, a positive value
    indicates that the memory slot is in a later timestamp than the query slot.

    After handling the bidirectional case (see below), the implementation uses the
    first half of the buckets to store exact differences and the second half to
    store the differences after a logarithmic transformation.

    Parameters
    ----------
    relative_position
        Shape (...,)
    bidirectional
        Whether we are dealing with bidirectional attention.
        If it's bidirectional, positive shifts are mapped to [0, num_buckets // 2),
        and negative shifts are mapped to [num_buckets // 2, num_buckets).
    num_buckets
        The number of buckets.
    max_distance
        Maximum distance. Positions that fall outside of 'max_distance' will be trimmed.

    Returns
    -------
    buckets
        Shape (...,). It has the same shape as the `relative_position`.
        It will have int32 type.
    """
    ret = 0
    relative_position = -relative_position
    if bidirectional:
        assert num_buckets % 2 == 0, 'When bidirectional is True, the number of buckets ' \
                                     'must be divisible by 2.'
        num_buckets //= 2
        ret = ret + (relative_position < 0).astype(np.int32) * num_buckets
        relative_position = np.abs(relative_position)
    else:
        # Clip all the negative values to 0
        relative_position = np.clip(relative_position, a_min=0, a_max=None)
    # Now, the relative_position is in the range [0, inf)

    # Half of the buckets deal with the exact increments,
    # i.e., 0, 1, 2, ..., max_exact - 1, where max_exact = num_buckets // 2
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact
    # The other half of the buckets are for logarithmically bigger bins in positions
    # up to max_distance
    val_if_large = max_exact + (
        np.log(relative_position.astype(np.float32) / max_exact)
        / math.log(max_distance / max_exact) * (num_buckets - max_exact)).astype(np.int32)
    val_if_large = np.minimum(val_if_large, num_buckets - 1)
    ret = ret + np.where(is_small, relative_position, val_if_large)
    return ret
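# A hypothetical usage sketch: build a (query_len, mem_len) matrix of relative
# positions defined as mem_i - query_j (the convention in the docstring above)
# and map it to buckets. The lengths below are made up for illustration.
query_len, mem_len = 4, 6
relative_position = (np.arange(mem_len).reshape(1, -1)
                     - np.arange(query_len).reshape(-1, 1))
buckets = relative_position_bucket(relative_position, bidirectional=True,
                                   num_buckets=32, max_distance=128)
# buckets has shape (4, 6), with bucket indices in [0, num_buckets)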
def test_abs():
    A = np.ones((INT_OVERFLOW, 2))
    A[0][0] = -1
    A.attach_grad()
    with mx.autograd.record():
        B = np.abs(A)
    assert B.shape == (INT_OVERFLOW, 2)
    assert B[0][0] == 1
    B.backward()
    assert A.grad.shape == (INT_OVERFLOW, 2)
    assert A.grad[0][0] == -1
def test_abs():
    # abs, absolute, and fabs are the same operation
    inp = np.zeros((INT_OVERFLOW, 2))
    inp[-1, -1] = -1
    inp.attach_grad()
    with mx.autograd.record():
        out = np.abs(inp)
        out.backward()
    assert out.shape == (INT_OVERFLOW, 2)
    assert out[-1, -1] == 1
    assert inp.grad.shape == (INT_OVERFLOW, 2)
    assert inp.grad[-1, -1] == -1
def test_contrib_intgemm_maxabsolute(shape):
    if "intgemm_maxabsolute" not in dir(mx.nd.contrib):
        return
    # mx.nd API
    m = mx.nd.random_uniform(low=-100.0, high=100.0, shape=shape)
    fast = mx.nd.contrib.intgemm_maxabsolute(m)
    slow = mx.nd.max(mx.nd.abs(m))
    assert same(fast, slow)
    # np API
    m = np.random.uniform(low=-100.0, high=100.0, size=shape)
    fast = npx.intgemm_maxabsolute(m).reshape(())
    slow = np.max(np.abs(m))
    assert same(fast, slow)
def train(lambd):
    # w and b already have gradients attached
    w, b = init_params()
    net, loss = lambda X: d2l_dx.linreg(X, w, b), d2l_dx.squared_loss
    num_epochs, lr = 100, 0.003
    for epoch in range(1, num_epochs + 1):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y) + lambd * l2_penalty(w)
            l.backward()
            d2l_dx.sgd([w, b], lr, batch_size)
        if epoch % 5 == 0:
            train_loss = d2l_dx.evaluate_loss(net, data_iter=train_iter, loss=loss)
            test_loss = d2l_dx.evaluate_loss(net, data_iter=test_iter, loss=loss)
            print("epochs: {}, training loss: {}, test loss: {}".format(
                epoch, train_loss, test_loss))
    print('L1 norm of w:', np.abs(w).sum())
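# The training loop above relies on helpers defined elsewhere (init_params,
# l2_penalty, train_iter, test_iter, batch_size). For reference, a minimal
# sketch of the L2 penalty term under the usual convention (half the squared
# L2 norm of the weights); the name and the 1/2 factor are assumptions here:
def l2_penalty(w):
    return (w ** 2).sum() / 2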
def train_gluon(wd):
    net = gluon.nn.Sequential()
    net.add(gluon.nn.Dense(1))
    net.initialize(init.Normal(sigma=1))
    loss = gluon.loss.L2Loss()
    num_epochs, lr = 100, 0.003
    trainer_w = gluon.Trainer(net.collect_params('.*weight'), 'sgd',
                              {'learning_rate': lr, 'wd': wd})
    trainer_b = gluon.Trainer(net.collect_params('.*bias'), 'sgd',
                              {'learning_rate': lr})
    for epoch in range(1, num_epochs + 1):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer_w.step(batch_size)
            trainer_b.step(batch_size)
        if epoch % 5 == 0:
            train_loss = d2l_dx.evaluate_loss(net, data_iter=train_iter, loss=loss)
            test_loss = d2l_dx.evaluate_loss(net, data_iter=test_iter, loss=loss)
            print("epochs: {}, training loss: {}, test loss: {}".format(
                epoch, train_loss, test_loss))
    print('L1 norm of w: ', np.abs(net[0].weight.data()).sum())
def norms():
    u = np.array([3, -4])
    # l_2 (euclidean)
    np.linalg.norm(u)
    # l_1
    np.abs(u).sum()
def bbox_eval(bbox_preds, bbox_labels, bbox_masks):
    return float((np.abs((bbox_labels - bbox_preds) * bbox_masks)).sum())
def corr2d_multi_in_out_1x1(X, K):
    c_i, h, w = X.shape
    c_o = K.shape[0]
    X = X.reshape((c_i, h * w))
    K = K.reshape((c_o, c_i))
    # Matrix multiplication in the fully-connected layer
    Y = np.dot(K, X)
    return Y.reshape((c_o, h, w))

X = np.random.normal(0, 1, (3, 3, 3))
K = np.random.normal(0, 1, (2, 3, 1, 1))
Y1 = corr2d_multi_in_out_1x1(X, K)
Y2 = corr2d_multi_in_out(X, K)
assert float(np.abs(Y1 - Y2).sum()) < 1e-6

# Exercise 6
def corr2d_multi_in_out_2x2(X, K):
    c_i, h, w = X.shape
    c_o, c_ii, kh, kw = K.shape
    assert c_ii == c_i, "Kernel channel dimensions don't match input"
    X = X.reshape((c_i, h * w))
    K = K.reshape((c_o, kh * kw, c_i))
    # Matrix multiplication in the fully-connected layer
    Y = np.dot(K, X)
    # input channel dimension
    Y = Y.sum(axis=2)
    return Y.reshape((c_o, kh, kw))

K = np.random.normal(0, 1, (2, 3, 2, 2))
# we can begin to understand matrix-vector products
A.shape, x.shape, np.dot(A, x)

############### 2.3.9. Matrix-Matrix Multiplication ###############
# if you have gotten the hang of dot products and matrix-vector products,
# then matrix-matrix multiplication should be straightforward.
B = np.ones(shape=(4, 3))
np.dot(A, B)

############### 2.3.10. Norms ###############
u = np.array([3, -4])
np.linalg.norm(u)
np.abs(u).sum()
np.linalg.norm(np.ones((4, 9)))

############### 2.3.10.1. Norms and Objectives ###############
# while we do not want to get too far ahead of ourselves, we can plant some
# intuition already about why these concepts are useful
# in deep learning, we are often trying to solve optimization problems

############### 2.3.11. More on Linear Algebra ###############
# in just this section, we have taught you all the linear algebra that you
# will need to understand a remarkable chunk of modern deep learning
def tv_loss(Y_hat):
    return 0.5 * (np.abs(Y_hat[:, :, 1:, :] - Y_hat[:, :, :-1, :]).mean() +
                  np.abs(Y_hat[:, :, :, 1:] - Y_hat[:, :, :, :-1]).mean())
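# A small usage sketch for tv_loss: it expects an image batch in NCHW layout
# and returns a scalar total-variation penalty on differences between
# neighboring pixels. The shape below is made up for illustration.
Y_hat = np.random.normal(0, 1, (1, 3, 32, 32))
tv = tv_loss(Y_hat)  # scalar; larger values indicate a noisier image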