def __init__(self, num_hiddens, dropout, max_len=1000):
    super(PositionalEncoding, self).__init__()
    self.dropout = nn.Dropout(dropout)
    # Create a long enough `P`
    self.P = np.zeros((1, max_len, num_hiddens))
    X = np.arange(0, max_len).reshape(-1, 1) / np.power(
        10000, np.arange(0, num_hiddens, 2) / num_hiddens)
    self.P[:, :, 0::2] = np.sin(X)
    self.P[:, :, 1::2] = np.cos(X)
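# A matching `forward` pass (a sketch following the usual d2l pattern; it
# assumes the class subclasses `nn.Block` and that inputs have shape
# (batch_size, num_steps, num_hiddens)):
def forward(self, X):
    # Slice `P` to the actual sequence length, add it, then apply dropout
    X = X + self.P[:, :X.shape[1], :].as_in_ctx(X.ctx)
    return self.dropout(X)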
def test_power():
    A = np.full((2, INT_OVERFLOW), 2)
    B = np.ones((2, INT_OVERFLOW))
    B[-1, -1] = 3
    A.attach_grad()
    B.attach_grad()
    with mx.autograd.record():
        C = np.power(A, B)
    C.backward()
    assert C.shape == A.shape
    assert C[-1, -1] == 8
    assert A.grad.shape == A.shape
    assert A.grad[-1, -1] == 12
    assert B.grad.shape == B.shape
    assert_almost_equal(B.grad[-1, -1], 2**3 * np.log(2), rtol=1e-5,
                        atol=1e-5)
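# A small-scale, self-contained sanity check of the same gradients
# (hypothetical sizes, not the nightly large-tensor test). For C = A**B:
#   dC/dA = B * A**(B-1) = 3 * 2**2 = 12
#   dC/dB = A**B * ln(A) = 8 * ln(2) ~= 5.545
from mxnet import autograd, np, npx
npx.set_np()

a = np.full((2, 2), 2.0)
b = np.ones((2, 2))
b[-1, -1] = 3
a.attach_grad()
b.attach_grad()
with autograd.record():
    c = np.power(a, b)
c.backward()
print(c[-1, -1], a.grad[-1, -1], b.grad[-1, -1])  # 8.0 12.0 ~5.545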
def get_positional_embeddings(length, depth) -> np.ndarray:
    utils.check_condition(depth % 2 == 0,
                          "Positional embeddings require an even embedding "
                          "size; got %d." % depth)
    # (1, depth/2)
    channels = np.arange(depth // 2).reshape((1, -1))
    # (length, 1)
    positions = np.arange(0, length).reshape((-1, 1))
    scaled_positions = positions / np.power(10000, (2 * channels) / depth)
    # sinusoids:
    sin = np.sin(scaled_positions)
    # cosines:
    cos = np.cos(scaled_positions)
    # concatenate sin and cos halves: (length, depth)
    encodings = np.hstack([sin, cos])
    return encodings
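# Quick shape check (a standalone sketch in plain NumPy, with an `assert`
# standing in for `utils.check_condition`):
import numpy as np

def _pos_emb(length, depth):
    assert depth % 2 == 0, "depth must be even"
    channels = np.arange(depth // 2).reshape((1, -1))
    positions = np.arange(length).reshape((-1, 1))
    scaled = positions / np.power(10000, (2 * channels) / depth)
    return np.hstack([np.sin(scaled), np.cos(scaled)])

emb = _pos_emb(length=4, depth=6)
print(emb.shape)  # (4, 6): first 3 columns are sines, last 3 are cosines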
def forward(self, data):
    # RMS-style normalization: scale by the root mean square over the
    # last axis (the mean is not subtracted, unlike standard LayerNorm)
    var = np.power(data, 2).mean(-1, keepdims=True)
    data = data * np.reciprocal(np.sqrt(var + self._epsilon))
    return data * self.gamma.data() + self.beta.data()
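# The same computation in plain NumPy (a sketch; `gamma`, `beta`, and `eps`
# stand in for the layer's parameters). The formula is
#   y = x / sqrt(mean(x**2) + eps) * gamma + beta
import numpy as np

def rms_norm(x, gamma, beta, eps=1e-6):
    var = np.power(x, 2).mean(-1, keepdims=True)
    return x / np.sqrt(var + eps) * gamma + beta

x = np.random.randn(2, 4)
print(rms_norm(x, np.ones(4), np.zeros(4)).shape)  # (2, 4)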
def forward(self, x):
    # Compute the second moment in float32 for numerical stability, even
    # when the inputs arrive in float16
    var = np.power(x.astype('float32'), 2).mean(-1, keepdims=True)
    x = x * np.reciprocal(np.sqrt(var + self.variance_epsilon))
    # Cast back to half precision when the scale parameter is float16
    if self.gemma.dtype == 'float16':
        x = x.astype('float16')
    return self.gemma * x
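# Why the square is taken in float32: float16 overflows at ~65504, so
# x**2 can hit inf for moderately large activations. A minimal
# illustration in plain NumPy:
import numpy as np

x16 = np.array([300.0], dtype=np.float16)
print(np.power(x16, 2))                      # [inf]: 90000 overflows float16
print(np.power(x16.astype(np.float32), 2))   # [90000.]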
import d2l_dx
from mxnet import autograd, gluon, np, npx
from mxnet.gluon import nn

npx.set_np()

maxdegree = 20
n_train, n_test = 100, 100
true_w = np.zeros(maxdegree)
true_w[0:4] = np.array([5.0, 1.2, -3.4, 5.6])

features = np.random.normal(size=(n_train + n_test, 1))
# `np.random.shuffle` shuffles in place and returns None; assigning its
# result back (`features = np.random.shuffle(features)`) would leave
# `features` as None
np.random.shuffle(features)
power = np.arange(maxdegree).reshape(1, -1)
poly_features = np.power(features, power)
poly_features = poly_features / (
    npx.gamma(np.arange(maxdegree) + 1).reshape(1, -1))
labels = np.dot(poly_features, true_w)
labels += np.random.normal(scale=0.1, size=labels.shape)

print(poly_features.shape, true_w.shape, labels.shape)
print("----------------------")
print(poly_features[0], true_w)
print("----------------------")
print(features[0])
print("----------------------")
print(poly_features[0])
print("----------------------")
print(labels[0])
import math
from mxnet import np, npx, gluon
from mxnet.gluon import nn
from d2l import mxnet as d2l

npx.set_np()

max_degree = 20  # Maximum degree of the polynomial
n_train, n_test = 100, 100  # Training and test dataset sizes
true_w = np.zeros(max_degree)  # Allocate lots of empty space
true_w[0:4] = np.array([5, 1.2, -3.4, 5.6])

features = np.random.normal(size=(n_train + n_test, 1))
np.random.shuffle(features)
poly_features = np.power(features, np.arange(max_degree).reshape(1, -1))
for i in range(max_degree):
    poly_features[:, i] /= math.gamma(i + 1)  # `gamma(n)` = (n-1)!
# Shape of `labels`: (`n_train` + `n_test`,)
labels = np.dot(poly_features, true_w)
labels += np.random.normal(scale=0.1, size=labels.shape)

def evaluate_loss(net, data_iter, loss):  #@save
    """Evaluate the loss of a model on the given dataset."""
    metric = d2l.Accumulator(2)  # Sum of losses, no. of examples
    for X, y in data_iter:
        l = loss(net(X), y)
        metric.add(l.sum(), l.size)
    return metric[0] / metric[1]
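# A minimal training sketch for this data (an assumed usage, not part of
# the original listing): fit a bias-free linear layer on the first four
# polynomial features, matching the non-zero entries of `true_w`.
from mxnet import autograd

batch_size = 10
train_iter = d2l.load_array((poly_features[:n_train, :4],
                             labels[:n_train].reshape(-1, 1)), batch_size)
net = nn.Sequential()
net.add(nn.Dense(1, use_bias=False))  # column 0 is the constant term x^0/0!
net.initialize()
loss = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})
for epoch in range(100):
    for X, y in train_iter:
        with autograd.record():
            l = loss(net(X), y)
        l.backward()
        trainer.step(batch_size)
print(evaluate_loss(net, train_iter, loss))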