def rnn(inputs, hidden_states, params):
    '''
    inputs shape: (num_steps[seq-len], batch_size, vocab_size)
    return: Ys, (H,), hidden_states, outputs
    '''
    W_xh, W_hh, b_h, W_ho, b_o = params
    H, = hidden_states
    outputs = []
    hidden_states = []
    # X shape: (batch_size, vocab_size)
    print(f"rnn loops {inputs.shape[0]} times along seq_length axis---------\n")
    i = 1
    for X in inputs:  # loop along num_steps (the sequence-length axis)
        print(f"loops {i} times \n")
        i += 1
        H = mxnp.dot(X, W_xh) + mxnp.dot(H, W_hh) + b_h
        H = mxnp.tanh(H)
        hidden_states.append(H)
        print(f"---rnn input(X,H) and weights' shape---------\n"
              f"   ---X.shape={X.shape}, W_xh.shape={W_xh.shape}\n"
              f"   ---H.shape={H.shape}, W_hh.shape={W_hh.shape}, b_h.shape={b_h.shape}\n"
              f"   ---W_ho.shape={W_ho.shape}, b_o.shape={b_o.shape}\n")
        Y = mxnp.dot(H, W_ho) + b_o
        outputs.append(Y)
        print(f"---rnn output's shape---------\n"
              f"   ---Y.shape={Y.shape}, H.shape={H.shape}\n")
    Ys = mxnp.concatenate(outputs, axis=0)
    print(f"Final Ys.shape={Ys.shape}")
    return Ys, (H,), hidden_states, outputs
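# A minimal usage sketch for the rnn function above (not from the original
# notebook): the sizes, the random parameter initialization, and the `mxnp`
# alias are assumptions made only to show the expected shapes.
from mxnet import np as mxnp, npx
npx.set_np()

batch_size, num_steps, vocab_size, num_hiddens = 2, 5, 28, 512
W_xh = mxnp.random.normal(scale=0.01, size=(vocab_size, num_hiddens))
W_hh = mxnp.random.normal(scale=0.01, size=(num_hiddens, num_hiddens))
b_h = mxnp.zeros(num_hiddens)
W_ho = mxnp.random.normal(scale=0.01, size=(num_hiddens, vocab_size))
b_o = mxnp.zeros(vocab_size)
params = [W_xh, W_hh, b_h, W_ho, b_o]

inputs = mxnp.random.normal(size=(num_steps, batch_size, vocab_size))
H0 = mxnp.zeros((batch_size, num_hiddens))
Ys, (H,), hidden_states, outputs = rnn(inputs, (H0,), params)
# Expected: Ys.shape == (num_steps * batch_size, vocab_size)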
def forward(self, x):
    if self._mode == 'erf':
        return npx.leaky_relu(x, act_type='gelu')
    elif self._mode == 'tanh':
        return 0.5 * x * (1.0 + np.tanh(
            math.sqrt(2.0 / math.pi) * (x + 0.044715 * (x ** 3))))
    elif self._mode == 'sigmoid':
        return x * npx.sigmoid(1.702 * x)
    else:
        raise NotImplementedError
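# Hedged sketch (not part of the original class): the three GELU variants from
# `forward` rewritten as standalone functions so they can be compared directly.
import math
from mxnet import np, npx
npx.set_np()

def gelu_erf(x):
    # Exact GELU via MXNet's built-in kernel.
    return npx.leaky_relu(x, act_type='gelu')

def gelu_tanh(x):
    # tanh approximation.
    return 0.5 * x * (1.0 + np.tanh(
        math.sqrt(2.0 / math.pi) * (x + 0.044715 * x ** 3)))

def gelu_sigmoid(x):
    # sigmoid approximation.
    return x * npx.sigmoid(1.702 * x)

x = np.arange(-4.0, 4.0, 0.5)
print(np.abs(gelu_tanh(x) - gelu_erf(x)).max())
print(np.abs(gelu_sigmoid(x) - gelu_erf(x)).max())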
def forward(self, queries, keys, values, valid_lens):
    queries, keys = self.W_q(queries), self.W_k(keys)
    # After dimension expansion, shape of `queries`: (`batch_size`, no. of
    # queries, 1, `num_hiddens`) and shape of `keys`: (`batch_size`, 1,
    # no. of key-value pairs, `num_hiddens`). Sum them up with broadcasting
    features = np.expand_dims(queries, axis=2) + np.expand_dims(keys, axis=1)
    features = np.tanh(features)
    # There is only one output of `self.w_v`, so we remove the last
    # one-dimensional entry from the shape. Shape of `scores`:
    # (`batch_size`, no. of queries, no. of key-value pairs)
    scores = np.squeeze(self.w_v(features), axis=-1)
    self.attention_weights = masked_softmax(scores, valid_lens)
    # Shape of `values`: (`batch_size`, no. of key-value pairs, value
    # dimension)
    return npx.batch_dot(self.dropout(self.attention_weights), values)
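# Hedged illustration (not from the original code): the broadcasting trick used
# above, with random tensors and made-up sizes for batch, queries, keys, hiddens.
from mxnet import np, npx
npx.set_np()

batch_size, num_queries, num_kv, num_hiddens = 2, 1, 10, 8
queries = np.random.normal(size=(batch_size, num_queries, num_hiddens))
keys = np.random.normal(size=(batch_size, num_kv, num_hiddens))
# (batch, q, 1, h) + (batch, 1, kv, h) -> (batch, q, kv, h)
features = np.expand_dims(queries, axis=2) + np.expand_dims(keys, axis=1)
print(features.shape)  # (2, 1, 10, 8)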
def forward(self, X, state):
    w_ih, w_ir, w_iu, w_hh, w_hr, w_hu, b_h, b_r, b_u, w_ho, b_o = self.params
    state = state[0]
    outputs = []
    for x in X:
        r = npx.sigmoid(x @ w_ir + state @ w_hr + b_r)  # reset gate
        u = npx.sigmoid(x @ w_iu + state @ w_hu + b_u)  # update gate
        hr = state * r  # reset the hidden state
        # Candidate hidden state: the input x and the reset hidden state hr,
        # each multiplied by its own weight matrix
        h_tilda = mxnp.tanh(x @ w_ih + hr @ w_hh + b_h)
        # Update: interpolate between the old hidden state and the candidate
        state = state * u + h_tilda * (1 - u)
        y = state @ w_ho + b_o  # compute the output
        outputs.append(y)
    return mxnp.concatenate(outputs, axis=0), (state,)
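# Hedged usage sketch (not from the original code): assumed parameter shapes for
# the GRU `forward` above, plus one call routed through a throwaway holder object
# standing in for `self`. All sizes are made up for illustration.
from types import SimpleNamespace
from mxnet import np as mxnp, npx
npx.set_np()

vocab_size, num_hiddens, batch_size, num_steps = 28, 32, 2, 5

def normal(shape):
    return mxnp.random.normal(scale=0.01, size=shape)

params = [
    normal((vocab_size, num_hiddens)),   # w_ih
    normal((vocab_size, num_hiddens)),   # w_ir
    normal((vocab_size, num_hiddens)),   # w_iu
    normal((num_hiddens, num_hiddens)),  # w_hh
    normal((num_hiddens, num_hiddens)),  # w_hr
    normal((num_hiddens, num_hiddens)),  # w_hu
    mxnp.zeros(num_hiddens),             # b_h
    mxnp.zeros(num_hiddens),             # b_r
    mxnp.zeros(num_hiddens),             # b_u
    normal((num_hiddens, vocab_size)),   # w_ho
    mxnp.zeros(vocab_size),              # b_o
]

holder = SimpleNamespace(params=params)  # stands in for `self`
X = mxnp.random.normal(size=(num_steps, batch_size, vocab_size))
state = (mxnp.zeros((batch_size, num_hiddens)),)
Y, new_state = forward(holder, X, state)  # assumes the GRU forward is in scope
print(Y.shape)  # (num_steps * batch_size, vocab_size)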
# Plot the ReLU function
x = np.arange(-8.0, 8.0, 0.1)
x.attach_grad()
with autograd.record():
    y = npx.relu(x)
d2l.plot(x, y, 'x', 'relu(x)', figsize=(5, 2.5))

y.backward()
d2l.plot(x, x.grad, 'x', 'grad of relu', figsize=(5, 2.5))

# Plot the sigmoid function
with autograd.record():
    y = npx.sigmoid(x)
d2l.plot(x, y, 'x', 'sigmoid(x)', figsize=(5, 2.5))

y.backward()
d2l.plot(x, x.grad, 'x', 'grad of sigmoid', figsize=(5, 2.5))

# Plot the tanh function
with autograd.record():
    y = np.tanh(x)  # npx does not have a tanh function
d2l.plot(x, y, 'x', 'tanh(x)', figsize=(5, 2.5))

y.backward()
d2l.plot(x, x.grad, 'x', 'grad of tanh', figsize=(5, 2.5))

# Calculate the derivative of the pReLU activation function
# with autograd.record():
#     y = npx.relu(x) + 0.01 * min(0, x)
# d2l.plot(x, y, 'x', 'prelu(x)', figsize=(5, 2.5))
# y.backward()
# d2l.plot(x, x.grad, 'x', 'grad of relu', figsize=(5, 2.5))
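# Hedged sketch (not in the original): a working version of the commented-out
# pReLU block above, using np.minimum so the negative branch works elementwise
# on arrays; the 0.01 slope follows the commented-out code.
with autograd.record():
    y = npx.relu(x) + 0.01 * np.minimum(0, x)
d2l.plot(x, y, 'x', 'prelu(x)', figsize=(5, 2.5))

y.backward()
d2l.plot(x, x.grad, 'x', 'grad of prelu', figsize=(5, 2.5))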
W1 = np.array([[0.9, 0.3], [-0.7, 0.3]])
W2 = np.array([-0.3, -0.9])
b1 = np.array([0.9, -0.7])
b2 = np.array([-0.7])
params = [W1, b1, W2, b2]
for param in params:
    param.attach_grad()

X = np.array([[1, 1], [0, 1], [0, 0], [1, 0]])
X.attach_grad()
y_true = np.array([1, -1, 1, -1])

for i in range(len(y_true)):
    with autograd.record():
        H = np.tanh(np.dot(W1, X[i]) + b1)
        O = np.tanh(np.dot(W2, H) + b2)
        L = (y_true[i] - O) ** 2
        L = 1/2 * L
    L.backward()
    W1 -= W1.grad
    W2 -= W2.grad
    b1 -= b1.grad
    b2 -= b2.grad
    print('iteration:', i + 1)
    print('true label', y_true[i])
    print('input', X[i])
    print('predicted label:', O)
    print('updated W1', W1)
    print('updated b1', b1)
    print('updated W2', W2)
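# Hedged follow-up (not part of the original exercise): re-run the forward pass
# on all four inputs with the updated weights to inspect the final predictions.
for i in range(len(y_true)):
    H = np.tanh(np.dot(W1, X[i]) + b1)
    O = np.tanh(np.dot(W2, H) + b2)
    print('input', X[i], 'true label', y_true[i], 'prediction', O)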
import math
from mxnet import np, npx, gluon, autograd
from mxnet.gluon import nn
from d2l import mxnet as d2l
npx.set_np()

# For now, only for example No. 1
W1 = np.array([[0.9, 0.3, 0.9], [-0.7, 0.3, -0.7]])
W2 = np.array([-0.3, -0.9, -0.7])
b1 = np.array([1])
b2 = np.array([1])
params = [W1, b1, W2, b2]
for param in params:
    param.attach_grad()

X = np.array([1, 1, 1])
X.attach_grad()
y_true = np.array([1])

with autograd.record():
    H = np.tanh(np.dot(W1, X))
    O = np.tanh(np.dot(W2, np.append(H, np.array([1]))))
    L = (y_true - O) ** 2
    L = 1 / 2 * L
L.backward()

print('predicted value:', O)
print('updated W1', W1 - W1.grad)
print('updated W2', W2 - W2.grad)
import math
from mxnet import np, npx, gluon, autograd
from mxnet.gluon import nn
from d2l import mxnet as d2l
npx.set_np()

initial_w1 = np.array([[0.9, 0.3, 0.9], [-0.7, 0.3, -0.7]])
initial_w2 = np.array([-0.3, -0.9, -0.7])
true_labels = np.array([1, -1, 1, -1])
inputs = np.array([[1, 1, 1], [0, 1, 1], [0, 0, 1], [1, 0, 1]])

# Prediction for example No. 1; below, h_n_m denotes layer m of example No. n
h_1_1 = np.dot(initial_w1, inputs[0])
z_1_1 = np.append(np.tanh(h_1_1), np.array([1]))
z_1_3 = np.dot(initial_w2, z_1_1)
y_hat_1 = np.tanh(z_1_3)
y_hat_1

# Loss for example No. 1
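# Hedged completion (the original cell stops at the comment above): the squared
# loss used in the previous cells, 1/2 * (y - y_hat)^2, applied to example No. 1.
loss_1 = 1 / 2 * (true_labels[0] - y_hat_1) ** 2
print('loss for No. 1:', loss_1)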