def lstm(_inputs, initial_state_h, initial_state_c, *parameters):
    # _inputs: a list with length num_steps,
    # corresponding element: batch_size * input_dim matrix
    H = initial_state_h  # hidden state
    C = initial_state_c  # memory cell
    [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o,
     W_xc, W_hc, b_c, W_hy, b_y] = parameters
    _outputs = []
    for X in _inputs:
        # compute INPUT gate from input and last/initial hidden state
        input_gate = nd.sigmoid(nd.dot(X, W_xi) + nd.dot(H, W_hi) + b_i)
        # compute FORGET gate from input and last/initial hidden state
        forget_gate = nd.sigmoid(nd.dot(X, W_xf) + nd.dot(H, W_hf) + b_f)
        # compute OUTPUT gate from input and last/initial hidden state
        output_gate = nd.sigmoid(nd.dot(X, W_xo) + nd.dot(H, W_ho) + b_o)
        # compute memory cell candidate from input and last/initial hidden state
        memory_cell_candidate = nd.tanh(nd.dot(X, W_xc) + nd.dot(H, W_hc) + b_c)
        # compute memory cell from last memory cell and memory cell candidate
        C = forget_gate * C + input_gate * memory_cell_candidate
        # compute hidden state from output gate and memory cell
        H = output_gate * nd.tanh(C)
        # compute output from hidden state
        Y = nd.dot(H, W_hy) + b_y
        _outputs.append(Y)
    return _outputs, H, C
def lstm(inputs, state, params):
    # Two-layer LSTM. inputs and outputs are both lists of num_steps matrices
    # of shape (batch_size, vocab_size).
    [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c,
     W_xi2, W_hi2, b_i2, W_xf2, W_hf2, b_f2, W_xo2, W_ho2, b_o2,
     W_xc2, W_hc2, b_c2, W_hq, b_q] = params
    (H1, C1, H2, C2) = state
    outputs = []
    for X in inputs:
        # first layer: gates from the input and the previous layer-1 hidden state
        I = nd.sigmoid(nd.dot(X, W_xi) + nd.dot(H1, W_hi) + b_i)
        F = nd.sigmoid(nd.dot(X, W_xf) + nd.dot(H1, W_hf) + b_f)
        O = nd.sigmoid(nd.dot(X, W_xo) + nd.dot(H1, W_ho) + b_o)
        C_tilda = nd.tanh(nd.dot(X, W_xc) + nd.dot(H1, W_hc) + b_c)
        C1 = F * C1 + I * C_tilda
        H1 = C1.tanh() * O
        # second layer: takes the layer-1 hidden state as input and its own
        # previous hidden state as the recurrent input
        I2 = nd.sigmoid(nd.dot(H1, W_xi2) + nd.dot(H2, W_hi2) + b_i2)
        F2 = nd.sigmoid(nd.dot(H1, W_xf2) + nd.dot(H2, W_hf2) + b_f2)
        O2 = nd.sigmoid(nd.dot(H1, W_xo2) + nd.dot(H2, W_ho2) + b_o2)
        C_tilda2 = nd.tanh(nd.dot(H1, W_xc2) + nd.dot(H2, W_hc2) + b_c2)
        C2 = F2 * C2 + I2 * C_tilda2
        H2 = C2.tanh() * O2
        Y = nd.dot(H2, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H1, C1, H2, C2)
def nodeforward(self, x, cs, hs, ctx):
    # x: input vector for this tree node; cs/hs: memory cells and hidden
    # states of the child nodes.
    x = nd.reshape(x, (self.dim_h, ))
    _Ui = nd.zeros((self.dim_h, ), ctx=ctx)
    _Uo = nd.zeros((self.dim_h, ), ctx=ctx)
    _Uu = nd.zeros((self.dim_h, ), ctx=ctx)
    _Uf = [nd.zeros((self.dim_h, ), ctx=ctx) for i in range(len(cs))]
    # accumulate the children's contributions to each gate
    for idx in range(len(cs)):
        _Ui = nd.add(_Ui, nd.dot(self.Uis[idx].data(), hs[idx]))
        _Uo = nd.add(_Uo, nd.dot(self.Uos[idx].data(), hs[idx]))
        _Uu = nd.add(_Uu, nd.dot(self.Uus[idx].data(), hs[idx]))
        for j in range(len(cs)):
            _Uf[idx] = nd.add(_Uf[idx], nd.dot(self.Ufs[idx][j].data(), hs[j]))
    i = nd.sigmoid(
        nd.add(nd.add(nd.dot(self.Wi.data(), x), _Ui), self.bi.data()))
    o = nd.sigmoid(
        nd.add(nd.add(nd.dot(self.Wo.data(), x), _Uo), self.bo.data()))
    f = [
        nd.sigmoid(
            nd.add(nd.add(nd.dot(self.Wf.data(), x), _Uf[idx]), self.bf.data()))
        for idx in range(len(cs))
    ]
    u = nd.tanh(
        nd.add(nd.add(nd.dot(self.Wu.data(), x), _Uu), self.bu.data()))
    # memory cell: per-child forget gates applied to the children's cells,
    # plus the gated candidate
    c = nd.zeros((self.dim_h, ), ctx=ctx)
    for idx in range(len(cs)):
        c = nd.add(c, nd.multiply(f[idx], cs[idx]))
    c = nd.add(nd.multiply(i, u), c)
    h = nd.multiply(o, nd.tanh(c))
    return c, h
def forward(self, input_data):
    freq = input_data[:, 0:2].expand_dims(1)
    input_data = input_data[:, 2:]
    e1_vec_start = FIXED_WORD_LENGTH * DIMENSION
    x = input_data[:, :e1_vec_start].reshape(
        (input_data.shape[0], FIXED_WORD_LENGTH, DIMENSION))  # (m, 60, 110)
    e1neimask = input_data[:, e1_vec_start:e1_vec_start + MASK_LENGTH]  # (m, 51)
    e1edge = input_data[:, e1_vec_start + MASK_LENGTH:
                        e1_vec_start + MASK_LENGTH + ENTITY_EDGE_VEC_LENGTH].reshape(
        (input_data.shape[0], ENTITY_DEGREE, WORD_DIMENSION * 2))  # (m, 51, 200)
    e1neigh = e1edge[:, :, :WORD_DIMENSION]
    e2_vec_start = e1_vec_start + MASK_LENGTH + ENTITY_EDGE_VEC_LENGTH
    e2neimask = input_data[:, e2_vec_start:e2_vec_start + MASK_LENGTH]  # (m, 51)
    e2edge = input_data[:, e2_vec_start + MASK_LENGTH:
                        e2_vec_start + MASK_LENGTH + ENTITY_EDGE_VEC_LENGTH].reshape(
        (input_data.shape[0], ENTITY_DEGREE, WORD_DIMENSION * 2))  # (m, 51, 200)
    e2neigh = e2edge[:, :, :WORD_DIMENSION]

    gru = self.gru
    x = nd.transpose(x, axes=(1, 0, 2))
    h = gru(x)
    ht = nd.transpose(h, axes=(1, 0, 2))
    gru_out = self.gru_out
    y1 = gru_out(ht.expand_dims(1))  # (m, 200)

    att = self.center_att
    e1edge = nd.tanh(e1edge)
    e1g = att(e1edge) * freq[:, :, :1]  # (m, 51, 1)
    e1g = e1g * e1neimask.expand_dims(2)
    e1g = nd.softmax(e1g, axis=1)
    e1gt = nd.transpose(e1g, axes=(0, 2, 1))  # (m, 1, 51)
    e1n = nd.batch_dot(e1gt, e1neigh)  # (m, 1, 100)
    e1n = e1n.reshape((e1n.shape[0], 100))  # (m, 100)
    e2edge = nd.tanh(e2edge)
    e2g = att(e2edge) * freq[:, :, 1:]  # (m, 51, 1)
    e2g = e2g * e2neimask.expand_dims(2)
    e2g = nd.softmax(e2g, axis=1)
    e2gt = nd.transpose(e2g, axes=(0, 2, 1))  # (m, 1, 51)
    e2n = nd.batch_dot(e2gt, e2neigh)  # (m, 1, 100)
    e2n = e2n.reshape((e2n.shape[0], 100))  # (m, 100)
    center_y = nd.concat(e1n, e2n, dim=1)  # (m, 200)
    center_out = self.center_out
    center_y = center_out(center_y)

    out = self.output
    y4 = nd.concat(y1, center_y, dim=1)
    y5 = out(y4)
    return y5
def forward(self, x):
    if self.dependent_G:
        g = nd.sigmoid(nd.dot(x, self.G.data()))
    else:
        g = nd.sigmoid(self.G.data())
    W0 = nd.tanh(self.W0_hat.data()) * nd.sigmoid(self.M0_hat.data())
    W1 = nd.tanh(self.W1_hat.data()) * nd.sigmoid(self.M1_hat.data())
    a = nd.dot(x, W0)
    m = nd.exp(nd.dot(nd.log(nd.abs(x) + 1e-10), W1))
    y = g * a + (1 - g) * m
    return y
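# A tiny numeric illustration (not part of the original module) of how the gate
# above blends the additive path a with the multiplicative path m recovered via
# exp(log|x| . W1); the values of x, W and g below are arbitrary assumptions.
from mxnet import nd

x = nd.array([[2.0, 3.0]])
W = nd.array([[1.0], [1.0]])                       # stands in for both W0 and W1
a = nd.dot(x, W)                                   # additive path: 2 + 3 = 5
m = nd.exp(nd.dot(nd.log(nd.abs(x) + 1e-10), W))   # multiplicative path: 2 * 3 = 6
g = nd.array([[0.5]])
print((g * a + (1 - g) * m).asscalar())            # 0.5 * 5 + 0.5 * 6 = 5.5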
def lstm(inputs, state, params):
    W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c, W_hq, b_q = params
    H, C = state
    outputs = []
    for X in inputs:
        I = nd.sigmoid(nd.dot(X, W_xi) + nd.dot(H, W_hi) + b_i)
        F = nd.sigmoid(nd.dot(X, W_xf) + nd.dot(H, W_hf) + b_f)
        O = nd.sigmoid(nd.dot(X, W_xo) + nd.dot(H, W_ho) + b_o)
        C_ = nd.tanh(nd.dot(X, W_xc) + nd.dot(H, W_hc) + b_c)
        C = F * C + I * C_
        H = O * nd.tanh(C)
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H, C)
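# A minimal call of the lstm above, assuming `nd` is mxnet.ndarray; the sizes and
# the random initialisation are illustrative assumptions, not from the original.
from mxnet import nd

batch_size, num_steps, vocab_size, num_hiddens = 2, 5, 27, 64

def _gate_params():
    # one gate: input-to-hidden weight, hidden-to-hidden weight, bias
    return (nd.random.normal(scale=0.01, shape=(vocab_size, num_hiddens)),
            nd.random.normal(scale=0.01, shape=(num_hiddens, num_hiddens)),
            nd.zeros(num_hiddens))

params = [*_gate_params(), *_gate_params(), *_gate_params(), *_gate_params(),  # i, f, o, candidate
          nd.random.normal(scale=0.01, shape=(num_hiddens, vocab_size)),       # W_hq
          nd.zeros(vocab_size)]                                                 # b_q
state = (nd.zeros((batch_size, num_hiddens)), nd.zeros((batch_size, num_hiddens)))
inputs = [nd.one_hot(nd.array([0, 1]), vocab_size) for _ in range(num_steps)]
outputs, (H, C) = lstm(inputs, state, params)
print(len(outputs), outputs[0].shape, H.shape, C.shape)  # 5 (2, 27) (2, 64) (2, 64)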
def squash_policy(mu, pi, logp_pi):
    def clip_pass_gradient(x, l=-1., u=1.):
        clip_up = nd.cast(x > u, "float32")
        clip_low = nd.cast(x < l, "float32")
        return x + nd.stop_gradient((u - x) * clip_up + (l - x) * clip_low)

    mu = nd.tanh(mu)
    pi = nd.tanh(pi)
    # avoid machine precision error, clip 1 - pi**2 to [0, 1]
    logp_pi = logp_pi - nd.sum(
        nd.log(clip_pass_gradient(1 - nd.square(pi), l=0, u=1) + 1e-6), axis=1)
    return mu, pi, logp_pi
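# A short usage sketch for squash_policy with made-up Gaussian policy outputs;
# it only illustrates that the squashed actions land in (-1, 1) and that logp_pi
# picks up the tanh change-of-variables correction. All values are assumptions.
from mxnet import nd

mu = nd.array([[0.2, -1.5, 3.0], [0.0, 0.5, -2.0]])   # pre-squash means
pi = mu + 0.1 * nd.random.normal(shape=mu.shape)       # sampled pre-squash actions
logp_pi = nd.array([-1.2, -0.8])                       # pre-squash log-probabilities
mu_s, pi_s, logp_s = squash_policy(mu, pi, logp_pi)
print(pi_s)      # every entry lies in (-1, 1)
print(logp_s)    # corrected log-probabilities, one per sample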
def gru(_inputs, initial_state, *parameters):
    # _inputs: a list with length num_steps,
    # corresponding element: batch_size * input_dim matrix
    H = initial_state
    [W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hy, b_y] = parameters
    _outputs = []
    for X in _inputs:
        # compute update gate from input and last/initial hidden state
        update_gate = nd.sigmoid(nd.dot(X, W_xz) + nd.dot(H, W_hz) + b_z)
        # compute reset gate from input and last/initial hidden state
        reset_gate = nd.sigmoid(nd.dot(X, W_xr) + nd.dot(H, W_hr) + b_r)
        # compute candidate hidden state from input, reset gate and last/initial hidden state
        H_candidate = nd.tanh(nd.dot(X, W_xh) + reset_gate * nd.dot(H, W_hh) + b_h)
        # compute hidden state from candidate hidden state and last hidden state
        H = update_gate * H + (1 - update_gate) * H_candidate
        # compute output from hidden state
        Y = nd.dot(H, W_hy) + b_y
        _outputs.append(Y)
    return _outputs, H
def rnn(inputs, h, w_xh, w_hh, b_h, w_hy, b_y):
    output = []
    for x in inputs:
        h = nd.tanh(nd.dot(x, w_xh) + nd.dot(h, w_hh) + b_h)
        y = nd.dot(h, w_hy) + b_y
        output.append(y)
    return (output, h)
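# A minimal call of the rnn above; the sizes, the random weights and the one-hot
# inputs are illustrative assumptions, not taken from the original code.
from mxnet import nd

batch_size, num_steps, vocab_size, num_hiddens = 2, 4, 10, 8
w_xh = nd.random.normal(scale=0.01, shape=(vocab_size, num_hiddens))
w_hh = nd.random.normal(scale=0.01, shape=(num_hiddens, num_hiddens))
b_h = nd.zeros(num_hiddens)
w_hy = nd.random.normal(scale=0.01, shape=(num_hiddens, vocab_size))
b_y = nd.zeros(vocab_size)
h0 = nd.zeros((batch_size, num_hiddens))
inputs = [nd.one_hot(nd.array([1, 2]), vocab_size) for _ in range(num_steps)]
outputs, h = rnn(inputs, h0, w_xh, w_hh, b_h, w_hy, b_y)
print(len(outputs), outputs[0].shape, h.shape)  # 4 (2, 10) (2, 8)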
def forward_single(self, feature, data, begin_state):
    """ unroll one step

    Parameters
    ----------
    feature: a NDArray with shape [n, d].
    data: a NDArray with shape [n, b, d].
    begin_state: a NDArray with shape [n, b, d].

    Returns
    -------
    output: output of the cell, which is a NDArray with shape [n, b, d]
    states: a list of hidden states (list of hidden units with shape [n, b, d]) of RNNs.
    """
    if begin_state is None:
        num_nodes, batch_size, _ = data.shape
        begin_state = [nd.zeros((num_nodes, batch_size, self.hidden_size),
                                ctx=feature.context)]

    prev_state = begin_state[0]
    data_and_state = nd.concat(data, prev_state, dim=-1)
    z = nd.sigmoid(self.dense_z(feature, data_and_state))
    r = nd.sigmoid(self.dense_r(feature, data_and_state))

    state = z * prev_state + (1 - z) * nd.tanh(
        self.dense_i2h(feature, data) + self.dense_h2h(feature, r * prev_state))

    return state, [state]
def forward(self, x):
    x = nd.pick(x, nd.broadcast_to(self._dim.data(), x.shape[0]), keepdims=True)
    x -= self._split.data()
    x *= nd.relu(self._sharpness.data())
    return nd.tanh(x)
def forward(self, query, values, head=False):
    """
    Compute the attention weights and the attention output vector.

    :param query: the query, i.e. the decoder input at the current step
    :param values: the values, i.e. the encoder output at every time step
    :param head: if True, sum over the head axis instead of the time axis
    :return: (attention output vector, attention weights)
    """
    hidden_with_time_axis = nd.expand_dims(query, 1)
    score = self.V(nd.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
    attention_weights = nd.softmax(score, axis=1)
    context_vector = attention_weights * values
    if head is True:
        context_vector = nd.sum(context_vector, axis=2)
    else:
        context_vector = nd.sum(context_vector, axis=1)
    context_vector = nd.expand_dims(context_vector, axis=0)
    return context_vector, attention_weights
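# A standalone shape walk-through of the additive-attention score used above,
# assuming W1 and W2 are Dense(units, flatten=False) and V is Dense(1, flatten=False)
# as the calls imply; all sizes are illustrative assumptions.
from mxnet import nd
from mxnet.gluon import nn

batch, seq_len, hidden, units = 4, 7, 16, 10
W1, W2, V = nn.Dense(units, flatten=False), nn.Dense(units, flatten=False), nn.Dense(1, flatten=False)
for layer in (W1, W2, V):
    layer.initialize()
values = nd.random.normal(shape=(batch, seq_len, hidden))        # encoder states
query = nd.random.normal(shape=(batch, hidden))                   # decoder state
score = V(nd.tanh(W1(values) + W2(nd.expand_dims(query, 1))))     # (batch, seq_len, 1)
weights = nd.softmax(score, axis=1)                                # sums to 1 over seq_len
context = nd.sum(weights * values, axis=1)                         # (batch, hidden)
print(weights.shape, context.shape)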
def forward(self, current, previous, doc_encode):
    """Compute the extraction probability for the current sentence.

    Args:
        current: h_j (batch_size, sentence_hidden_size * 2)
        previous: s_j (batch_size, sentence_hidden_size * 2)
        doc_encode: d (batch_size, ndoc_dims)
    """
    # content: (batch_size, 1)
    content = self.content_encoder(current)
    # salience: (batch_size, sentence_hidden_size * 2)
    salience = self.salience_encoder(doc_encode)
    salience = current * salience
    # salience: (batch_size,)
    salience = nd.sum_axis(salience, -1)
    # salience: (batch_size, 1)
    salience = nd.expand_dims(salience, -1)
    # novelty: (batch_size, sentence_hidden_size * 2)
    novelty = self.novelty_encoder(nd.tanh(previous))
    novelty = current * novelty
    # novelty: (batch_size,)
    novelty = nd.sum_axis(novelty, -1)
    # novelty: (batch_size, 1)
    novelty = nd.expand_dims(novelty, -1)
    # P: (batch_size, 1)
    P = nd.sigmoid(content + salience - novelty)
    return P
def lstm_rnn(inputs, state_h, state_c, *params):
    '''
    :param inputs: the inputs
    :param state_h: the hidden state from the previous time step
    :param state_c: the cell state from the previous time step
    :param params: the parameters
    :return: the outputs

    Input gate:      I_t = sigmoid(X_t * W_xi + H_{t-1} * W_hi + b_i)
    Forget gate:     F_t = sigmoid(X_t * W_xf + H_{t-1} * W_hf + b_f)
    Output gate:     O_t = sigmoid(X_t * W_xo + H_{t-1} * W_ho + b_o)
    Candidate cell:  C~_t = tanh(X_t * W_xc + H_{t-1} * W_hc + b_c)
    Cell state:      C_t = F_t * C_{t-1} + I_t * C~_t
    Hidden state:    H_t = O_t * tanh(C_t)
    Output:          Y_t = H_t * W_hy + b_y
    '''
    [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o,
     W_xc, W_hc, b_c, W_hy, b_y] = params
    H = state_h  # hidden state; together with the input it controls how much new input enters the latest memory
    C = state_c  # memory cell; records this step's state and passes it to the next step
    outputs = []
    for X in inputs:
        # input gate
        I = nd.sigmoid(nd.dot(X, W_xi) + nd.dot(H, W_hi) + b_i)
        # candidate cell state, controls how much of the input is added to the new memory
        C_tilda = nd.tanh(nd.dot(X, W_xc) + nd.dot(H, W_hc) + b_c)
        # forget gate, controls how much information from the previous step is forgotten
        F = nd.sigmoid(nd.dot(X, W_xf) + nd.dot(H, W_hf) + b_f)
        # output gate
        O = nd.sigmoid(nd.dot(X, W_xo) + nd.dot(H, W_ho) + b_o)
        # update the cell state passed to the next step
        C = F * C + C_tilda * I
        H = O * C.tanh()
        # output at this time step
        Y = nd.dot(H, W_hy) + b_y
        outputs.append(Y)
    return (outputs, H, C)
def rnn(inputs, state, params):
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H, )
def lstm(self, inputs, state):
    [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o,
     W_xc, W_hc, b_c, W_hq, b_q] = self.params
    (H, C) = state
    outputs = []
    for X in inputs:
        I = nd.sigmoid(nd.dot(X, W_xi) + nd.dot(H, W_hi) + b_i)
        F = nd.sigmoid(nd.dot(X, W_xf) + nd.dot(H, W_hf) + b_f)
        O = nd.sigmoid(nd.dot(X, W_xo) + nd.dot(H, W_ho) + b_o)
        C_tilda = nd.tanh(nd.dot(X, W_xc) + nd.dot(H, W_hc) + b_c)
        C = I * C_tilda + F * C
        H = nd.tanh(C) * O
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H, C)
def rnn(inputs, state, *params):
    H = state
    W_xh, W_hh, b_h, W_hy, b_y = params
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hy) + b_y
        outputs.append(Y)
    return outputs, H
def forward(self, x, crisp=False):
    pick_index = nd.broadcast_to(self._dim.data(), x.shape[0])
    x = nd.pick(x, pick_index, keepdims=True)
    x = x - self._split.data()
    if not crisp:
        x = x * nd.relu(self._sharpness.data()) * self._gate()
        # x = x * nd.relu(self._sharpness.data())
    return nd.tanh(x)
def rnn(inputs, H, W_xh, W_hh, b_h, W_hy, b_y):
    # inputs: num_steps matrices of shape batch_size * vocab_size
    # H: matrix of shape batch_size * hidden_size
    # outputs: num_steps matrices of shape batch_size * vocab_size
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hy) + b_y
        outputs.append(Y)
    return outputs, H
def rnn(inputs, state, params):
    # inputs and outputs are both lists of num_steps (number of time steps)
    # matrices with shape (batch_size, vocab_size)
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state  # initial hidden state
    outputs = []
    for X in inputs:  # one character per sequence element, fed in as a batch
        # update H: each H combines the current input with the previous H
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H,)  # return the per-time-step outputs and the final H
def lstm_rnn(inputs, h, c, temperature=1.0):
    outputs = []
    for X in inputs:
        g = nd.tanh(nd.dot(X, Wxg) + nd.dot(h, Whg) + bg)
        i = nd.sigmoid(nd.dot(X, Wxi) + nd.dot(h, Whi) + bi)
        f = nd.sigmoid(nd.dot(X, Wxf) + nd.dot(h, Whf) + bf)
        o = nd.sigmoid(nd.dot(X, Wxo) + nd.dot(h, Who) + bo)
        c = f * c + i * g
        h = o * nd.tanh(c)
        yhat_linear = nd.dot(h, Why) + by
        yhat = softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h, c)
def forward(self, input_data):
    x = nd.transpose(input_data, axes=(1, 0, 2))
    h = nd.transpose(self.gru(x), axes=(1, 0, 2))  # (m, 60, 100)
    h = nd.tanh(h)
    g = self.att(h)  # (m, 60, 1)
    g = nd.softmax(g, axis=1)
    gt = nd.transpose(g, axes=(0, 2, 1))  # (m, 1, 60)
    n = nd.batch_dot(gt, h)
    y = self.att_out(n)
    return self.output(y)
def rnn(inputs, state, params):
    # inputs and output are both num_steps matrices of shape (batch_size, vocab_size)
    output = []
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state  # the initial hidden state (see above), shape (batch_size, num_hiddens)
    for X in inputs:  # iterate over the num_steps inputs
        # compute the hidden state; it is also kept as part of the return value
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hq) + b_q
        output.append(Y)  # append
    return output, (H, )
def rnn(inputs, state, params):
    # inputs and outputs are both lists of num_steps matrices with shape
    # (batch_size, vocab_size)
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H,)
def rnn(inputs, state, params):
    w_xh, w_hh, b_h, w_ho, b_q = params
    H, = state
    outputs = []
    # inputs and outputs are both num_steps matrices of shape (batch_size, vocab_size)
    for X in inputs:
        H = nd.tanh(nd.dot(X, w_xh) + nd.dot(H, w_hh) + b_h)
        Y = nd.dot(H, w_ho) + b_q
        outputs.append(Y)
    return outputs, (H,)
def simple_rnn(inputs, state, temperature=1.0):
    outputs = []
    h = state
    for X in inputs:
        h_linear = nd.dot(X, Wxh) + nd.dot(h, Whh) + bh
        h = nd.tanh(h_linear)
        yhat_linear = nd.dot(h, Why) + by
        yhat = softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h)
def rnn(inputs, state, params):
    # inputs and outputs are both num_steps matrices of shape (batch_size, vocab_size)
    # tanh is used as the activation function
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H, )
def rnn(self):
    W_xh, W_hh, b_h, W_hq, b_q = self.params
    H, = self.state
    outputs = []
    for X in self.inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H,)
def gru(inputs, state, params):
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = params
    H = state
    outputs = []
    for X in inputs:
        Z = nd.sigmoid(nd.dot(X, W_xz) + nd.dot(H, W_hz) + b_z)
        R = nd.sigmoid(nd.dot(X, W_xr) + nd.dot(H, W_hr) + b_r)
        H_ = nd.tanh(nd.dot(X, W_xh) + R * nd.dot(H, W_hh) + b_h)
        H = Z * H + (1 - Z) * H_
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, H
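# A quick shape check for the gru above (note its state is a single H matrix,
# not a tuple); the sizes and initial values are illustrative assumptions.
from mxnet import nd

batch_size, num_steps, vocab_size, num_hiddens = 2, 3, 12, 16

def _gate():
    return (nd.random.normal(scale=0.01, shape=(vocab_size, num_hiddens)),
            nd.random.normal(scale=0.01, shape=(num_hiddens, num_hiddens)),
            nd.zeros(num_hiddens))

params = [*_gate(), *_gate(), *_gate(),                                    # z, r, candidate
          nd.random.normal(scale=0.01, shape=(num_hiddens, vocab_size)),  # W_hq
          nd.zeros(vocab_size)]                                            # b_q
H0 = nd.zeros((batch_size, num_hiddens))
inputs = [nd.one_hot(nd.array([3, 7]), vocab_size) for _ in range(num_steps)]
outputs, H = gru(inputs, H0, params)
print(outputs[0].shape, H.shape)  # (2, 12) (2, 16)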
def rnn(inputs, state, *params):
    # inputs: num_steps matrices of shape batch_size * vocab_size.
    # H: matrix of shape batch_size * hidden_dim.
    # outputs: num_steps matrices of shape batch_size * vocab_size.
    H = state
    W_xh, W_hh, b_h, W_hy, b_y = params
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hy) + b_y
        outputs.append(Y)
    return (outputs, H)
def check_tanh():
    x = create_input_for_trigonometric_ops([-1 / 4, -1 / 2, 0, 1 / 4, 1 / 2])
    y = nd.tanh(x)
    # expected output for indices=(0, 1, -3, -2, -1) after applying tanh()
    expected_output = [np.tanh(-1 / 4), np.tanh(-1 / 2), 0,
                       np.tanh(1 / 4), np.tanh(1 / 2)]
    assert_correctness_of_trigonometric_ops(y, expected_output)
def rnn(inputs, state, params):
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    # print(H.shape)  # dimensions of the hidden state
    # print(len(inputs), len(inputs[0]), len(inputs[0][0]), len(inputs[0][0][0]))
    # the samples are split up and used across multiple training steps
    outputs = []
    for X in inputs:
        # print(X.shape)  # number of samples passed through the network each step
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H, )
def gru_rnn(inputs, h, temperature=1.0):
    outputs = []
    for X in inputs:
        z = nd.sigmoid(nd.dot(X, Wxz) + nd.dot(h, Whz) + bz)
        r = nd.sigmoid(nd.dot(X, Wxr) + nd.dot(h, Whr) + br)
        g = nd.tanh(nd.dot(X, Wxh) + nd.dot(r * h, Whh) + bh)
        h = z * h + (1 - z) * g
        yhat_linear = nd.dot(h, Why) + by
        yhat = softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h)
def forward(self, x): return nd.tanh(x)
def forward(self, x):
    y = nd.tanh(x)
    self.save_for_backward(x, y)
    return y