def __call__(self, X):
    d_x = X.dim()[0][0]
    d_y = X.dim()[0][1]
    g = dy.ones((d_x, d_y))   # fixed gain of ones
    b = dy.zeros((d_x, d_y))  # fixed bias of zeros
    # run every attention module and sum their outputs
    Y = []
    for attention in self.attention:
        Y.append(attention(X))
    Y = dy.esum(Y)
    # residual connection + layer norm around the attention block
    Y = dy.layer_norm(X + Y, g, b)
    # residual connection + layer norm around the (transposed) feed-forward block
    Y = dy.layer_norm(Y + dy.transpose(self.feedforward(dy.transpose(Y))), g, b)
    return Y
def transduce(self, seq: ExpressionSequence) -> ExpressionSequence:
    # residual connection: child transducer output plus the input sequence
    seq_tensor = self.child.transduce(seq).as_tensor() + seq.as_tensor()
    if self.layer_norm:
        # dy.layer_norm expects a vector, so fold the time dimension into the batch
        d = seq_tensor.dim()
        seq_tensor = dy.reshape(seq_tensor, (d[0][0],), batch_size=d[0][1] * d[1])
        seq_tensor = dy.layer_norm(seq_tensor, self.ln_g, self.ln_b)
        seq_tensor = dy.reshape(seq_tensor, d[0], batch_size=d[1])
    return ExpressionSequence(expr_tensor=seq_tensor)
def norm(x): """Layer Norm only handles a vector in dynet so fold extra dims into the batch.""" shape, batchsz = x.dim() first = shape[0] fold = np.prod(shape[1:]) x = dy.reshape(x, (first, ), batch_size=batchsz * fold) x = dy.layer_norm(x, a, b) return dy.reshape(x, shape, batch_size=batchsz)
def __call__(self, input_expr):
    g = dy.parameter(self.p_g)
    b = dy.parameter(self.p_b)
    (_, seq_len), batch_size = input_expr.dim()
    input = TimeDistributed()(input_expr)
    output = dy.layer_norm(input, g, b)
    return ReverseTimeDistributed()(output, seq_len, batch_size)
def norm(x): """Layer Norm only handles a vector in dynet so fold extra dims into the batch.""" shape, batchsz = x.dim() first = shape[0] fold = np.prod(shape[1:]) x = dy.reshape(x, (first,), batch_size=batchsz*fold) x = dy.layer_norm(x, a, b) return dy.reshape(x, shape, batch_size=batchsz)
def __call__(self, x):
    W = dy.parameter(self.W)
    b = dy.parameter(self.b)
    if self.ln:
        g = dy.parameter(self.g)
        y = dy.layer_norm(W * x, g, b)
        return self.act(y)
    else:
        y = dy.affine_transform([b, W, x])
        return self.act(y)
def __call__(self, x):
    if self.ln:
        return self.activation(
            layer_norm(parameter(self.W) * x, parameter(self.ln_s), parameter(self.b)))
    else:
        return self.activation(
            affine_transform([parameter(self.b), parameter(self.W), x]))
def __call__(self, input, train_mode):
    for layer_idx in range(len(self.expressions)):
        layer = self.expressions[layer_idx]
        if layer_idx == 0:
            # normalize the input before the first layer; layer[2]/layer[3] hold gain/bias
            input = dy.layer_norm(input, layer[2], layer[3])
        input = dy.affine_transform([layer[0], layer[1], input])
        if layer_idx != len(self.expressions) - 1:
            # activation (and dropout during training) on all but the last layer
            input = self.act_fun(input)
            if train_mode:
                input = dy.dropout(input, self.dropout_rate)
    return input
def test_layer_norm(self):
    dy.renew_cg()
    x = dy.inputTensor(self.v1)
    g = dy.inputTensor(self.v2)
    b = dy.inputTensor(self.v3)
    y = dy.layer_norm(x, g, b)
    l = dy.sum_elems(y)
    l_value = l.scalar_value()
    l.backward()
    y_np_value = self.v2 / self.v1.std() * (self.v1 - self.v1.mean()) + self.v3
    self.assertTrue(np.allclose(y.npvalue(), y_np_value))
def test_layer_norm(self):
    dy.renew_cg()
    x = dy.inputTensor(self.v1)
    g = dy.inputTensor(self.v2)
    b = dy.inputTensor(self.v3)
    y = dy.layer_norm(x, g, b)
    loss = dy.sum_elems(y)
    loss.backward()
    centered_v1 = self.v1 - self.v1.mean()
    y_np_value = self.v2 / self.v1.std() * centered_v1 + self.v3
    self.assertTrue(np.allclose(y.npvalue(), y_np_value))
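The tests above check that dy.layer_norm(x, g, b) computes g * (x - mean(x)) / std(x) + b elementwise. A self-contained sketch of the same check with made-up values (the vectors below are illustrative, not the original test fixtures):

import dynet as dy
import numpy as np

dy.renew_cg()
v = np.array([1.0, 2.0, 3.0, 4.0])
g_np = np.array([0.5, 1.0, 1.5, 2.0])
b_np = np.array([0.1, 0.0, -0.1, 0.2])

y = dy.layer_norm(dy.inputTensor(v), dy.inputTensor(g_np), dy.inputTensor(b_np))
expected = g_np / v.std() * (v - v.mean()) + b_np   # same formula as the assertions above
assert np.allclose(y.npvalue(), expected, atol=1e-6)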
def __call__(self, obs, batched=False):
    out = obs if isinstance(obs, dy.Expression) else dy.inputTensor(obs, batched=batched)
    for i in range(self.n_layers):
        b, W = dy.parameter(self.bs[i]), dy.parameter(self.Ws[i])
        out = dy.affine_transform([b, W, out])
        if self.layer_norm and i != self.n_layers - 1:
            # layer norm on every hidden layer, but not on the output layer
            out = dy.layer_norm(out, self.ln_gs[i], self.ln_bs[i])
        if self.specified_activation:
            # per-layer activations; None means identity for that layer
            if self.activation[i] is not None:
                out = self.activation[i](out)
        else:
            out = self.activation(out)
    return out
def transduce(self, seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    # residual connection, with dropout on the child output during training
    if self.train and self.dropout > 0.0:
        seq_tensor = dy.dropout(self.child.transduce(seq).as_tensor(), self.dropout) + seq.as_tensor()
    else:
        seq_tensor = self.child.transduce(seq).as_tensor() + seq.as_tensor()
    if self.layer_norm:
        # fold the time dimension into the batch so dy.layer_norm sees a vector
        d = seq_tensor.dim()
        seq_tensor = dy.reshape(seq_tensor, (d[0][0],), batch_size=d[0][1] * d[1])
        seq_tensor = dy.layer_norm(seq_tensor, self.ln_g, self.ln_b)
        seq_tensor = dy.reshape(seq_tensor, d[0], batch_size=d[1])
    return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)
def transform(self, x: tt.Tensor) -> tt.Tensor:
    g = dy.parameter(self.p_g)
    b = dy.parameter(self.p_b)
    return dy.layer_norm(x, g, b)
def layer_norm(xs):
    head_shape, batch_size = xs[0].dim()
    g = dy.ones(head_shape)
    b = dy.zeros(head_shape)
    return [dy.layer_norm(x, g, b) for x in xs]
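An illustrative usage sketch for the per-position layer_norm helper above; the sequence length and vector dimension are assumptions. Each vector in the list is normalized independently with a fixed gain of ones and bias of zeros.

import dynet as dy
import numpy as np

dy.renew_cg()
xs = [dy.inputTensor(np.random.randn(8)) for _ in range(5)]  # a 5-step sequence of 8-dim vectors
normed = layer_norm(xs)                                      # relies on the helper defined above
print(normed[0].npvalue().mean())                            # close to 0 after normalization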
def transform(self, x: dy.Expression) -> dy.Expression:
    g = dy.parameter(self.p_g)
    b = dy.parameter(self.p_b)
    return dy.layer_norm(x, g, b)
def transform(self, x):
    g = self.p_g
    b = self.p_b
    return dy.layer_norm(x, g, b)
def __call__(self, x):
    g = dy.parameter(self.p_g)
    b = dy.parameter(self.p_b)
    return dy.layer_norm(x, g, b)
def transform(self, x):
    g = dy.parameter(self.p_g)
    b = dy.parameter(self.p_b)
    return dy.layer_norm(x, g, b)