def lstm_without_dropout(n_layer, dropout, hx, cx, ws, bs, xs):
    xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws]
    hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws]
    xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs]
    hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs]
    xs = [xs[i] for i in range(3)]
    ys = []
    for x in xs:
        cx_next = []
        hx_next = []
        for layer in range(n_layer):
            c = cx[layer]
            h = hx[layer]
            if layer != 0:
                # Only multiply ratio
                x = x * (1 / (1.0 - dropout))
            lstm_in = functions.linear(x, xws[layer], xbs[layer]) + \
                functions.linear(h, hws[layer], hbs[layer])
            c_new, h_new = functions.lstm(c, lstm_in)
            cx_next.append(c_new)
            hx_next.append(h_new)
            x = h_new
        cx = cx_next
        hx = hx_next
        ys.append(x)
    cy = functions.stack(cx)
    hy = functions.stack(hx)
    return hy, cy, ys
def __call__(self, x, W=None, b=None):
    """Perform the Linear operation with custom weights and bias.

    Args:
        x (float[][]): input tensor "x" to transform
        W (float[][]): input weights
        b (float[]): input bias

    Returns:
        float[][]
    """
    if W is None and b is None and self.W.data is None:
        if self.in_size is None:
            self._initialize_params(x.size // x.shape[0])
        else:
            self._initialize_params(self.in_size)
    if W is None:
        W = self.W
    if b is None:
        b = self.b
    if not self.no_bias:
        return F.linear(x, W, b)
    else:
        return F.linear(x, W)
def _call_1step(net: NStepRNNBase, hidden: ArrayLike, input: ArrayLike):
    if hidden is None:
        hidden = net.init_hx(input)[0]
    x = input
    h = hidden
    w = net.ws[0]
    b = net.bs[0]
    xw = F.concat([w[0], w[1], w[2]], axis=0)
    hw = F.concat([w[3], w[4], w[5]], axis=0)
    xb = F.concat([b[0], b[1], b[2]], axis=0)
    hb = F.concat([b[3], b[4], b[5]], axis=0)
    gru_x = F.linear(x, xw, xb)
    gru_h = F.linear(h, hw, hb)
    W_r_x, W_z_x, W_x = F.split_axis(gru_x, 3, axis=1)
    U_r_h, U_z_h, U_x = F.split_axis(gru_h, 3, axis=1)
    r = F.sigmoid(W_r_x + U_r_h)
    z = F.sigmoid(W_z_x + U_z_h)
    h_bar = F.tanh(W_x + r * U_x)
    h = F.linear_interpolate(z, hidden, h_bar)
    return h
def forward(self, *inputs):
    if len(inputs) == 3:
        x, W, b = inputs
        y = functions.linear(x, W, b)
    else:
        x, W = inputs
        y = functions.linear(x, W)
    return y,
def forward(self, *inputs):
    if len(inputs) == 3:
        x, W, b = inputs
        y = functions.linear(x, W, b, n_batch_axes=self.n_batch_axes)
    else:
        x, W = inputs
        y = functions.linear(x, W, n_batch_axes=self.n_batch_axes)
    return y,
def check_forward(self, x_data, W_data, b_data, y_expect):
    x = chainer.Variable(x_data)
    W = chainer.Variable(W_data)
    if b_data is None:
        y = functions.linear(x, W)
    else:
        b = chainer.Variable(b_data)
        y = functions.linear(x, W, b)
    gradient_check.assert_allclose(y_expect, y.data)
def check_forward(self, x_data, W_data, b_data, y_expect):
    x = chainer.Variable(x_data)
    W = chainer.Variable(W_data)
    if b_data is None:
        y = functions.linear(x, W)
    else:
        b = chainer.Variable(b_data)
        y = functions.linear(x, W, b)
    self.assertEqual(y.data.dtype, self.x_dtype)
    testing.assert_allclose(y_expect, y.data, **self.check_forward_options)
def check_forward(self, x_data, W_data, b_data, y_expect):
    x = chainer.Variable(x_data)
    W = chainer.Variable(W_data)
    if b_data is None:
        y = functions.linear(x, W)
    else:
        b = chainer.Variable(b_data)
        y = functions.linear(x, W, b)
    self.assertEqual(y.data.dtype, self.x_dtype)
    testing.assert_allclose(
        y_expect, y.data, **self.check_forward_options)
def error_and_accuracy(w_1, w_2, b_1, b_2, x_data, t_data):
    x = Variable(x_data)
    t = Variable(t_data)
    a_z = F.linear(x, w_1, b_1)
    z = F.tanh(a_z)
    a_y = F.linear(z, w_2, b_2)
    error = F.softmax_cross_entropy(a_y, t)
    accuracy = F.accuracy(a_y, t)
    return error.data, accuracy.data * 100
def lstm(self, h, x):
    lstm = self.lang_model.lstm[0]
    a = F.linear(x, lstm.w2, lstm.b2) + F.linear(h, lstm.w6, lstm.b6)
    i = F.linear(x, lstm.w0, lstm.b0) + F.linear(h, lstm.w4, lstm.b4)
    f = F.linear(x, lstm.w1, lstm.b1) + F.linear(h, lstm.w5, lstm.b5)
    o = F.linear(x, lstm.w3, lstm.b3) + F.linear(h, lstm.w7, lstm.b7)
    return a, i, f, o
def _step_rnn_tanh(rnn, x, state):
    assert isinstance(rnn, L.NStepRNNTanh)
    assert len(rnn.ws) == 1
    assert len(rnn.bs) == 1
    assert len(rnn.ws[0]) == 2
    assert len(rnn.bs[0]) == 2
    if state is None:
        xp = rnn.xp
        h = xp.zeros((len(x), rnn.out_size), dtype=np.float32)
    else:
        h = state
    w0, w1 = rnn.ws[0]
    b0, b1 = rnn.bs[0]
    h = F.tanh(F.linear(x, w0, b0) + F.linear(h, w1, b1))
    return h, h
def forward_batch(self, x1, x2):
    xp = cuda.get_array_module(x1.data)
    batch, slen, hidden = x2.shape
    return F.batch_matmul(
        F.concat([x1, xp.ones((batch, slen, 1), 'f')], 2),  # (batch, slen, hidden+1)
        F.reshape(F.linear(F.reshape(x2, (batch * slen, -1)), self.W),
                  (batch, slen, -1)),
        transb=True)
def __call__(self, x, y):
    """
    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples.
    y: Variable
        Feature of unlabeled samples.
    """
    g = F.broadcast_to(
        F.gaussian(
            np.array([0], dtype=np.float32),
            np.array([np.exp(1)], dtype=np.float32)),
        x.shape)
    x_g = x * g
    y_g = y * g
    x_g_norm = F.sum(x_g**2, axis=1)
    y_g_norm = F.sum(y_g**2, axis=1)
    x_g_y_g = F.linear(x_g, y_g)
    x_g_norm, x_g_y_g, y_g_norm = \
        F.broadcast(
            *[x_g_norm, x_g_y_g, F.expand_dims(y_g_norm, 1)])
    # F.exp(-(x_g_norm - 2 * x_g_y_g + y_g_norm))
    return F.exp(-x_g_norm + 2 * x_g_y_g - y_g_norm)
def __call__(self, x, y):
    """
    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples.
    y: Variable
        Feature of unlabeled samples.
    """
    g, x, y = F.broadcast(*[self.gamma, x, y])
    x_g = x * g
    y_g = y * g
    x_g_norm = F.sum(x_g**2, axis=1)
    y_g_norm = F.sum(y_g**2, axis=1)
    x_g_y_g = F.linear(x_g, y_g)
    x_g_norm, x_g_y_g, y_g_norm = \
        F.broadcast(
            *[x_g_norm, x_g_y_g, F.expand_dims(y_g_norm, 1)])
    # F.exp(-(x_g_norm - 2 * x_g_y_g + y_g_norm))
    return F.exp(-x_g_norm + 2 * x_g_y_g - y_g_norm)
def __call__(self, x, y):
    """
    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples.
    y: Variable
        Feature of unlabeled samples.
    """
    g, x, y = F.broadcast(*[self.gamma, x, y])
    x_g = x * g
    y_g = y * g
    x_g_norm = F.sum(x_g**2, axis=1)
    y_g_norm = F.sum(y_g**2, axis=1)
    x_g_y_g = F.linear(x_g, y_g)
    x_g_norm, x_g_y_g, y_g_norm = \
        F.broadcast(
            *[x_g_norm, x_g_y_g, F.expand_dims(y_g_norm, 1)])
    # F.exp(-(x_g_norm - 2 * x_g_y_g + y_g_norm))
    u = x_g_norm - 2 * x_g_y_g + y_g_norm
    print(np.min(u.data))
    print(len(np.where(u.data < 0)[0]), np.prod(u.data.shape))
    time.sleep(0.5)
    return F.exp(-x_g_norm + 2 * x_g_y_g - y_g_norm)
def _forward(self, x, q, drop=False):
    bs = len(x.data)
    nl = self.num_layers
    l = None
    if self.use_position_encoding:
        l = self._position_encoding(q)
    u = self._embedding(self.B, q, l)
    if self.use_position_encoding:
        l = self._position_encoding(x)
    for i in range(nl):
        u = self._attention(u, x, l, i)
    u = self.double(u)
    us = F.split_axis(u, 2, axis=1)
    xs = [F.linear(u, self.W.W) for u in us]  # xs: [batch x vocab, batch x vocab]
    # xs = [F.softmax(x) for x in xs]
    preds = [[] for _ in range(bs)]
    ids = [F.argmax(x, axis=1) for x in xs]  # ids: [batch x 1, batch x 1]
    for i in range(2):
        for j in range(bs):
            token = self.vec2txt([ids[i][j].data])
            preds[j].append(token)
    return xs, preds
def __call__(self, x, y):
    """
    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples.
    y: Variable
        Feature of unlabeled samples.
    """
    g, x, y = F.broadcast(*[self.gamma, x, y])
    x_g = x * g
    y_g = y * g
    x_g_norm = F.sum(x_g**2, axis=1)
    y_g_norm = F.sum(y_g**2, axis=1)
    x_g_y_g = F.linear(x_g, y_g)
    x_g_norm, x_g_y_g, y_g_norm = \
        F.broadcast(
            *[x_g_norm, x_g_y_g, F.expand_dims(y_g_norm, 1)])
    # F.exp(-(x_g_norm - 2 * x_g_y_g + y_g_norm))
    u = x_g_norm - 2 * x_g_y_g + y_g_norm
    print(np.min(u.data))
    print(len(np.where(u.data < 0)[0]), np.prod(u.data.shape))
    time.sleep(0.5)
    return F.exp(-x_g_norm + 2 * x_g_y_g - y_g_norm)
def forward(self, x):
    for i, (w, b) in enumerate(zip(self.lst_w, self.lst_b)):
        x = F.linear(x, w, b)
        if i != len(self.lst_w) - 1:
            x = F.tanh(x)
        else:
            return self._out_fn(x)
def forward(self, x_s, x_t, translate=False):
    """
    args
        x_s: array of padded source sentences.
        x_t: array of padded target sentences.
        translate: whether this function is used for translation or not.
    returns
        dec_out: encoder-decoder model's output.
        enc_out: encoder's output used for translation.
    """
    length_s, length_t = x_s.shape[1], x_t.shape[1]
    h_s = self.source_embed(x_s)
    h_t = self.target_embed(x_t)
    h_s += self.xp.array(self.position_encoding[None, :length_s])
    h_t += self.xp.array(self.position_encoding[None, :length_t])
    h_s = F.transpose(h_s, (0, 2, 1))
    h_t = F.transpose(h_t, (0, 2, 1))
    src_self_mask = self._get_padding_mask(x_s, x_s, self.config.pad_id)
    tgt_self_mask = self._get_padding_mask(x_t, x_t, self.config.pad_id)
    tgt_future_mask = self._get_future_mask(x_t)
    tgt_self_mask *= tgt_future_mask
    src_tgt_mask = self._get_padding_mask(x_s, x_t, self.config.pad_id)
    enc_out = self.enc(h_s, src_self_mask)
    dec_out = self.dec(h_t, enc_out, tgt_self_mask, src_tgt_mask)
    B, D, L = dec_out.shape
    dec_out = F.transpose(dec_out, (0, 2, 1)).reshape(B * L, D)
    dec_out = F.linear(dec_out, self.target_embed.W)
    if translate:
        return dec_out, enc_out
    else:
        return dec_out
def _translate_forward(self, enc_out, x_s, x_t):
    """Reuse enc_out for efficient calculation.

    args
        enc_out: encoder's output (fixed after being calculated once).
        x_s: array of source sentences. Note this x_s is not the same as
            the argument of the 'translate' function.
        x_t: array of target sentences. This argument changes gradually
            during auto-regression.
    returns
        dec_out: decoder's output
    """
    length_t = x_t.shape[1]
    h_t = self.target_embed(x_t)
    h_t += self.position_encoding[None, :length_t]
    h_t = F.transpose(h_t, (0, 2, 1))
    tgt_self_mask = self._get_padding_mask(x_t, x_t, self.config.pad_id)
    tgt_future_mask = self._get_future_mask(x_t)
    tgt_self_mask *= tgt_future_mask
    src_tgt_mask = self._get_padding_mask(x_s, x_t, self.config.pad_id)
    dec_out = self.dec(h_t, enc_out, tgt_self_mask, src_tgt_mask)
    B, D, L = dec_out.shape
    dec_out = F.transpose(dec_out, (0, 2, 1)).reshape(B * L, D)
    dec_out = F.linear(dec_out, self.target_embed.W)
    return dec_out
def __call__(self, x, y):
    """
    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples.
    y: Variable
        Feature of unlabeled samples.
    """
    g = F.broadcast_to(
        F.gaussian(np.array([0], dtype=np.float32),
                   np.array([np.exp(1)], dtype=np.float32)),
        x.shape)
    x_g = x * g
    y_g = y * g
    x_g_norm = F.sum(x_g**2, axis=1)
    y_g_norm = F.sum(y_g**2, axis=1)
    x_g_y_g = F.linear(x_g, y_g)
    x_g_norm, x_g_y_g, y_g_norm = \
        F.broadcast(
            *[x_g_norm, x_g_y_g, F.expand_dims(y_g_norm, 1)])
    # F.exp(-(x_g_norm - 2 * x_g_y_g + y_g_norm))
    return F.exp(-x_g_norm + 2 * x_g_y_g - y_g_norm)
def __call__(self, x, t):
    h = self.base(x, layers=['res5'])['res5']
    self.cam = h
    h = _global_average_pooling_2d(h)
    ############################################################
    # ArcFace head implemented on top of ResNet50
    ############################################################
    # --------------------- cos(theta) & phi(theta) ---------------------
    cosine = F.linear(F.normalize(h), F.normalize(self.weight))  # fc8
    sine = F.sqrt(F.clip(1.0 - F.square(cosine), 0, 1))
    phi = cosine * cos_m - sine * sin_m
    if easy_margin:
        phi = F.where(cosine.data > 0, phi, cosine)
    else:
        phi = F.where(cosine.data > th, phi, cosine - mm)
    # --------------------- convert label to one-hot ---------------------
    one_hot = cp.eye(10)[t].astype(cp.float32)
    one_hot = Variable(one_hot)
    # torch.where equivalent: out_i = x_i if condition_i else y_i
    output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
    output *= s
    # h = self.fc(h)
    return output
def __call__(self, x):
    # Can tie the weights by defining the decoder operation here using the
    # F.transpose function, and the 'decoder_bias' which we added above.
    # https://github.com/pfnet/chainer/issues/34
    h = F.sigmoid(self.encoder(x))
    h = F.linear(h, F.transpose(self.encoder.W), self.decoder_bias)
    return F.sigmoid(h)
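A minimal, self-contained sketch of the same tied-weight decode (the sizes, the encoder link, and decoder_bias here are assumptions for illustration, not taken from the snippet above): the decoder reuses the encoder's W transposed, so only the decoder bias adds parameters.

import numpy as np
import chainer.functions as F
import chainer.links as L

encoder = L.Linear(784, 256)                    # assumed encoder sizes
decoder_bias = np.zeros(784, dtype=np.float32)  # hypothetical decoder bias
x = np.random.rand(8, 784).astype(np.float32)

h = F.sigmoid(encoder(x))                                              # (8, 256)
x_rec = F.sigmoid(F.linear(h, F.transpose(encoder.W), decoder_bias))   # (8, 784)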
def __call__(self, x):
    x, t, l = x
    if chainer.config.train:
        self.lipschitz = None
    if getattr(chainer.config, 'lmt', False):
        if getattr(chainer.config, 'exact', False):
            if self.lipschitz is None:
                self.lipschitz = spectral_norm_exact(self.W.data)
            l = l * self.lipschitz
            x = super(Linear, self).__call__(x)
        else:
            if self.u is None:
                # for calculation of Lipschitz constant
                u = np.random.normal(size=(1, x.shape[1])).astype(np.float32)
                with self.init_scope():
                    self.u = chainer.Parameter(u)
                register_power_iter(self.u)
                if self._device_id is not None and self._device_id >= 0:
                    with chainer.cuda._get_device(self._device_id):
                        self.u.to_gpu()
            x = super(Linear, self).__call__(x)
            normalize(self.u.array)
            u = F.linear(self.u, self.W)
            l = l * l2_norm(u)
    else:
        x = super(Linear, self).__call__(x)
    return x, t, l
def __init__(self, n_units, n_vocab, encoder, max_memory, hops):
    super(MemNN, self).__init__()

    with self.init_scope():
        self.embeds = chainer.ChainList()
        self.temporals = chainer.ChainList()

    normal = initializers.Normal()
    # Share both embedding matrices between adjacent layers
    for _ in six.moves.range(hops + 1):
        self.embeds.append(L.EmbedID(n_vocab, n_units, initialW=normal))
        self.temporals.append(
            L.EmbedID(max_memory, n_units, initialW=normal))

    self.memories = [
        Memory(self.embeds[i], self.embeds[i + 1],
               self.temporals[i], self.temporals[i + 1], encoder)
        for i in six.moves.range(hops)
    ]
    # The question embedding is the same as the input embedding of the
    # first layer
    self.B = self.embeds[0]
    # The answer prediction matrix W is the same as the final output layer
    self.W = lambda u: F.linear(u, self.embeds[-1].W)

    self.encoder = encoder
    self.n_units = n_units
    self.max_memory = max_memory
    self.hops = hops
def forward_window(self, a, b, k, cs):
    # FIXME: u is to be Number??
    u = chainer.Variable(range(len(cs)))
    if any(isinstance(i, cuda.GPUArray) for i in a):
        u.to_gpu()
    window_weights = self.forward_window_weight(a, b, k, u)
    return F.linear(window_weights, cs)
def __call__(self, x, y):
    """
    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples.
    y: Variable
        Feature of unlabeled samples.
    """
    g, x, y = F.broadcast(*[self.gamma, x, y])
    x_g = x * g
    y_g = y * g
    x_g_norm = F.sum(x_g**2, axis=1)
    y_g_norm = F.sum(y_g**2, axis=1)
    x_g_y_g = F.linear(x_g, y_g)
    x_g_norm, x_g_y_g, y_g_norm = \
        F.broadcast(
            *[x_g_norm, x_g_y_g, F.expand_dims(y_g_norm, 1)])
    return F.exp(-x_g_norm + 2 * x_g_y_g - y_g_norm)
def linear():
    x = rand((1, 100, 1, 1))
    W = rand((200, 100), var=False)
    b = rand((200,), var=False)
    y = F.linear(x, W, b=b)
    y = y.reshape(1, 200, 1, 1)
    return {'input': x}, {'out': y}
def __call__(self, x, train=True):
    h1 = F.dropout(self.activation(self.l1(x)), train=train)
    if self.tied:
        return self.activation(
            F.linear(h1, F.transpose(self.l1.W), self.decoder_bias))
    else:
        return self.activation(self.l2(h1))
def __call__(self, x, seed):
    if seed not in self.calledValues:
        w = F.gaussian(self.muW, self.lnSigmaW)
        b = F.gaussian(self.muB, self.lnSigmaB)
        self.calledValues[seed] = (w, b)
    else:
        w, b = self.calledValues[seed]
    return F.linear(x, w, b)
def query(self, question, lengths, y):
    u = _encode(self.B, question, lengths)
    u = self.M1.query(u)
    u = self.M2.query(u)
    u = self.M3.query(u)
    # a = self.W(u)
    a = F.linear(u, self.E4.W)
    return F.softmax_cross_entropy(a, y), F.accuracy(a, y)
def forward(self, x, decode=False):
    if not decode:
        return super().forward(x)
    else:
        out = F.linear(x, self.W)
        if hasattr(self, "dec_mask"):
            out += self.dec_mask
        return out
def forward(self, inputs, device):
    if self.nobias:
        x, W = inputs
        b = None
    else:
        x, W, b = inputs
    y = functions.linear(x, W, b, n_batch_axes=self.n_batch_axes)
    return y,
def query(self, question, lengths):
    u = _encode(self.B, question, lengths)
    u = self.M1.query(u)
    u = self.M2.query(u)
    u = self.M3.query(u)
    # a = self.W(u)
    a = F.linear(u, self.E4.W)
    return a
def __call__(self, x, train=True):
    with chainer.using_config('train', True):
        h1 = F.dropout(self.activation(self.l1(x)))
    if self.tied:
        return self.activation(
            F.linear(h1, F.transpose(self.l1.W), self.decoder_bias))
    else:
        return self.activation(self.l2(h1))
def __call__(self, x):
    a = self.nac(x)
    g = F.sigmoid(F.linear(x, self.G, self.b))
    ag = g * a
    log_in = F.log(abs(x) + self.eps)
    m = F.exp(self.nac(log_in))
    md = (1 - g) * m
    return ag + md
def __call__(self, x):
    if self.rf == 1:
        size_out = x.shape[:-1] + (self.nf,)
        x = F.linear(x.reshape(-1, x.shape[-1]), self.w, self.b)
        x = x.reshape(*size_out)
    else:
        raise NotImplementedError
    return x
def _calc_distmat(self, h):
    bs = h.shape[0]
    h_l2_2 = F.sum(h**2, axis=1)
    H = F.broadcast_to(h_l2_2, (bs, bs))
    H_t = F.transpose(H)
    XX = F.linear(h, h)
    return (H_t - 2 * XX + H)
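A small check (array shapes assumed for illustration) of the identity used by _calc_distmat above: F.linear(h, h) is the Gram matrix h h^T, and combining it with the squared row norms gives the pairwise squared Euclidean distances.

import numpy as np
import chainer.functions as F

h = np.random.rand(4, 3).astype(np.float32)
gram = F.linear(h, h).array                 # (4, 4), equals h @ h.T
norms = np.sum(h ** 2, axis=1)              # (4,), squared row norms
dist = norms[:, None] - 2 * gram + norms[None, :]
ref = ((h[:, None, :] - h[None, :, :]) ** 2).sum(axis=2)
assert np.allclose(dist, ref, atol=1e-5)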
def test_1(self):
    n_batches = 1  # important
    in_dims = (2, 2)
    out_dim = 3
    x_shape = (n_batches,) + in_dims
    w_shape = (out_dim, numpy.prod(in_dims),)
    x = numpy.ones(x_shape, numpy.float32)
    w = numpy.ones(w_shape, numpy.float32)
    y = functions.linear(chainer.Variable(x), w)
    z = functions.sum(y)
    z.backward()
def __init__(self, cfg, vocab=40990, n_ctx=512):
    super(Model, self).__init__()
    self.vocab = vocab
    with self.init_scope():
        self.embed = L.EmbedID(vocab, cfg.n_embd,
                               initializers.Normal(scale=0.02))
        self.drop = lambda x: F.dropout(x, cfg.embd_pdrop)
        block = Block(n_ctx, cfg, scale=True)
        self.h = chainer.ChainList(
            *[copy.deepcopy(block) for _ in range(cfg.n_layer)])
        self.decoder = lambda x: F.linear(x, self.embed.W)
        # To reproduce the noise_shape parameter of the TF implementation
        self.clf_dropout = lambda x: F.dropout(x, cfg.clf_pdrop)
def __call__(self, x, y):
    """
    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples.
    y: Variable
        Feature of unlabeled samples.
    """
    g = self.gamma ** 2
    z = F.expand_dims((x - y) ** 2, axis=0)
    o = F.exp(-F.linear(z, g))
    return o
def __call__(self, x):
    """Applies the linear layer.

    Args:
        x (~chainer.Variable): Batch of input vectors.

    Returns:
        ~chainer.Variable: Output of the linear layer.

    """
    norm = F.batch_l2_norm_squared(self.W) ** 0.5
    norm_broadcasted = F.broadcast_to(
        F.expand_dims(norm, 1), self.W.data.shape)
    g_broadcasted = F.broadcast_to(
        F.expand_dims(self.g, 1), self.W.data.shape)
    return F.linear(x, g_broadcasted * self.W / norm_broadcasted, self.b)
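A minimal sketch (the 2-D weight, gain vector, and sizes are assumptions for illustration) of the weight normalization applied above: each output row of W is rescaled to g * W / ||W|| before the ordinary F.linear call, so the direction and the scale of each row are decoupled.

import numpy as np
import chainer
import chainer.functions as F

W = chainer.Variable(np.random.rand(5, 3).astype(np.float32))  # (out_size, in_size)
g = chainer.Variable(np.ones(5, dtype=np.float32))             # per-row gain (learnable in the real layer)
x = np.random.rand(2, 3).astype(np.float32)

row_norm = F.batch_l2_norm_squared(W) ** 0.5                    # (5,), one L2 norm per output row
scale = F.broadcast_to(F.expand_dims(g / row_norm, 1), W.shape)
y = F.linear(x, scale * W)                                      # (2, 5)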
def check_backward(self, x_data, W_data, b_data, y_grad):
    x = chainer.Variable(x_data)
    W = chainer.Variable(W_data)
    b = chainer.Variable(b_data)
    y = functions.linear(x, W, b)
    y.grad = y_grad
    y.backward()

    func = y.creator
    f = lambda: func.forward((x.data, W.data, b.data))
    gx, gW, gb = gradient_check.numerical_grad(
        f, (x.data, W.data, b.data), (y.grad,), eps=1e-2)

    gradient_check.assert_allclose(gx, x.grad)
    gradient_check.assert_allclose(gW, W.grad)
    gradient_check.assert_allclose(gb, b.grad)
def __call__(self, x_u_0, x_u_1):
    """
    Parameters
    -----------------
    x_u_0: Variable
        Feature of unlabeled samples.
    x_u_1: Variable
        Feature of unlabeled samples.
    """
    ffnn_u_0 = self.layers["ffnn_u_0"]
    ffnn_u_1 = self.layers["ffnn_u_1"]
    f_0 = F.softmax(ffnn_u_0(x_u_0))
    f_1 = F.softmax(ffnn_u_1(x_u_1))
    mid_outputs_0 = ffnn_u_0.mid_outputs
    mid_outputs_1 = ffnn_u_1.mid_outputs

    L = len(self.dims[1:])
    similarities = list(self.similarities.values())

    # Efficient computation
    ## sample similarity W^l summed over l
    W = 0
    for l in range(L):
        W += similarities[l](mid_outputs_0[l], mid_outputs_1[l])

    ## class similarity
    f_0_norm = F.sum(f_0**2, axis=1)
    f_1_norm = F.sum(f_1**2, axis=1)
    f_0_f_1 = F.linear(f_0, f_1)
    f_0_norm, f_0_f_1, f_1_norm = \
        F.broadcast(
            *[f_0_norm, f_0_f_1, F.expand_dims(f_1_norm, 1)])
    F_ = f_0_norm - 2 * f_0_f_1 + f_1_norm

    print(np.max(F_.data))
    print(np.min(F_.data))
    print(len(np.where(F_.data < 0)[0]), np.prod(F_.data.shape))

    loss = F.sum(W * F_) / (self.batch_size * 2)
    self.loss = loss
    return loss
def __call__(self, h):
    if len(h.shape) != 4:
        return 0
    # (b, c, h, w) -> (b, h, w, c) -> (b, h*w, c)
    h = F.transpose(h, (0, 2, 3, 1))
    shape = h.shape
    b, n, c = shape[0], shape[1] * shape[2], shape[3]
    h = F.reshape(h, (b, n, c))
    s = 0
    xp = cuda.get_array_module(h.data)
    I_ = xp.identity(n)
    I_ = Variable(to_device(I_, device))
    for h_ in h:
        s += F.sum(F.square(F.linear(h_, h_) - I_))
    l = s / (b * n * c)
    return l
def test_zero(self):
    n_batch_axes = 0
    with self.assertRaises(ValueError):
        functions.linear(self.x, self.W, n_batch_axes=n_batch_axes)
def forward_window_weight(self, a, b, k, u):
    w = k - u
    w = w * w * b
    return F.linear(a, F.exp(-w))
def __call__(self, x):
    out = F.linear(x, self.W, self.b)
    if self.scale is not None:
        out *= F.broadcast_to(self.scale[None], out.shape)
    return out
def decode(self, x, train=True):
    if self.tied:
        return self.activation(
            F.linear(x, F.transpose(self.l1.W), self.decoder_bias))
    else:
        return self.activation(self.l2(x))
def __init__(self, model, cfg):
    super(LMHead, self).__init__()
    self.n_embd = cfg.n_embd
    self.decoder = lambda x: F.linear(x, model.embed.W)
def test_negative(self):
    n_batch_axes = -1
    with self.assertRaises(ValueError):
        functions.linear(self.x, self.W, n_batch_axes=n_batch_axes)