def call(self, x, params, state, **kwargs):
    """Merge per-head activations and apply the output projection.

    Args:
        x: Array indexed as (n_batch * n_heads, seqlen, d_head); the head
            count is read from ``self._n_heads``.
        params: Matrix right-multiplied onto the merged activations via
            ``np.dot``; presumably (n_heads * d_head, d_out) -- confirm
            against the code that creates it.
        state: Passed through untouched.
        **kwargs: Accepted and discarded.

    Returns:
        A ``(output, state)`` pair where ``output`` is the projection of the
        (n_batch, seqlen, n_heads * d_head) merged activations.
    """
    del kwargs  # accepted but not used
    seqlen, d_head = x.shape[1], x.shape[2]
    n_heads = self._n_heads

    # (n_batch * n_heads, seqlen, d_head) -> (n_batch, n_heads, seqlen, d_head)
    by_head = np.reshape(x, (-1, n_heads, seqlen, d_head))
    # -> (n_batch, seqlen, n_heads, d_head)
    by_position = np.transpose(by_head, (0, 2, 1, 3))
    # Concatenate the heads along the feature axis.
    merged = np.reshape(by_position, (-1, seqlen, n_heads * d_head))

    return np.dot(merged, params), state
def call(self, x, params, state, **kwargs):
    """Project the input and split the result into attention heads.

    Args:
        x: Array indexed as (n_batch, seqlen, d_feature).
        params: Matrix right-multiplied onto ``x`` via ``np.dot``; its output
            width must equal ``self._n_heads * self._d_head`` for the reshape
            to succeed.
        state: Passed through untouched.
        **kwargs: Accepted and discarded.

    Returns:
        A ``(heads, state)`` pair where ``heads`` is indexed as
        (n_batch * n_heads, seqlen, d_head).
    """
    del kwargs  # accepted but not used
    n_batch, seqlen = x.shape[0], x.shape[1]
    projected = np.dot(x, params)

    # (n_batch, seqlen, n_heads * d_head) -> (n_batch, seqlen, n_heads, d_head)
    per_head = np.reshape(
        projected, (n_batch, seqlen, self._n_heads, self._d_head)
    )
    # -> (n_batch, n_heads, seqlen, d_head)
    heads_first = np.transpose(per_head, (0, 2, 1, 3))
    # Fold the head axis into the batch axis.
    folded = np.reshape(heads_first, (-1, seqlen, self._d_head))

    return folded, state
def call(self, x, params, **kwargs):
    """Apply an affine map to ``x``.

    Args:
        x: Input passed as the left operand of ``np.dot``.
        params: A two-element sequence ``(weights, bias)``.
        **kwargs: Accepted and discarded.

    Returns:
        ``np.dot(x, weights) + bias``.
    """
    del kwargs  # accepted but not used
    weights, bias = params
    projected = np.dot(x, weights)
    return projected + bias
def call(self, params, inputs, **kwargs):
    """Apply an affine map to ``inputs``.

    Args:
        params: A two-element sequence ``(weights, bias)``.
        inputs: Input passed as the left operand of ``np.dot``.
        **kwargs: Accepted and discarded.

    Returns:
        ``np.dot(inputs, weights) + bias``.
    """
    del kwargs  # accepted but not used
    weights, bias = params
    linear_part = np.dot(inputs, weights)
    return linear_part + bias
def apply_fun(params, inputs, **kwargs):
    """Apply an affine map to ``inputs``.

    Args:
        params: A two-element sequence ``(weights, bias)``.
        inputs: Input passed as the left operand of ``np.dot``.
        **kwargs: Accepted and discarded.

    Returns:
        ``np.dot(inputs, weights) + bias``.
    """
    del kwargs  # extra keyword arguments are intentionally ignored
    weights, bias = params
    product = np.dot(inputs, weights)
    return product + bias
def apply_fun(params, inputs, **kwargs):
    """Apply an affine map to ``inputs``.

    Args:
        params: A two-element sequence ``(weights, bias)``.
        inputs: Input passed as the left operand of ``np.dot``.
        **kwargs: Accepted for interface compatibility; never read.

    Returns:
        ``np.dot(inputs, weights) + bias``.
    """
    weights, bias = params
    product = np.dot(inputs, weights)
    return product + bias
def forward(self, x, params=(), state=(), **kwargs):
    """Apply an affine map to ``x`` and hand the state back unchanged.

    Args:
        x: Input passed as the left operand of ``np.dot``.
        params: A two-element sequence ``(weights, bias)``. The empty default
            cannot be unpacked, so callers must supply it.
        state: Passed through untouched.
        **kwargs: Accepted and discarded.

    Returns:
        A ``(output, state)`` pair where ``output`` is
        ``np.dot(x, weights) + bias``.
    """
    del kwargs  # accepted but not used
    weights, bias = params
    output = np.dot(x, weights) + bias
    return output, state