def _one_directional_loop(di):
    # di=0, forward GRU
    # di=1, backward GRU
    xs_list = xs_next if di == 0 else reversed(xs_next)
    layer_idx = direction * layer + di
    h = hx[layer_idx]
    h_list = []
    for x in xs_list:
        batch = x.shape[0]
        if h.shape[0] > batch:
            h, h_rest = split_axis.split_axis(h, [batch], axis=0)
        else:
            h_rest = None

        if layer > 0:
            x = dropout.dropout(x, ratio=dropout_ratio)

        gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx])
        gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx])

        W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1)
        U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1)

        r = sigmoid.sigmoid(W_r_x + U_r_h)
        z = sigmoid.sigmoid(W_z_x + U_z_h)
        h_bar = tanh.tanh(W_x + r * U_x)
        h_bar = (1 - z) * h_bar + z * h
        if h_rest is not None:
            h = concat.concat([h_bar, h_rest], axis=0)
        else:
            h = h_bar
        h_list.append(h_bar)
    return h, h_list
def set_state(self, c, h):
    h = split_axis.split_axis(h, self.num_layers, 1, True)
    c = split_axis.split_axis(c, self.num_layers, 1, True)
    for layer, c, h in six.moves.zip(self, c, h):
        assert isinstance(h, chainer.Variable)
        assert isinstance(c, chainer.Variable)
        layer.set_state(c, h)
def set_state(self, c=None, h=None):
    if c is not None:
        c = split_axis.split_axis(c, self.num_layers, 1, True)
    if h is not None:
        h = split_axis.split_axis(h, self.num_layers, 1, True)
    if 'set_state' in dir(self[0]):
        for layer_id, layer in enumerate(self):
            layer_params = inspect.getargspec(layer.set_state)[0]
            if 'h' in layer_params:
                if 'c' in layer_params:
                    if h is None:
                        if c is None:
                            layer.set_state(None, None)
                        else:
                            layer.set_state(c[layer_id], None)
                    else:
                        if c is None:
                            layer.set_state(None, h[layer_id])
                        else:
                            layer.set_state(c[layer_id], h[layer_id])
                else:
                    assert c is None
                    if h is None:
                        layer.set_state(None)
                    else:
                        layer.set_state(h[layer_id])
def forward(self, *cshsx): """Returns new cell state and output of Child-Sum TreeLSTM. Args: cshsx (list of :class:`~chainer.Variable`): Variable arguments which include all cell vectors and all output vectors of variable children, and an input vector. Returns: tuple of ~chainer.Variable: Returns :math:`(c_{new}, h_{new})`, where :math:`c_{new}` represents new cell state vector, and :math:`h_{new}` is new output vector. """ cs = cshsx[:len(cshsx) // 2] hs = cshsx[len(cshsx) // 2:-1] x = cshsx[-1] assert(len(cshsx) % 2 == 1) assert(len(cs) == len(hs)) if x is None: if any(c is not None for c in cs): base = [c for c in cs if c is not None][0] elif any(h is not None for h in hs): base = [h for h in hs if h is not None][0] else: raise ValueError('All inputs (cs, hs, x) are None.') batchsize, dtype = base.shape[0], base.dtype x = self.xp.zeros( (batchsize, self.in_size), dtype=dtype) W_x_in = self.W_x(x) W_x_aio_in, W_x_f_in = split_axis.split_axis( W_x_in, [3 * self.state_size], axis=1) if len(hs) == 0: aio_in = W_x_aio_in a, i, o = split_axis.split_axis(aio_in, 3, axis=1) c = sigmoid.sigmoid(i) * tanh.tanh(a) h = sigmoid.sigmoid(o) * tanh.tanh(c) return c, h hs = self._pad_zero_nodes( hs, (x.shape[0], self.state_size), dtype=x.dtype) cs = self._pad_zero_nodes( cs, (x.shape[0], self.state_size), dtype=x.dtype) aio_in = self.W_h_aio(sum(hs)) + W_x_aio_in W_h_fs_in = concat.concat(split_axis.split_axis( self.W_h_f(concat.concat(hs, axis=0)), len(hs), axis=0), axis=1) f_in = W_h_fs_in + \ concat.concat([W_x_f_in] * len(hs), axis=1) tree_lstm_in = concat.concat([aio_in, f_in], axis=1) return tree_lstm.tree_lstm(*(cs + (tree_lstm_in, )))
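# A hedged usage sketch for the Child-Sum TreeLSTM forward pass above,
# assuming the standard `chainer.links.ChildSumTreeLSTM` link interface and
# arbitrary sizes. The call takes all child cell states, then all child
# outputs, then the input vector, i.e. (c_1, ..., c_k, h_1, ..., h_k, x).
import numpy as np
import chainer.links as L

treelstm = L.ChildSumTreeLSTM(in_size=4, out_size=6)
batch = 2
c1 = np.zeros((batch, 6), dtype=np.float32)   # cell states of two children
c2 = np.zeros((batch, 6), dtype=np.float32)
h1 = np.zeros((batch, 6), dtype=np.float32)   # outputs of two children
h2 = np.zeros((batch, 6), dtype=np.float32)
x = np.random.randn(batch, 4).astype(np.float32)
c_new, h_new = treelstm(c1, c2, h1, h2, x)    # both have shape (batch, 6)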
def _one_directional_loop(di):
    # di=0, forward RNN
    # di=1, backward RNN
    xs_list = xs_next if di == 0 else reversed(xs_next)
    layer_idx = direction * layer + di
    h = hx[layer_idx]
    h_list = []
    for x in xs_list:
        batch = x.shape[0]
        if h.shape[0] > batch:
            h, h_rest = split_axis.split_axis(h, [batch], axis=0)
        else:
            h_rest = None

        if layer > 0:
            x = dropout.dropout(x, ratio=dropout_ratio)

        rnn_in = (linear.linear(x, xws[layer_idx], xbs[layer_idx]) +
                  linear.linear(h, hws[layer_idx], hbs[layer_idx]))
        if activation == 'tanh':
            h_bar = tanh.tanh(rnn_in)
        elif activation == 'relu':
            h_bar = relu.relu(rnn_in)

        if h_rest is not None:
            h = concat.concat([h_bar, h_rest], axis=0)
        else:
            h = h_bar
        h_list.append(h_bar)
    return h, h_list
def __call__(self, c, h): """Updates the internal state and returns the Cell outputs. Remember to treat this Grid cell as if its an LSTM and pass ``c`` as well as ``h``. Only parts of ``c`` will be used depending on whether there is a LSTM or not. Args: c (~chainer.Variable): The previous memory information. h (~chainer.Variable): The previous state information. Returns: tuple of ~chainer.Variable: Returns ``(c_new, h_new)``, where ``c_new`` represents new cell state, and ``h_new`` is updated output. Parts of ``c_new`` will be useless. """ assert h is not None assert c is not None c = split_axis.split_axis(c, self.out_indices, 1, True) h_list = [] h_curr = None for layer_id, layer in enumerate(self): layer_params = inspect.getargspec(layer)[0] if 'c' in layer_params: h_curr = layer(c[layer_id], h) else: h_curr = (c[layer_id], layer(h)) h_list.append(h_curr) h_new = concat.concat([x[1] for x in h_list], 1) c_new = concat.concat([x[0] for x in h_list], 1) return c_new, h_new
def _gru(x, h, c, w, b):
    xw = concat.concat([w[0], w[1], w[2]], axis=0)
    hw = concat.concat([w[3], w[4], w[5]], axis=0)
    xb = concat.concat([b[0], b[1], b[2]], axis=0)
    hb = concat.concat([b[3], b[4], b[5]], axis=0)

    gru_x = linear.linear(x, xw, xb)
    gru_h = linear.linear(h, hw, hb)

    W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1)
    U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1)

    r = sigmoid.sigmoid(W_r_x + U_r_h)
    z = sigmoid.sigmoid(W_z_x + U_z_h)
    h_bar = tanh.tanh(W_x + r * U_x)
    return (1 - z) * h_bar + z * h, None
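# A minimal sketch (not from the original source) of driving the `_gru` step
# function above directly. It assumes the surrounding module imports `concat`,
# `linear`, `split_axis`, `sigmoid` and `tanh` from `chainer.functions`, and
# that the six weight matrices / bias vectors follow the n-step GRU layout:
# w[0..2] and b[0..2] act on the input x, w[3..5] and b[3..5] on the state h.
import numpy as np
import chainer


def _gru_step_sketch(in_size=4, n_units=3, batch=2):
    rng = np.random.RandomState(0)
    x = chainer.Variable(rng.randn(batch, in_size).astype(np.float32))
    h = chainer.Variable(np.zeros((batch, n_units), dtype=np.float32))
    w = ([rng.randn(n_units, in_size).astype(np.float32) for _ in range(3)] +
         [rng.randn(n_units, n_units).astype(np.float32) for _ in range(3)])
    b = [np.zeros(n_units, dtype=np.float32) for _ in range(6)]
    h_new, _ = _gru(x, h, None, w, b)   # the GRU step ignores the cell state c
    return h_new                        # shape (batch, n_units)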
def argmax_crf1d(cost, xs): alpha = xs[0] alphas = [] max_inds = [] for x in xs[1:]: batch = x.shape[0] if alpha.shape[0] > batch: alpha, alpha_rest = split_axis.split_axis(alpha, [batch], axis=0) alphas.append(alpha_rest) else: alphas.append(None) b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost) scores = b_alpha + b_cost max_ind = minmax.argmax(scores, axis=1) max_inds.append(max_ind) alpha = minmax.max(scores, axis=1) + x inds = minmax.argmax(alpha, axis=1) path = [inds.data] for m, a in zip(max_inds[::-1], alphas[::-1]): inds = select_item.select_item(m, inds) if a is not None: inds = concat.concat([inds, minmax.argmax(a, axis=1)], axis=0) path.append(inds.data) path.reverse() score = minmax.max(alpha, axis=1) for a in alphas[::-1]: if a is None: continue score = concat.concat([score, minmax.max(a, axis=1)], axis=0) return score, path
def __call__(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ lstm_in = self.upward(x) if self.h is not None: lstm_in += self.lateral(self.h) if self.c is None: xp = self.xp with cuda.get_device_from_id(self._device_id): self.c = variable.Variable( xp.zeros((x.shape[0], self.state_size), dtype=x.dtype)) lstm_in = reshape.reshape( lstm_in, (len(lstm_in.data), lstm_in.shape[1] // 4, 4)) a, i, f, o = split_axis.split_axis(lstm_in, 4, 2) a = reshape.reshape(a, (len(a.data), a.shape[1])) i = reshape.reshape(i, (len(i.data), i.shape[1])) f = reshape.reshape(f, (len(f.data), f.shape[1])) o = reshape.reshape(o, (len(o.data), o.shape[1])) peep_in_i = self.peep_i(self.c) peep_in_f = self.peep_f(self.c) a = tanh.tanh(a) i = sigmoid.sigmoid(i + peep_in_i) f = sigmoid.sigmoid(f + peep_in_f) self.c = a * i + f * self.c peep_in_o = self.peep_o(self.c) o = sigmoid.sigmoid(o + peep_in_o) self.h = o * tanh.tanh(self.c) return self.h
def __call__(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ lstm_in = self.upward(x) if self.h is not None: lstm_in += self.lateral(self.h) if self.c is None: xp = self.xp self.c = variable.Variable(xp.zeros((x.shape[0], self.state_size), dtype=x.dtype), volatile="auto") lstm_in = reshape.reshape(lstm_in, (len(lstm_in.data), lstm_in.shape[1] // 4, 4)) a, i, f, o = split_axis.split_axis(lstm_in, 4, 2) a = reshape.reshape(a, (len(a.data), a.shape[1])) i = reshape.reshape(i, (len(i.data), i.shape[1])) f = reshape.reshape(f, (len(f.data), f.shape[1])) o = reshape.reshape(o, (len(o.data), o.shape[1])) peep_in_i = self.peep_i(self.c) peep_in_f = self.peep_f(self.c) a = tanh.tanh(a) i = sigmoid.sigmoid(i + peep_in_i) f = sigmoid.sigmoid(f + peep_in_f) self.c = a * i + f * self.c peep_in_o = self.peep_o(self.c) o = sigmoid.sigmoid(o + peep_in_o) self.h = o * tanh.tanh(self.c) return self.h
def argmax_crf1d(cost, xs): """Computes a state that maximizes a joint probability of the given CRF. Args: cost (:class:`~chainer.Variable` or :ref:`ndarray`): A :math:`K \\times K` matrix which holds transition cost between two labels, where :math:`K` is the number of labels. xs (list of Variable): Input vector for each label. ``len(xs)`` denotes the length of the sequence, and each :class:`~chainer.Variable` holds a :math:`B \\times K` matrix, where :math:`B` is mini-batch size, :math:`K` is the number of labels. Note that :math:`B`\\ s in all the variables are not necessary the same, i.e., it accepts the input sequences with different lengths. Returns: tuple: A tuple of :class:`~chainer.Variable` object ``s`` and a :class:`list` ``ps``. The shape of ``s`` is ``(B,)``, where ``B`` is the mini-batch size. i-th element of ``s``, ``s[i]``, represents log-likelihood of i-th data. ``ps`` is a list of :ref:`ndarray`, and denotes the state that maximizes the point probability. ``len(ps)`` is equal to ``len(xs)``, and shape of each ``ps[i]`` is the mini-batch size of the corresponding ``xs[i]``. That means, ``ps[i].shape == xs[i].shape[0:1]``. """ alpha = xs[0] alphas = [] max_inds = [] for x in xs[1:]: batch = x.shape[0] if alpha.shape[0] > batch: alpha, alpha_rest = split_axis.split_axis(alpha, [batch], axis=0) alphas.append(alpha_rest) else: alphas.append(None) b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost) scores = b_alpha + b_cost max_ind = minmax.argmax(scores, axis=1) max_inds.append(max_ind) alpha = minmax.max(scores, axis=1) + x inds = minmax.argmax(alpha, axis=1) path = [inds.data] for m, a in zip(max_inds[::-1], alphas[::-1]): inds = select_item.select_item(m, inds) if a is not None: inds = concat.concat([inds, minmax.argmax(a, axis=1)], axis=0) path.append(inds.data) path.reverse() score = minmax.max(alpha, axis=1) for a in alphas[::-1]: if a is None: continue score = concat.concat([score, minmax.max(a, axis=1)], axis=0) return score, path
def forward(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ lstm_in = self.upward(x) if self.h is not None: lstm_in += self.lateral(self.h) if self.c is None: xp = self.xp with chainer.using_device(self.device): self.c = variable.Variable( xp.zeros((len(x), self.state_size), dtype=x.dtype)) lstm_in = reshape.reshape( lstm_in, (len(lstm_in), lstm_in.shape[1] // 4, 4)) a, i, f, o = split_axis.split_axis(lstm_in, 4, 2) a = reshape.reshape(a, a.shape[:2]) i = reshape.reshape(i, i.shape[:2]) f = reshape.reshape(f, f.shape[:2]) o = reshape.reshape(o, o.shape[:2]) peep_in_i = self.peep_i(self.c) peep_in_f = self.peep_f(self.c) a = tanh.tanh(a) i = sigmoid.sigmoid(i + peep_in_i) f = sigmoid.sigmoid(f + peep_in_f) self.c = a * i + f * self.c peep_in_o = self.peep_o(self.c) o = sigmoid.sigmoid(o + peep_in_o) self.h = o * tanh.tanh(self.c) return self.h
def argmax_crf1d(cost, xs): """Computes a state that maximizes a joint probability of the given CRF. Args: cost (Variable): A :math:`K \\times K` matrix which holds transition cost between two labels, where :math:`K` is the number of labels. xs (list of Variable): Input vector for each label. ``len(xs)`` denotes the length of the sequence, and each :class:`~chainer.Variable` holds a :math:`B \\times K` matrix, where :math:`B` is mini-batch size, :math:`K` is the number of labels. Note that :math:`B`\\ s in all the variables are not necessary the same, i.e., it accepts the input sequences with different lengths. Returns: tuple: A tuple of :class:`~chainer.Variable` object ``s`` and a :class:`list` ``ps``. The shape of ``s`` is ``(B,)``, where ``B`` is the mini-batch size. i-th element of ``s``, ``s[i]``, represents log-likelihood of i-th data. ``ps`` is a list of :class:`numpy.ndarray` or :class:`cupy.ndarray`, and denotes the state that maximizes the point probability. ``len(ps)`` is equal to ``len(xs)``, and shape of each ``ps[i]`` is the mini-batch size of the corresponding ``xs[i]``. That means, ``ps[i].shape == xs[i].shape[0:1]``. """ alpha = xs[0] alphas = [] max_inds = [] for x in xs[1:]: batch = x.shape[0] if alpha.shape[0] > batch: alpha, alpha_rest = split_axis.split_axis(alpha, [batch], axis=0) alphas.append(alpha_rest) else: alphas.append(None) b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost) scores = b_alpha + b_cost max_ind = minmax.argmax(scores, axis=1) max_inds.append(max_ind) alpha = minmax.max(scores, axis=1) + x inds = minmax.argmax(alpha, axis=1) path = [inds.data] for m, a in zip(max_inds[::-1], alphas[::-1]): inds = select_item.select_item(m, inds) if a is not None: inds = concat.concat([inds, minmax.argmax(a, axis=1)], axis=0) path.append(inds.data) path.reverse() score = minmax.max(alpha, axis=1) for a in alphas[::-1]: if a is None: continue score = concat.concat([score, minmax.max(a, axis=1)], axis=0) return score, path
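# A small decoding sketch for `argmax_crf1d` (illustrative; values are random).
# As the docstring requires, the two sequences (lengths 3 and 2) are sorted by
# length in descending order and transposed, so each xs[t] is a (B_t, K) score
# matrix with non-increasing B_t.
import numpy as np
import chainer
import chainer.functions as F

K = 3
cost = chainer.Variable(np.random.uniform(-1, 1, (K, K)).astype(np.float32))
xs = [
    chainer.Variable(np.random.uniform(-1, 1, (2, K)).astype(np.float32)),
    chainer.Variable(np.random.uniform(-1, 1, (2, K)).astype(np.float32)),
    chainer.Variable(np.random.uniform(-1, 1, (1, K)).astype(np.float32)),
]
score, path = F.argmax_crf1d(cost, xs)
# score has shape (2,); path[t] holds the best label ids for time step t,
# with path[t].shape == xs[t].shape[0:1].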
def _one_directional_loop(di): # di=0, forward RNN # di=1, backward RNN xs_list = xs_next if di == 0 else reversed(xs_next) layer_idx = direction * layer + di h = hx[layer_idx] h_list = [] for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) else: h_rest = None if layer > 0: x = dropout.dropout(x, ratio=dropout_ratio) rnn_in = ( linear.linear(x, xws[layer_idx], xbs[layer_idx]) + linear.linear(h, hws[layer_idx], hbs[layer_idx])) if activation == 'tanh': h_bar = tanh.tanh(rnn_in) elif activation == 'relu': h_bar = relu.relu(rnn_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) else: h = h_bar h_list.append(h_bar) return h, h_list
def __call__(self, x):
    xs = split_axis.split_axis(x, x.data.shape[1], 1)
    ret = []
    for x in xs:
        for l in self:
            x = l(x)
        ret.append(x)
    return ret
def __call__(self, h, x):
    # We compute r_x, z_x and h_x simultaneously
    r_z_h_x = self.W_r_z_h(x)
    r_x, z_x, h_x = split_axis.split_axis(
        r_z_h_x, (self.n_units, self.n_units * 2), axis=1)

    # We compute r_h and z_h simultaneously
    r_z_h = self.U_r_z(h)
    r_h, z_h = split_axis.split_axis(r_z_h, (self.n_units,), axis=1)

    # Finally we compute the output using the optimized functions
    return compute_output_GRU(
        z_x, z_h, h_x, h,
        self.U(sigmoid_a_plus_b_multiplied_by_c(r_x, r_h, h)))
def _one_directional_loop(f, xs, h, c, w, b):
    h_list = []
    for x in xs:
        batch = len(x)
        need_split = len(h) > batch
        if need_split:
            h, h_rest = split_axis.split_axis(h, [batch], axis=0)
            if c is not None:
                c, c_rest = split_axis.split_axis(c, [batch], axis=0)

        h, c = f(x, h, c, w, b)
        h_list.append(h)

        if need_split:
            h = concat.concat([h, h_rest], axis=0)
            if c is not None:
                c = concat.concat([c, c_rest], axis=0)
    return h, c, h_list
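# A hedged sketch of how the generic loop above can be paired with a step
# function such as `_gru`; the wiring shown here is an assumption for
# illustration, not the original caller. `xs` must be a list of per-time
# mini-batches with non-increasing batch sizes.
def run_gru_layer(xs, h0, w, b):
    # A GRU carries no cell state, so `c` stays None throughout the loop.
    h_last, _, h_list = _one_directional_loop(_gru, xs, h0, None, w, b)
    return h_last, h_list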
def _one_directional_loop(di): # di=0, forward LSTM # di=1, backward LSTM h_list = [] c_list = [] layer_idx = direction * layer + di h = hx[layer_idx] c = cx[layer_idx] if di == 0: xs_list = xs_next else: xs_list = reversed(xs_next) counter = 0 for x in xs_list: counter += 1 batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None c_rest = None if layer != 0: x = dropout.dropout(x, ratio=dropout_ratio) if counter == 4: lstm_in = linear.linear(x, xws[layer_idx], xbs[layer_idx]) else: lstm_in = linear.linear( x, xws[layer_idx], xbs[layer_idx]) + linear.linear( h, hws[layer_idx], hbs[layer_idx]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_list.append(h_bar) c_list.append(c_bar) return h, c, h_list, c_list
def __call__(self, x, train=True):
    x = reshape.reshape(x, (len(x.data), 1) + x.data.shape[1:])
    x = self.convolution(x, train)
    xs = split_axis.split_axis(x, x.data.shape[2], 2)
    for x in xs:
        x.data = self.xp.ascontiguousarray(x.data)
    for r in self.recurrent:
        r.reset_state()
    xs = self.recurrent(xs, train)
    xs = self._linear(xs, train)
    return xs
def __call__(self, x):
    x = self.embed(x)
    xs = split_axis.split_axis(x, x.data.shape[1], 1)
    ret = []
    for x in xs:
        x = self.rnn1(x)
        x = self.rnn2(x)
        x = self.linear(x)
        x = reshape.reshape(x, x.data.shape + (-1, ))
        ret.append(x)
    ret = concat.concat(ret, axis=2)
    return ret
def forward(self, x, y):
    """Updates the internal state and returns the LSTM outputs.

    Args:
        x (~chainer.Variable): A new batch from the input sequence.
        y (~chainer.Variable): An extra input that is fed into the cell
            state through ``self.w_y``.

    Returns:
        ~chainer.Variable: Outputs of updated LSTM units.

    """
    if self.upward.has_uninitialized_params:
        in_size = x.size // x.shape[0]
        self.upward._initialize_params(in_size)
        self._initialize_params()

    batch = x.shape[0]
    lstm_in = self.upward(x)
    if self.h is not None:
        h_size = self.h.shape[0]
        if batch == 0:
            h_rest = self.h
        elif h_size < batch:
            msg = ('The batch size of x must be equal to or less than the '
                   'size of the previous state h.')
            raise TypeError(msg)
        elif h_size > batch:
            h_update, h_rest = split_axis.split_axis(
                self.h, [batch], axis=0)
            lstm_in += self.lateral(h_update)
        else:
            lstm_in += self.lateral(self.h)
    if self.c is None:
        xp = self.xp
        self.c = variable.Variable(
            xp.zeros((batch, self.state_size), dtype=x.dtype),
            volatile='auto')

    r = reshape.reshape(
        lstm_in,
        (len(lstm_in.data), lstm_in.data.shape[1] // 4, 4) +
        lstm_in.data.shape[2:])
    a, i, f, o = [r[:, :, i] for i in range(4)]
    # self.c, y = lstm.lstm(self.c, lstm_in)
    a = tanh.tanh(a)
    i = sigmoid.sigmoid(i)
    f = sigmoid.sigmoid(f)
    o = sigmoid.sigmoid(o)
    self.c = a * i + f * self.c + tanh.tanh(self.w_y(y))
    self.h = o * tanh.tanh(self.c)
    return self.h
def separate(x, axis=0): """Separates an array along a given axis. This function separates an array along a given axis. For example, shape of an array is ``(2, 3, 4)``. When it separates the array with ``axis=1``, it returns three ``(2, 4)`` arrays. This function is an inverse of :func:`chainer.functions.stack`. Args: x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): Variable to be separated. A :math:`(s_1, s_2, ..., s_N)` -shaped float array. axis (int): Axis along which variables are separated. Returns: tuple of chainer.Variable: Output variables. .. seealso:: :func:`chainer.functions.stack` .. admonition:: Example >>> x = np.arange(6).reshape((2, 3)).astype('f') >>> x array([[ 0., 1., 2.], [ 3., 4., 5.]], dtype=float32) >>> x.shape (2, 3) >>> y = F.separate(x) # split along axis=0 >>> type(y) <class 'tuple'> >>> len(y) 2 >>> y[0].shape (3,) >>> y[0].data array([ 0., 1., 2.], dtype=float32) >>> y = F.separate(x, axis=1) >>> len(y) 3 >>> y[0].shape (2,) >>> y[0].data array([ 0., 3.], dtype=float32) """ shape = list(x.shape) del shape[axis] ys = split_axis.split_axis(x, x.shape[axis], axis, force_tuple=True) return tuple(reshape.reshape(y, shape) for y in ys)
def separate(x, axis=0): """Separates an array along a given axis. This function separates an array along a given axis. For example, shape of an array is ``(2, 3, 4)``. When it separates the array with ``axis=1``, it returns three ``(2, 4)`` arrays. This function is an inverse of :func:`chainer.functions.stack`. Args: x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): Variable to be separated. A :math:`(s_1, s_2, ..., s_N)` -shaped float array. axis (int): Axis along which variables are separated. Returns: tuple of chainer.Variable: Output variables. .. seealso:: :func:`chainer.functions.stack` .. admonition:: Example >>> x = np.arange(6).reshape((2, 3)).astype('f') >>> x array([[ 0., 1., 2.], [ 3., 4., 5.]], dtype=float32) >>> x.shape (2, 3) >>> y = F.separate(x) # split along axis=0 >>> isinstance(y, tuple) True >>> len(y) 2 >>> y[0].shape (3,) >>> y[0].data array([ 0., 1., 2.], dtype=float32) >>> y = F.separate(x, axis=1) >>> len(y) 3 >>> y[0].shape (2,) >>> y[0].data array([ 0., 3.], dtype=float32) """ shape = list(x.shape) del shape[axis] ys = split_axis.split_axis(x, x.shape[axis], axis, force_tuple=True) return tuple(reshape.reshape(y, shape) for y in ys)
def _one_directional_loop(di): # di=0, forward LSTM # di=1, backward LSTM h_list = [] c_list = [] layer_idx = direction * layer + di h = hx[layer_idx] c = cx[layer_idx] if di == 0: xs_list = xs_next else: xs_list = reversed(xs_next) for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None c_rest = None if layer != 0: x = dropout.dropout(x, ratio=dropout_ratio, train=train) lstm_in = linear.linear(x, xws[layer_idx], xbs[layer_idx]) + \ linear.linear(h, hws[layer_idx], hbs[layer_idx]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_list.append(h_bar) c_list.append(c_bar) return h, c, h_list, c_list
def __call__(self, x):
    x = self.embed(x)
    xs = split_axis.split_axis(x, x.data.shape[1], 1)
    ret = []
    for x in xs:
        for l in self.rnns:
            x = l(x)
            x = dropout.dropout(x, 0.25, self.train)
        for l in self.linears:
            x = l(x)
        x = reshape.reshape(x, x.data.shape + (-1, ))
        ret.append(x)
    ret = concat.concat(ret, axis=2)
    return ret
def __call__(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ if self.upward.has_uninitialized_params: in_size = x.size // x.shape[0] self.upward._initialize_params(in_size) self._initialize_params() batch = x.shape[0] lstm_in = self.upward(x) h_rest = None if self.h is not None: h_size = self.h.shape[0] if batch == 0: h_rest = self.h elif h_size < batch: msg = ('The batch size of x must be equal to or less than the ' 'size of the previous state h.') raise TypeError(msg) elif h_size > batch: h_update, h_rest = split_axis.split_axis(self.h, [batch], axis=0) lstm_in += self.lateral(h_update) else: lstm_in += self.lateral(self.h) if self.c is None: xp = self.xp self.c = variable.Variable(xp.zeros((batch, self.state_size), dtype=x.dtype), volatile='auto') # self.c, y = lstm.lstm(self.c, lstm_in) c, y = lstm.lstm(self.c, lstm_in) enable = (x.data != -1) self.c = where(enable, c, self.c) if self.h is not None: y = where(enable, y, self.h) if h_rest is None: self.h = y elif len(y.data) == 0: self.h = h_rest else: self.h = concat.concat([y, h_rest], axis=0) return y
def __call__(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ if self.upward.has_uninitialized_params: with cuda.get_device_from_id(self._device_id): in_size = x.size // x.shape[0] self.upward._initialize_params(in_size) self._initialize_params() batch = x.shape[0] lstm_in = self.upward(x) h_rest = None if self.h is not None: h_size = self.h.shape[0] if batch == 0: h_rest = self.h elif h_size < batch: msg = ('The batch size of x must be equal to or less than' 'the size of the previous state h.') raise TypeError(msg) elif h_size > batch: h_update, h_rest = split_axis.split_axis( self.h, [batch], axis=0) lstm_in += self.lateral(h_update) else: lstm_in += self.lateral(self.h) if self.c is None: xp = self.xp with cuda.get_device_from_id(self._device_id): self.c = variable.Variable( xp.zeros((batch, self.state_size), dtype=x.dtype), volatile='auto') self.c, y = lstm.lstm(self.c, lstm_in) if h_rest is None: self.h = y elif len(y.data) == 0: self.h = h_rest else: self.h = concat.concat([y, h_rest], axis=0) return y
def __call__(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ if self.upward.W.data is None: with cuda.get_device_from_id(self._device_id): in_size = functools.reduce(operator.mul, x.shape[1:], 1) self.upward._initialize_params(in_size) self._initialize_params() batch = x.shape[0] lstm_in = self.upward(x) h_rest = None if self.h is not None: h_size = self.h.shape[0] if batch == 0: h_rest = self.h elif h_size < batch: msg = ('The batch size of x must be equal to or less than' 'the size of the previous state h.') raise TypeError(msg) elif h_size > batch: h_update, h_rest = split_axis.split_axis(self.h, [batch], axis=0) lstm_in += self.lateral(h_update) else: lstm_in += self.lateral(self.h) if self.c is None: xp = self.xp with cuda.get_device_from_id(self._device_id): self.c = variable.Variable( xp.zeros((batch, self.state_size), dtype=x.dtype)) self.c, y = lstm.lstm(self.c, lstm_in) if h_rest is None: self.h = y elif len(y.data) == 0: self.h = h_rest else: self.h = concat.concat([y, h_rest], axis=0) return y
def forward(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ if self.upward.W.array is None: with chainer.using_device(self.device): in_size = utils.size_of_shape(x.shape[1:]) self.upward._initialize_params(in_size) self._initialize_params() batch = x.shape[0] lstm_in = self.upward(x) h_rest = None if self.h is not None: h_size = self.h.shape[0] if batch == 0: h_rest = self.h elif h_size < batch: msg = ('The batch size of x must be equal to or less than' 'the size of the previous state h.') raise TypeError(msg) elif h_size > batch: h_update, h_rest = split_axis.split_axis( self.h, [batch], axis=0) lstm_in += self.lateral(h_update) else: lstm_in += self.lateral(self.h) if self.c is None: with chainer.using_device(self.device): self.c = variable.Variable( self.xp.zeros((batch, self.state_size), dtype=x.dtype)) self.c, y = lstm.lstm(self.c, lstm_in) if h_rest is None: self.h = y elif len(y.array) == 0: self.h = h_rest else: self.h = concat.concat([y, h_rest], axis=0) return y
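# A hedged usage sketch for the stateful LSTM link whose `forward` is shown
# above, assuming the standard `chainer.links.LSTM` interface (sizes are
# arbitrary). The link keeps `c` and `h` between calls, so a sequence is fed
# one time step at a time and `reset_state()` starts a new sequence.
import numpy as np
import chainer.links as L

lstm = L.LSTM(in_size=4, out_size=8)
lstm.reset_state()
seq = [np.random.randn(2, 4).astype(np.float32) for _ in range(5)]
for x_t in seq:
    h_t = lstm(x_t)     # updates lstm.c and lstm.h internally
# h_t has shape (2, 8)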
def __call__(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ lstm_in = self.upward(x) if self.h is not None: lstm_in += self.lateral(self.h) else: xp = self.xp with cuda.get_device(self._device_id): self.h = variable.Variable( xp.zeros((len(x.data), self.state_size), dtype=x.data.dtype), volatile='auto') if self.c is None: xp = self.xp with cuda.get_device(self._device_id): self.c = variable.Variable( xp.zeros((len(x.data), self.state_size), dtype=x.data.dtype), volatile='auto') lstm_in = reshape.reshape(lstm_in, (len(lstm_in.data), lstm_in.data.shape[1] // 4, 4)) a, i, f, o = split_axis.split_axis(lstm_in, 4, 2) a = reshape.reshape(a, (len(a.data), self.state_size)) i = reshape.reshape(i, (len(i.data), self.state_size)) f = reshape.reshape(f, (len(f.data), self.state_size)) o = reshape.reshape(o, (len(o.data), self.state_size)) c_tmp = tanh.tanh(a) * sigmoid.sigmoid(i) + sigmoid.sigmoid(f) * self.c self.c = zoneout.zoneout(self.c, c_tmp, self.c_ratio, self.train) self.h = zoneout.zoneout(self.h, sigmoid.sigmoid(o) * tanh.tanh(c_tmp), self.h_ratio, self.train) return self.h
def __call__(self, h, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. h (~chainer.Variable): The list of the previous cell outputs. Returns: ~chainer.Variable: A list of the outputs (h) of the updated LSTM units over all the layers. """ h_list = [] h = split_axis.split_axis(h, self.num_layers, 1, True) h_curr = x for layer, h in six.moves.zip(self, h): h_curr = layer(h, h_curr) h_list.append(h_curr) return concat.concat(h_list, 1)
def forward(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ if self.upward.W.array is None: with chainer.using_device(self.device): in_size = utils.size_of_shape(x.shape[1:]) self.upward._initialize_params(in_size) self._initialize_params() batch = x.shape[0] lstm_in = x # lstm_in = self.upward(x) h_rest = None if self.h is not None: h_size = self.h.shape[0] if batch == 0: h_rest = self.h elif h_size < batch: msg = ('The batch size of x must be equal to or less than' 'the size of the previous state h.') raise TypeError(msg) elif h_size > batch: h_update, h_rest = split_axis.split_axis( self.h, [batch], axis=0) lstm_in += self.lateral(h_update) else: lstm_in += self.lateral(self.h) if self.c is None: with chainer.using_device(self.device): self.c = variable.Variable( self.xp.zeros((batch, self.state_size), dtype=x.dtype)) self.c, = lstm(self.c, lstm_in) return self.c
def __call__(self, batchsize): """.... Args: eps (~chainer.Variable): a wsize-length vector whose elements are drawn from normal distribution (mean = 0, std = 1). batchsize (~chainer.Variable): (batch size) * (number of truncated backward gradient calculation for a training dataset) Returns: ~chainer.Variable: Output of the linear layer. """ """ self.m_hat = reshape.reshape(sum.sum(self.M)/self.M.data.shape[0], (1,1)) M, m_hat = broadcast.broadcast(self.M, self.m_hat) self.s2_hat = sum.sum(self.S2 + (M - m_hat)*(M - m_hat))/self.M.data.shape[0] print('m_hat.data {}'.format(self.m_hat.data)) print('self.s2_hat.data {}'.format(self.s2_hat.data)) print('self.S2.data {}'.format(self.S2.data)) print('self.M.data {}'.format(self.M.data)) print('------------------') """ self.fWb, loss = adaptive_weight_noise(batchsize, self.M, self.logS2, self.use_weight_noise) if self.nobias: return reshape.reshape(self.fWb, (self.out_size, self.in_size)), loss else: self.fW, self.fb = split_axis.split_axis( self.fWb, numpy.asarray([(self.in_size - 1) * self.out_size]), axis=0) return reshape.reshape( self.fW, (self.out_size, self.in_size - 1)), self.fb, loss
def separate(x, axis=0):
    """Separates an array along a given axis.

    This function separates an array along a given axis. For example, shape of
    an array is ``(2, 3, 4)``. When it separates the array with ``axis=1``, it
    returns three ``(2, 4)`` arrays.

    This function is an inverse of :func:`chainer.functions.stack`.

    Args:
        x (chainer.Variable): Variable to be separated.
        axis (int): Axis along which variables are separated.

    Returns:
        tuple of chainer.Variable: Output variables.

    .. seealso:: :func:`chainer.functions.stack`

    """
    shape = list(x.shape)
    del shape[axis]
    ys = split_axis.split_axis(x, x.shape[axis], axis, force_tuple=True)
    return tuple(reshape.reshape(y, shape) for y in ys)
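# An illustrative round trip for `separate` (added as a sketch; this shorter
# docstring omits the example carried by the longer variants above).
# `separate` undoes `chainer.functions.stack`:
import numpy as np
import chainer.functions as F

x = np.arange(6).reshape((2, 3)).astype(np.float32)
ys = F.separate(x, axis=0)       # tuple of two (3,)-shaped variables
x_again = F.stack(ys, axis=0)    # stacks them back into a (2, 3) array
assert x_again.shape == x.shape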
def __call__(self, c, h): """Updates the internal state and returns the Cell outputs. Args: c (~chainer.Variable): The previous memory of the Grid cell. h (~chainer.Variable): The batched form of the previous state. Returns: tuple of ~chainer.Variable: Returns ``(c_new, h_new)``, where ``c_new`` represents new cell state, and ``h_new`` is updated output of LSTM units. """ assert h is not None assert c is not None c = split_axis.split_axis(c, self.out_indices, 1, True) h_list = [] h_curr = None for layer_id, layer in enumerate(self): h_curr = layer(c[layer_id], h) h_list.append(h_curr) h_new = concat.concat([x[1] for x in h_list], 1) c_new = concat.concat([x[0] for x in h_list], 1) return c_new, h_new
def __call__(self, x,Whx,Wmx,Wmh,Whm): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ # if self.upward.has_uninitialized_params: # in_size = x.size // x.shape[0] # self.upward._initialize_params(in_size) # self._initialize_params() # if self.upward2.has_uninitialized_params: # in_size = x.size // x.shape[0] # self.upward2._initialize_params(in_size) # self._initialize_params() batch = x.shape[0] # Whx = self.upward() # Wmx = self.upward2() factor_in = F.linear(x,Wmx) lstm_in = F.linear(x,Whx,self.b) h_rest = None if self.h is not None: h_size = self.h.shape[0] if batch == 0: h_rest = self.h elif h_size < batch: msg = ('The batch size of x must be equal to or less than the ' 'size of the previous state h.') raise TypeError(msg) elif h_size > batch: h_update, h_rest = split_axis.split_axis( self.h, [batch], axis=0) # Wmh = self.lateral1() mult_in = F.linear(h_update,Wmh) mult_out = mult_in*factor_in # Whm = self.lateral2() lstm_in += F.linear(mult_out,Whm) else: # Wmh = self.lateral1() mult_in = F.linear(self.h,Wmh) mult_out = mult_in*factor_in # Whm = self.lateral2() lstm_in += F.linear(mult_out,Whm) if self.c is None: xp = self.xp self.c = variable.Variable(xp.zeros((batch, self.state_size), dtype=x.dtype),volatile='auto') self.c, y = lstm.lstm(self.c, lstm_in) if h_rest is None: self.h = y elif len(y.data) == 0: self.h = h_rest else: self.h = concat.concat([y, h_rest], axis=0) return y
def n_step_lstm_base( n_layers, dropout_ratio, hx, cx, ws, bs, xs, train, use_cudnn, use_bi_direction): """Base function for Stack LSTM/BiLSTM functions. This function is used at :func:`chainer.functions.n_step_lstm` and :func:`chainer.functions.n_step_bilstm`. This function's behavior depends on following arguments, ``activation`` and ``use_bi_direction``. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is number of layers and is equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is dimention of hidden units. cx (chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents weights for i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` is corresponding with ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 4`` is ``(I, N)`` shape as they are multiplied with input variables. All other matrices has ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represnents biases for i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` is corresponding with ``b_j`` in the equation. Shape of each matrix is ``(N,)`` where ``N`` is dimention of hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is mini-batch size for time ``t``, and ``I`` is size of input units. Note that this functions supports variable length sequences. When sequneces has different lengths, sort sequences in descending order by length, and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transpose a list of :func:`~chainer.Variable` holding sequence. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. train (bool): If ``True``, this function executes dropout. use_cudnn (bool): If ``True``, this function uses cuDNN if available. use_bi_direction (bool): If ``True``, this function uses Bi-directional LSTM. Returns: tuple: This functions returns a tuple concaining three elements, ``hy``, ``cy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is same as ``hx``. - ``cy`` is an updated cell states whose shape is same as ``cx``. - ``ys`` is a list of :class:`~chainer.Variable` . Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is mini-batch size for time ``t``, and ``N`` is size of hidden units. Note that ``B_t`` is the same value as ``xs[t]``. .. 
seealso:: :func:`chainer.functions.n_step_lstm` :func:`chainer.functions.n_step_bilstm` """ xp = cuda.get_array_module(hx, hx.data) if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \ _cudnn_version >= 5000: states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple(itertools.chain( (hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) if use_bi_direction: rnn = NStepBiLSTM(n_layers, states, train=train) else: rnn = NStepLSTM(n_layers, states, train=train) ret = rnn(*inputs) hy, cy = ret[:2] ys = ret[2:] return hy, cy, ys else: direction = 2 if use_bi_direction else 1 split_size = n_layers * direction hx = split_axis.split_axis(hx, split_size, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, split_size, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] xs_next = xs hy = [] cy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward LSTM # di=1, backward LSTM h_list = [] c_list = [] layer_idx = direction * layer + di h = hx[layer_idx] c = cx[layer_idx] if di == 0: xs_list = xs_next else: xs_list = reversed(xs_next) for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None c_rest = None if layer != 0: x = dropout.dropout(x, ratio=dropout_ratio, train=train) lstm_in = linear.linear(x, xws[layer_idx], xbs[layer_idx]) + \ linear.linear(h, hws[layer_idx], hbs[layer_idx]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_list.append(h_bar) c_list.append(c_bar) return h, c, h_list, c_list h, c, h_forward, c_forward = _one_directional_loop(di=0) hy.append(h) cy.append(c) if use_bi_direction: # BiLSTM h, c, h_backward, c_backward = _one_directional_loop(di=1) hy.append(h) cy.append(c) h_backward.reverse() # concat xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in zip(h_forward, h_backward)] else: # Uni-directional RNN xs_next = h_forward ys = xs_next hy = stack.stack(hy) cy = stack.stack(cy) return hy, cy, tuple(ys)
def __call__(self, c=None, h=None, x=None, top_n=None): """Updates the internal state and returns the Cell outputs. Args: x (~chainer.Variable): A new batch from the input sequence. h (~chainer.Variable): The batched form of the previous state. Make sure that you pass the previous state if you use stateless RNN cells top_n (int): The number of cells from the top whose outputs you want (default: outputs of all GRUs are returned) When using stateless cells the states of all cells will be returned as they will be needed for the next step Returns: ~chainer.Variable: A concatenation of the outputs (h) of the updated cell units over the top N layers; by default all layers are considered. OR (~chainer.Variable, ~chainer.Variable): A tuple of concatenation of the outputs (h) and memories (c) of the updated cell units over the top N layers; by default all layers are considered. """ assert x is not None if top_n is None: top_n = self.num_layers if h is not None: assert top_n is self.num_layers h = split_axis.split_axis(h, self.num_layers, 1, True) if c is not None: c = split_axis.split_axis(c, self.num_layers, 1, True) h_list = [] h_curr = x for layer_id, layer in enumerate(self): layer_params = inspect.getargspec(layer)[0] if 'h' in layer_params: if 'c' in layer_params: if h is None: if c is None: h_curr = layer(None, None, h_curr) else: h_curr = layer(c[layer_id], None, h_curr) else: if c is None: h_curr = layer(None, h[layer_id], h_curr) else: h_curr = layer(c[layer_id], h[layer_id], h_curr) else: assert c is None if h is None: h_curr = layer(None, h_curr) else: h_curr = layer(h[layer_id], h_curr) else: assert c is None assert h is None h_curr = layer(h_curr) h_list.append(h_curr) if len(h_list[0]) == 2: h_out = concat.concat([y[1] for y in h_list[-top_n:]], 1) c_out = concat.concat([y[0] for y in h_list[-top_n:]], 1) return c_out, h_out else: return concat.concat(h_list[-top_n:], 1)
def n_step_lstm_base( n_layers, dropout_ratio, hx, cx, ws, bs, xs, use_bi_direction, **kwargs): """Base function for Stack LSTM/BiLSTM functions. This function is used at :func:`chainer.functions.n_step_lstm` and :func:`chainer.functions.n_step_bilstm`. This function's behavior depends on following arguments, ``activation`` and ``use_bi_direction``. Args: n_layers(int): The number of layers. dropout_ratio(float): Dropout ratio. hx (~chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is the number of layers and is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N`` is the dimension of the hidden units. cx (~chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``. ws (list of list of :class:`~chainer.Variable`): Weight matrices. ``ws[i]`` represents the weights for the i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` corresponds to :math:`W_j` in the equation. Only ``ws[0][j]`` where ``0 <= j < 4`` are ``(I, N)``-shape as they are multiplied with input variables, where ``I`` is the size of the input and ``N`` is the dimension of the hidden units. All other matrices are ``(N, N)``-shaped. bs (list of list of :class:`~chainer.Variable`): Bias vectors. ``bs[i]`` represents the biases for the i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` corresponds to :math:`b_j` in the equation. The shape of each matrix is ``(N,)``. xs (list of :class:`~chainer.Variable`): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the mini-batch size for time ``t``. The sequences must be transposed. :func:`~chainer.functions.transpose_sequence` can be used to transpose a list of :class:`~chainer.Variable`\\ s each representing a sequence. When sequences has different lengths, they must be sorted in descending order of their lengths before transposing. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. use_bi_direction (bool): If ``True``, this function uses Bi-directional LSTM. Returns: tuple: This functions returns a tuple concaining three elements, ``hy``, ``cy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is the same as ``hx``. - ``cy`` is an updated cell states whose shape is the same as ``cx``. - ``ys`` is a list of :class:`~chainer.Variable` . Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is the mini-batch size for time ``t``. Note that ``B_t`` is the same value as ``xs[t]``. .. seealso:: :func:`chainer.functions.n_step_lstm` :func:`chainer.functions.n_step_bilstm` """ argument.check_unexpected_kwargs( kwargs, train='train argument is not supported anymore. ' 'Use chainer.using_config', use_cudnn='use_cudnn argument is not supported anymore. 
' 'Use chainer.using_config') argument.assert_kwargs_empty(kwargs) xp = cuda.get_array_module(hx, hx.data) if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000): states = get_random_state().create_dropout_states(dropout_ratio) lengths = [len(x) for x in xs] xs = chainer.functions.concat(xs, axis=0) # flatten all input variables inputs = tuple(itertools.chain( (hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), (xs,))) if use_bi_direction: rnn = NStepBiLSTM else: rnn = NStepLSTM hy, cy, ys = rnn(n_layers, states, lengths)(*inputs) sections = numpy.cumsum(lengths[:-1]) ys = chainer.functions.split_axis(ys, sections, 0) return hy, cy, ys else: direction = 2 if use_bi_direction else 1 split_size = n_layers * direction hx = split_axis.split_axis(hx, split_size, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, split_size, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] xs_next = xs hy = [] cy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward LSTM # di=1, backward LSTM h_list = [] c_list = [] layer_idx = direction * layer + di h = hx[layer_idx] c = cx[layer_idx] if di == 0: xs_list = xs_next else: xs_list = reversed(xs_next) for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None c_rest = None if layer != 0: x = dropout.dropout(x, ratio=dropout_ratio) lstm_in = linear.linear(x, xws[layer_idx], xbs[layer_idx]) + \ linear.linear(h, hws[layer_idx], hbs[layer_idx]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_list.append(h_bar) c_list.append(c_bar) return h, c, h_list, c_list h, c, h_forward, c_forward = _one_directional_loop(di=0) hy.append(h) cy.append(c) if use_bi_direction: # BiLSTM h, c, h_backward, c_backward = _one_directional_loop(di=1) hy.append(h) cy.append(c) h_backward.reverse() # concat xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in zip(h_forward, h_backward)] else: # Uni-directional RNN xs_next = h_forward ys = xs_next hy = stack.stack(hy) cy = stack.stack(cy) return hy, cy, tuple(ys)
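# A usage sketch for the n-step LSTM built on the base function above, shown
# through the `chainer.links.NStepLSTM` wrapper rather than the raw function
# (an assumption made to avoid hand-building the 8-matrix weight layout;
# sizes are arbitrary). Sequences of different lengths are passed as a list
# of per-sequence arrays.
import numpy as np
import chainer.links as L

n_layers, in_size, n_units = 1, 3, 5
rnn = L.NStepLSTM(n_layers, in_size, n_units, dropout=0.0)
xs = [np.random.randn(length, in_size).astype(np.float32) for length in (4, 2)]
hy, cy, ys = rnn(None, None, xs)   # None means zero initial states
# hy.shape == cy.shape == (n_layers, 2, n_units)
# ys[i].shape == (len(xs[i]), n_units)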
def crf1d(cost, xs, ys, reduce='mean'): """Calculates negative log-likelihood of linear-chain CRF. It takes a transition cost matrix, a sequence of costs, and a sequence of labels. Let :math:`c_{st}` be a transition cost from a label :math:`s` to a label :math:`t`, :math:`x_{it}` be a cost of a label :math:`t` at position :math:`i`, and :math:`y_i` be an expected label at position :math:`i`. The negative log-likelihood of linear-chain CRF is defined as .. math:: L = -\\left( \\sum_{i=1}^l x_{iy_i} + \\ \\sum_{i=1}^{l-1} c_{y_i y_{i+1}} - {\\log(Z)} \\right) , where :math:`l` is the length of the input sequence and :math:`Z` is the normalizing constant called partition function. .. note:: When you want to calculate the negative log-likelihood of sequences which have different lengths, sort the sequences in descending order of lengths and transpose the sequences. For example, you have three input sequences: >>> a1 = a2 = a3 = a4 = np.random.uniform(-1, 1, 3).astype(np.float32) >>> b1 = b2 = b3 = np.random.uniform(-1, 1, 3).astype(np.float32) >>> c1 = c2 = np.random.uniform(-1, 1, 3).astype(np.float32) >>> a = [a1, a2, a3, a4] >>> b = [b1, b2, b3] >>> c = [c1, c2] where ``a1`` and all other variables are arrays with ``(K,)`` shape. Make a transpose of the sequences: >>> x1 = np.stack([a1, b1, c1]) >>> x2 = np.stack([a2, b2, c2]) >>> x3 = np.stack([a3, b3]) >>> x4 = np.stack([a4]) and make a list of the arrays: >>> xs = [x1, x2, x3, x4] You need to make label sequences in the same fashion. And then, call the function: >>> cost = chainer.Variable( ... np.random.uniform(-1, 1, (3, 3)).astype(np.float32)) >>> ys = [np.zeros(x.shape[0:1], dtype=np.int32) for x in xs] >>> loss = F.crf1d(cost, xs, ys) It calculates mean of the negative log-likelihood of the three sequences. The output is a variable whose value depends on the value of the option ``reduce``. If it is ``'no'``, it holds the elementwise loss values. If it is ``'mean'``, it holds mean of the loss values. Args: cost (Variable): A :math:`K \\times K` matrix which holds transition cost between two labels, where :math:`K` is the number of labels. xs (list of Variable): Input vector for each label. ``len(xs)`` denotes the length of the sequence, and each :class:`~chainer.Variable` holds a :math:`B \\times K` matrix, where :math:`B` is mini-batch size, :math:`K` is the number of labels. Note that :math:`B`\\ s in all the variables are not necessary the same, i.e., it accepts the input sequences with different lengths. ys (list of Variable): Expected output labels. It needs to have the same length as ``xs``. Each :class:`~chainer.Variable` holds a :math:`B` integer vector. When ``x`` in ``xs`` has the different :math:`B`, correspoding ``y`` has the same :math:`B`. In other words, ``ys`` must satisfy ``ys[i].shape == xs[i].shape[0:1]`` for all ``i``. reduce (str): Reduction option. Its value must be either ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError` is raised. Returns: ~chainer.Variable: A variable holding the average negative log-likelihood of the input sequences. .. note:: See detail in the original paper: `Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data <https://repository.upenn.edu/cis_papers/159/>`_. 
""" if reduce not in ('mean', 'no'): raise ValueError( "only 'mean' and 'no' are valid for 'reduce', but '%s' is " 'given' % reduce) assert xs[0].shape[1] == cost.shape[0] n_label = cost.shape[0] n_batch = xs[0].shape[0] alpha = xs[0] alphas = [] for x in xs[1:]: batch = x.shape[0] if alpha.shape[0] > batch: alpha, alpha_rest = split_axis.split_axis(alpha, [batch], axis=0) alphas.append(alpha_rest) b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost) alpha = logsumexp.logsumexp(b_alpha + b_cost, axis=1) + x if len(alphas) > 0: alphas.append(alpha) alpha = concat.concat(alphas[::-1], axis=0) logz = logsumexp.logsumexp(alpha, axis=1) cost = reshape.reshape(cost, (cost.size, 1)) score = select_item.select_item(xs[0], ys[0]) scores = [] for x, y, y_prev in zip(xs[1:], ys[1:], ys[:-1]): batch = x.shape[0] if score.shape[0] > batch: y_prev, _ = split_axis.split_axis(y_prev, [batch], axis=0) score, score_rest = split_axis.split_axis(score, [batch], axis=0) scores.append(score_rest) score += (select_item.select_item(x, y) + reshape.reshape( embed_id.embed_id(y_prev * n_label + y, cost), (batch,))) if len(scores) > 0: scores.append(score) score = concat.concat(scores[::-1], axis=0) loss = logz - score if reduce == 'mean': return _sum.sum(loss) / n_batch else: return loss
def _one_directional_loop(di): # di=0, forward GRU # di=1, backward GRU xs_list = xs_next if di == 0 else reversed(xs_next) layer_idx = direction * layer + di h = h0[layer_idx] # h:d_bar_s_1 # h_bar:d_s ''' print(len(xs_list)) print(len(xs_list[0])) print(len(xs_list[0][0])) ''' h_list = [] h_bar_list = [] c_s_list = [] z_s_list = [] for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) else: h_rest = None if layer > 0: x = dropout.dropout(x, ratio=dropout_ratio) gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx]) gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx]) W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1) U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1) r = sigmoid.sigmoid(W_r_x + U_r_h) z = sigmoid.sigmoid(W_z_x + U_z_h) h_bar = tanh.tanh(W_x + r * U_x) h_bar = (1 - z) * h_bar + z * h phi_d = linear.linear(h_bar, W2, B2) ''' print(type(phi_ht), len(phi_ht)) print(type(phi_ht[0]), len(phi_ht[0])) print(type(phi_ht[0][0]), len(phi_ht[0][0])) print(type(phi_d), len(phi_d)) print(type(phi_d[0]), len(phi_d[0]), phi_d[0].shape) ''' #phi_ht_len = [t.shape[1] for t in phi_ht] #phi_ht_section = np.cumsum(phi_ht_len[:-1]) #concat_phi_ht = F.concat(phi_ht, axis=1) #concat_phi_d = [F.concat([phi_d[i]]*phi_ht_len[i], axis=0) for i in range(batch)] #concat_phi_d = F.concat(concat_phi_d, axis=0) #concat_phi_d = F.concat(F.transpose(phi_d), axis=0) u_st = list( map( lambda x, y: reshape.reshape((linear.linear( x, reshape.reshape(y, (1, len(y))))), (len(x), )), phi_ht, phi_d)) #(4) sum_u = list(map(F.sum, u_st)) alpha_st = list( map(lambda x, y: x / F.broadcast_to(y, x.shape), u_st, sum_u)) #(3) z_s = list(map(F.argmax, alpha_st)) z_s = list(map(lambda x: F.broadcast_to(x, (1, )), z_s)) z_s = F.concat(z_s, axis=0) ''' print(type(alpha_st),len(alpha_st)) print(type(alpha_st[0]),len(alpha_st[0])) print(alpha_st[0].shape) print(ht[0].shape) ''' c_s = list( map( lambda x, y: F.sum(F.broadcast_to( reshape.reshape(x, (x.shape[0], 1)), y.shape) * y, axis=0), alpha_st, ht)) #(2) c_s_2d = list( map(lambda x: reshape.reshape(x, (1, len(x))), c_s)) concat_c_s = F.concat(c_s_2d, axis=0) c_s = list( map(lambda x: F.broadcast_to(x, (1, len(x))), c_s)) c_s = F.concat(c_s, axis=0) ''' print(type(c_s), len(c_s)) print(type(c_s[0]), len(c_s[0]), c_s[0].shape) ''' h = F.relu( linear.linear(F.concat([concat_c_s, h_bar], axis=1), W3, B3)) h_list.append(h) h_bar_list.append(h_bar) c_s_list.append(c_s) z_s_list.append(z_s) #単語数の違いを担保 if h_rest is not None: h = concat.concat([h, h_rest], axis=0) h_bar = concat.concat([h_bar, h_rest], axis=0) return h_list, h_bar_list, c_s_list, z_s_list
def n_step_lstm(n_layers, dropout_ratio, hx, cx, ws, bs, xs, train=True, use_cudnn=True): """Stacked Long Short-Term Memory function for sequence inputs. This function calculates stacked LSTM with sequences. This function gets an initial hidden state :math:`h_0`, an initial cell state :math:`c_0`, an input sequence :math:`x`, weight matrices :math:`W`, and bias vectors :math:`b`. This function calculates hidden states :math:`h_t` and :math:`c_t` for each time :math:`t` from input :math:`x_t`. .. math:: i_t = \sigma(W_0 x_t + W_4 h_{t-1} + b_0 + b_4) f_t = \sigma(W_1 x_t + W_5 h_{t-1} + b_1 + b_5) o_t = \sigma(W_2 x_t + W_6 h_{t-1} + b_2 + b_6) a_t = \tanh(W_3 x_t + W_7 h_{t-1} + b_3 + b_7) c_t = f_t \dot c_{t-1} + i_t \dot a_t h_t = o_t \dot \tanh(c_t) As the function accepts a sequence, it calculates :math:`h_t` for all :math:`t` with one call. Eight weight matrices and eight bias vectors are required for each layers. So, when :math:`S` layers exists, you need to prepare :math:`8S` weigth matrices and :math:`8S` bias vectors. If the number of layers ``n_layers`` is greather than :math:`1`, input of ``k``-th layer is hidden state ``h_t`` of ``k-1``-th layer. Note that all input variables except first layer may have different shape from the first layer. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is number of layers and is equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is dimention of hidden units. cx (chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents weights for i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` is corresponding with ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 4`` is ``(I, N)`` shape as they are multiplied with input variables. All other matrices has ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represnents biases for i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` is corresponding with ``b_j`` in the equation. Shape of each matrix is ``(N,)`` where ``N`` is dimention of hidden units. xs (list of chainer.Variable): A list of :class:`chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is mini-batch size for time ``t``, and ``I`` is size of input units. Note that this functions supports variable length sequences. When sequneces has different lengths, sort sequences in descending order by length, and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transpose a list of :func:`~chainer.Variable` holding sequence. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. train (bool): If ``True``, this function executes dropout. use_cudnn (bool): If ``True``, this function uses cuDNN if available. Returns: tuple: This functions returns a tuple concaining three elements, ``hy``, ``cy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is same as ``hx``. - ``cy`` is an updated cell states whose shape is same as ``cx``. - ``ys`` is a list of :class:~chainer.Variable. Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is mini-batch size for time ``t``, and ``N`` is size of hidden units. 
Note that ``B_t`` is the same value as ``xs[t]``. .. seealso:: :func:`chainer.functions.lstm` """ xp = cuda.get_array_module(hx, hx.data) if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \ _cudnn_version >= 5000: states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple( itertools.chain((hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) rnn = NStepLSTM(n_layers, states, train=train) ret = rnn(*inputs) hy, cy = ret[:2] ys = ret[2:] return hy, cy, ys else: hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] ys = [] for x in xs: batch = x.shape[0] h_next = [] c_next = [] for layer in six.moves.range(n_layers): h = hx[layer] c = cx[layer] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None x = dropout.dropout(x, ratio=dropout_ratio, train=train) h = dropout.dropout(h, ratio=dropout_ratio, train=train) lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \ linear.linear(h, hws[layer], hbs[layer]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_next.append(h) c_next.append(c) x = h_bar hx = h_next cx = c_next ys.append(x) hy = stack.stack(hx) cy = stack.stack(cx) return hy, cy, tuple(ys)
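# A hedged usage sketch of n_step_lstm's non-cuDNN branch with hypothetical
# sizes, assuming the Chainer v1-era environment this snippet targets
# (train/use_cudnn arguments). Weights are laid out as (out, in) so that the
# stacked matrices fit linear.linear, which computes x W^T.
import numpy as np
import chainer

n_layers, in_size, units = 1, 3, 4
lengths = [3, 2, 1]   # per-step mini-batch sizes, sorted in descending order
xs = [chainer.Variable(np.random.randn(b, in_size).astype('f')) for b in lengths]
hx = chainer.Variable(np.zeros((n_layers, lengths[0], units), dtype='f'))
cx = chainer.Variable(np.zeros((n_layers, lengths[0], units), dtype='f'))
# eight weights/biases per layer: w[0..3] multiply x, w[4..7] multiply h
ws = [[chainer.Variable(0.1 * np.random.randn(units, in_size if j < 4 else units).astype('f'))
       for j in range(8)]]
bs = [[chainer.Variable(np.zeros(units, dtype='f')) for _ in range(8)]]
hy, cy, ys = n_step_lstm(n_layers, 0.0, hx, cx, ws, bs, xs)
# hy, cy: (n_layers, 3, units); each ys[t]: (lengths[t], units)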
def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction, **kwargs): """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction) Base function for stacked GRU/BiGRU functions. This function is used at :func:`chainer.functions.n_step_bigru` and :func:`chainer.functions.n_step_gru`. This function's behavior depends on the argument ``use_bi_direction``. .. warning:: ``train`` and ``use_cudnn`` arguments are not supported anymore since v2. Instead, use ``chainer.using_config('train', train)`` and ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. See :func:`chainer.using_config`. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is the number of layers and is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N`` is the dimension of hidden units. When ``use_bi_direction`` is ``True``, the length of the first dimension is ``2S`` instead. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents the weights for the i-th layer. Each ``ws[i]`` is a list containing six matrices. ``ws[i][j]`` corresponds to ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 3`` are ``(I, N)``-shaped, as they are multiplied with input variables. All other matrices have ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represents the biases for the i-th layer. Each ``bs[i]`` is a list containing six vectors. ``bs[i][j]`` corresponds to ``b_j`` in the equation. The shape of each vector is ``(N,)`` where ``N`` is the dimension of hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds the input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the mini-batch size for time ``t``, and ``I`` is the size of input units. Note that this function supports variable-length sequences. When sequences have different lengths, sort them in descending order by length and transpose the sorted sequences. :func:`~chainer.functions.transpose_sequence` transposes a list of :class:`~chainer.Variable` holding sequences, so ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. use_bi_direction (bool): If ``True``, this function uses a bi-directional GRU. .. seealso:: :func:`chainer.functions.n_step_rnn` :func:`chainer.functions.n_step_birnn` """ # NOQA argument.check_unexpected_kwargs( kwargs, train='train argument is not supported anymore. ' 'Use chainer.using_config', use_cudnn='use_cudnn argument is not supported anymore. 
' 'Use chainer.using_config') argument.assert_kwargs_empty(kwargs) xp = cuda.get_array_module(hx, hx.data) if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000): states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple( itertools.chain((hx, ), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) if use_bi_direction: rnn = NStepBiGRU(n_layers, states) else: rnn = NStepGRU(n_layers, states) ret = rnn(*inputs) hy, = ret[:1] ys = ret[1:] return hy, ys else: direction = 2 if use_bi_direction else 1 hx = split_axis.split_axis(hx, n_layers * direction, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] xws = [concat.concat([w[0], w[1], w[2]], axis=0) for w in ws] hws = [concat.concat([w[3], w[4], w[5]], axis=0) for w in ws] xbs = [concat.concat([b[0], b[1], b[2]], axis=0) for b in bs] hbs = [concat.concat([b[3], b[4], b[5]], axis=0) for b in bs] xs_next = xs hy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward GRU # di=1, backward GRU xs_list = xs_next if di == 0 else reversed(xs_next) layer_idx = direction * layer + di h = hx[layer_idx] h_list = [] for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) else: h_rest = None if layer > 0: x = dropout.dropout(x, ratio=dropout_ratio) gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx]) gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx]) W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1) U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1) r = sigmoid.sigmoid(W_r_x + U_r_h) z = sigmoid.sigmoid(W_z_x + U_z_h) h_bar = tanh.tanh(W_x + r * U_x) h_bar = (1 - z) * h_bar + z * h if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) else: h = h_bar h_list.append(h_bar) return h, h_list # Forward GRU h, h_forward = _one_directional_loop(di=0) hy.append(h) if use_bi_direction: # Backward GRU h, h_backward = _one_directional_loop(di=1) h_backward.reverse() # Concat xs_next = [ concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in six.moves.zip(h_forward, h_backward) ] hy.append(h) else: # Uni-directional GRU xs_next = h_forward ys = xs_next hy = stack.stack(hy) return hy, tuple(ys)
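# A hedged usage sketch of the non-cuDNN path of n_step_gru_base above, with
# hypothetical sizes: six weights/biases per layer, w[0..2] acting on x and
# w[3..5] on h, laid out as (out, in) for linear.linear.
import numpy as np
import chainer

n_layers, in_size, units = 1, 3, 4
xs = [chainer.Variable(np.random.randn(2, in_size).astype('f')) for _ in range(5)]
hx = chainer.Variable(np.zeros((n_layers, 2, units), dtype='f'))
ws = [[chainer.Variable(0.1 * np.random.randn(units, in_size if j < 3 else units).astype('f'))
       for j in range(6)]]
bs = [[chainer.Variable(np.zeros(units, dtype='f')) for _ in range(6)]]
hy, ys = n_step_gru_base(n_layers, 0.0, hx, ws, bs, xs, use_bi_direction=False)
# hy: (n_layers, 2, units); each ys[t]: (2, units)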
def crf1d(cost, xs, ys, reduce='mean'): """Calculates negative log-likelihood of linear-chain CRF. It takes a transition cost matrix, a sequence of costs, and a sequence of labels. Let :math:`c_{st}` be a transition cost from a label :math:`s` to a label :math:`t`, :math:`x_{it}` be a cost of a label :math:`t` at position :math:`i`, and :math:`y_i` be an expected label at position :math:`i`. The negative log-likelihood of linear-chain CRF is defined as .. math:: L = -\\left( \\sum_{i=1}^l x_{iy_i} + \\ \\sum_{i=1}^{l-1} c_{y_i y_{i+1}} - {\\log(Z)} \\right) , where :math:`l` is the length of the input sequence and :math:`Z` is the normalizing constant called partition function. .. note:: When you want to calculate the negative log-likelihood of sequences which have different lengths, sort the sequences in descending order of lengths and transpose the sequences. For example, you have three input sequences: >>> a1 = a2 = a3 = a4 = np.random.uniform(-1, 1, 3).astype('f') >>> b1 = b2 = b3 = np.random.uniform(-1, 1, 3).astype('f') >>> c1 = c2 = np.random.uniform(-1, 1, 3).astype('f') >>> a = [a1, a2, a3, a4] >>> b = [b1, b2, b3] >>> c = [c1, c2] where ``a1`` and all other variables are arrays with ``(K,)`` shape. Make a transpose of the sequences: >>> x1 = np.stack([a1, b1, c1]) >>> x2 = np.stack([a2, b2, c2]) >>> x3 = np.stack([a3, b3]) >>> x4 = np.stack([a4]) and make a list of the arrays: >>> xs = [x1, x2, x3, x4] You need to make label sequences in the same fashion. And then, call the function: >>> cost = chainer.Variable( ... np.random.uniform(-1, 1, (3, 3)).astype('f')) >>> ys = [np.zeros(x.shape[0:1], dtype='i') for x in xs] >>> loss = F.crf1d(cost, xs, ys) It calculates mean of the negative log-likelihood of the three sequences. The output is a variable whose value depends on the value of the option ``reduce``. If it is ``'no'``, it holds the elementwise loss values. If it is ``'mean'``, it holds mean of the loss values. Args: cost (Variable): A :math:`K \\times K` matrix which holds transition cost between two labels, where :math:`K` is the number of labels. xs (list of Variable): Input vector for each label. ``len(xs)`` denotes the length of the sequence, and each :class:`~chainer.Variable` holds a :math:`B \\times K` matrix, where :math:`B` is mini-batch size, :math:`K` is the number of labels. Note that :math:`B` s in all the variables are not necessary the same, i.e., it accepts the input sequences with different lengths. ys (list of Variable): Expected output labels. It needs to have the same length as ``xs``. Each :class:`~chainer.Variable` holds a :math:`B` integer vector. When ``x`` in ``xs`` has the different :math:`B`, correspoding ``y`` has the same :math:`B`. In other words, ``ys`` must satisfy ``ys[i].shape == xs[i].shape[0:1]`` for all ``i``. reduce (str): Reduction option. Its value must be either ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError` is raised. Returns: ~chainer.Variable: A variable holding the average negative log-likelihood of the input sequences. .. note:: See detail in the original paper: `Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data <http://repository.upenn.edu/cis_papers/159/>`_. 
""" if reduce not in ('mean', 'no'): raise ValueError( "only 'mean' and 'no' are valid for 'reduce', but '%s' is " 'given' % reduce) assert xs[0].shape[1] == cost.shape[0] n_label = cost.shape[0] n_batch = xs[0].shape[0] alpha = xs[0] alphas = [] for x in xs[1:]: batch = x.shape[0] if alpha.shape[0] > batch: alpha, alpha_rest = split_axis.split_axis(alpha, [batch], axis=0) alphas.append(alpha_rest) b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost) alpha = logsumexp.logsumexp(b_alpha + b_cost, axis=1) + x if len(alphas) > 0: alphas.append(alpha) alpha = concat.concat(alphas[::-1], axis=0) logz = logsumexp.logsumexp(alpha, axis=1) cost = reshape.reshape(cost, (cost.size, 1)) score = select_item.select_item(xs[0], ys[0]) scores = [] for x, y, y_prev in zip(xs[1:], ys[1:], ys[:-1]): batch = x.shape[0] if score.shape[0] > batch: y_prev, _ = split_axis.split_axis(y_prev, [batch], axis=0) score, score_rest = split_axis.split_axis(score, [batch], axis=0) scores.append(score_rest) score += ( select_item.select_item(x, y) + reshape.reshape(embed_id.embed_id(y_prev * n_label + y, cost), (batch, ))) if len(scores) > 0: scores.append(score) score = concat.concat(scores[::-1], axis=0) loss = logz - score if reduce == 'mean': return _sum.sum(loss) / n_batch else: return loss
def n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation, use_bi_direction, **kwargs): """n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation, use_bi_direction) Base function for Stack RNN/BiRNN functions. This function is used at :func:`chainer.functions.n_step_birnn` and :func:`chainer.functions.n_step_rnn`. This function's behavior depends on following arguments, ``activation`` and ``use_bi_direction``. .. warning:: ``train`` and ``use_cudnn`` arguments are not supported anymore since v2. Instead, use ``chainer.using_config('train', train)`` and ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. See :func:`chainer.using_config`. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is number of layers and is equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is dimention of hidden units. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents weights for i-th layer. Each ``ws[i]`` is a list containing two matrices. ``ws[i][j]`` is corresponding with ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 1`` is ``(I, N)`` shape as they are multiplied with input variables. All other matrices has ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represnents biases for i-th layer. Each ``bs[i]`` is a list containing two vectors. ``bs[i][j]`` is corresponding with ``b_j`` in the equation. Shape of each matrix is ``(N,)`` where ``N`` is dimention of hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is mini-batch size for time ``t``, and ``I`` is size of input units. Note that this functions supports variable length sequences. When sequneces has different lengths, sort sequences in descending order by length, and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transpose a list of :func:`~chainer.Variable` holding sequence. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. activation (str): Activation function name. Please select ``tanh`` or ``relu``. use_bi_direction (bool): If ``True``, this function uses Bi-directional RNN. Returns: tuple: This functions returns a tuple concaining three elements, ``hy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is same as ``hx``. - ``ys`` is a list of :class:`~chainer.Variable` . Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is mini-batch size for time ``t``, and ``N`` is size of hidden units. Note that ``B_t`` is the same value as ``xs[t]``. .. seealso:: :func:`chainer.functions.n_step_rnn` :func:`chainer.functions.n_step_birnn` """ # NOQA argument.check_unexpected_kwargs( kwargs, train='train argument is not supported anymore. ' 'Use chainer.using_config', use_cudnn='use_cudnn argument is not supported anymore. ' 'Use chainer.using_config') argument.assert_kwargs_empty(kwargs) activation_list = ['tanh', 'relu'] if activation not in activation_list: candidate = ','.join(activation_list) raise ValueError('Invalid activation: "%s". 
Please select from [%s]' % (activation, candidate)) xp = cuda.get_array_module(hx) if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000): states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple(itertools.chain( (hx, ), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) if use_bi_direction: # Bi-directional RNN if activation == 'tanh': rnn = NStepBiRNNTanh(n_layers, states) elif activation == 'relu': rnn = NStepBiRNNReLU(n_layers, states) else: # Uni-directional RNN if activation == 'tanh': rnn = NStepRNNTanh(n_layers, states) elif activation == 'relu': rnn = NStepRNNReLU(n_layers, states) ret = rnn(*inputs) hy, = ret[:1] ys = ret[1:] return hy, ys else: direction = 2 if use_bi_direction else 1 hx = split_axis.split_axis(hx, n_layers * direction, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] xws = [_stack_weight([w[0]]) for w in ws] hws = [_stack_weight([w[1]]) for w in ws] xbs = [_stack_weight([b[0]]) for b in bs] hbs = [_stack_weight([b[1]]) for b in bs] xs_next = xs hy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward RNN # di=1, backward RNN xs_list = xs_next if di == 0 else reversed(xs_next) layer_idx = direction * layer + di h = hx[layer_idx] h_list = [] for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) else: h_rest = None if layer > 0: x = dropout.dropout(x, ratio=dropout_ratio) rnn_in = (linear.linear(x, xws[layer_idx], xbs[layer_idx]) + linear.linear(h, hws[layer_idx], hbs[layer_idx])) if activation == 'tanh': h_bar = tanh.tanh(rnn_in) elif activation == 'relu': h_bar = relu.relu(rnn_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) else: h = h_bar h_list.append(h_bar) return h, h_list # Forward RNN h, h_forward = _one_directional_loop(di=0) hy.append(h) if use_bi_direction: # Backward RNN h, h_backward = _one_directional_loop(di=1) h_backward.reverse() # Concat xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in six.moves.zip(h_forward, h_backward)] hy.append(h) else: # Uni-directional RNN xs_next = h_forward ys = xs_next hy = stack.stack(hy) return hy, tuple(ys)
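# A hedged usage sketch of the non-cuDNN path of n_step_rnn_base above, with
# hypothetical sizes: two weights/biases per layer, w[0]/b[0] acting on x and
# w[1]/b[1] on h, and a tanh activation.
import numpy as np
import chainer

n_layers, in_size, units = 1, 3, 4
xs = [chainer.Variable(np.random.randn(2, in_size).astype('f')) for _ in range(5)]
hx = chainer.Variable(np.zeros((n_layers, 2, units), dtype='f'))
ws = [[chainer.Variable(0.1 * np.random.randn(units, in_size).astype('f')),
       chainer.Variable(0.1 * np.random.randn(units, units).astype('f'))]]
bs = [[chainer.Variable(np.zeros(units, dtype='f')) for _ in range(2)]]
hy, ys = n_step_rnn_base(n_layers, 0.0, hx, ws, bs, xs,
                         activation='tanh', use_bi_direction=False)
# hy: (n_layers, 2, units); each ys[t]: (2, units)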
def fixed_length_n_step_lstm( n_layers, dropout_ratio, hx, cx, ws, bs, xs, train=True, ): xp = cuda.get_array_module(hx, hx.data) if xp is not numpy and cuda.cudnn_enabled and _cudnn_version >= 5000: states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple( itertools.chain((hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), (xs, ))) rnn = FixedLengthNStepLSTMFunction(n_layers, states, train=train) ret = rnn(*inputs) hy, cy, ys = ret _, batch_size, dim = hy.shape ys_reshape = F.reshape(ys, (-1, batch_size, dim)) # (length, batch, dim) return hy, cy, ys_reshape else: hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] ys = [] for x in xs: batch = x.shape[0] h_next = [] c_next = [] for layer in six.moves.range(n_layers): h = hx[layer] c = cx[layer] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None x = dropout.dropout(x, ratio=dropout_ratio) h = dropout.dropout(h, ratio=dropout_ratio) lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \ linear.linear(h, hws[layer], hbs[layer]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_next.append(h) c_next.append(c) x = h_bar hx = h_next cx = c_next ys.append(x) hy = stack.stack(hx) cy = stack.stack(cx) #return hy, cy, tuple(ys) ys_concat = F.concat(ys, axis=0) ys_reshape = F.reshape( ys_concat, (-1, ys[0].shape[0], ys[0].shape[1])) # (length, batch, dim) return hy, cy, ys_reshape
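# A hedged sketch of the output packaging that distinguishes this function
# from n_step_lstm: for equal-length sequences, the per-step outputs ys[t] of
# shape (batch, dim) are concatenated along the batch axis and reshaped into
# one (length, batch, dim) tensor, keeping the time-major ordering.
import numpy as np
import chainer.functions as F

length, batch, dim = 3, 2, 4
ys = [np.random.randn(batch, dim).astype('f') for _ in range(length)]
ys_reshape = F.reshape(F.concat(ys, axis=0), (-1, batch, dim))
assert ys_reshape.shape == (length, batch, dim)
assert np.allclose(ys_reshape.data[1], ys[1])   # time step t lands at index t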
def n_step_lstm_base(n_layers, dropout_ratio, hx, cx, ws, bs, xs, train, use_cudnn, use_bi_direction): """Base function for stacked LSTM/BiLSTM functions. This function is used at :func:`chainer.functions.n_step_lstm` and :func:`chainer.functions.n_step_bilstm`. This function's behavior depends on the argument ``use_bi_direction``. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is the number of layers and is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N`` is the dimension of hidden units. cx (chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents the weights for the i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` corresponds to ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 4`` are ``(I, N)``-shaped, as they are multiplied with input variables. All other matrices have ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represents the biases for the i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` corresponds to ``b_j`` in the equation. The shape of each vector is ``(N,)`` where ``N`` is the dimension of hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds the input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the mini-batch size for time ``t``, and ``I`` is the size of input units. Note that this function supports variable-length sequences. When sequences have different lengths, sort them in descending order by length and transpose the sorted sequences. :func:`~chainer.functions.transpose_sequence` transposes a list of :class:`~chainer.Variable` holding sequences, so ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. train (bool): If ``True``, this function executes dropout. use_cudnn (bool): If ``True``, this function uses cuDNN if available. use_bi_direction (bool): If ``True``, this function uses a bi-directional LSTM. Returns: tuple: This function returns a tuple containing three elements, ``hy``, ``cy`` and ``ys``. - ``hy`` is the updated hidden states whose shape is the same as ``hx``. - ``cy`` is the updated cell states whose shape is the same as ``cx``. - ``ys`` is a list of :class:`~chainer.Variable`. Each element ``ys[t]`` holds the hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is the mini-batch size for time ``t``, and ``N`` is the size of hidden units. Note that ``B_t`` is the same value as the mini-batch size of ``xs[t]``. ..
seealso:: :func:`chainer.functions.n_step_lstm` :func:`chainer.functions.n_step_bilstm` """ xp = cuda.get_array_module(hx, hx.data) if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \ _cudnn_version >= 5000: states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple( itertools.chain((hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) if use_bi_direction: rnn = NStepBiLSTM(n_layers, states, train=train) else: rnn = NStepLSTM(n_layers, states, train=train) ret = rnn(*inputs) hy, cy = ret[:2] ys = ret[2:] return hy, cy, ys else: direction = 2 if use_bi_direction else 1 split_size = n_layers * direction hx = split_axis.split_axis(hx, split_size, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, split_size, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] xs_next = xs hy = [] cy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward LSTM # di=1, backward LSTM h_list = [] c_list = [] layer_idx = direction * layer + di h = hx[layer_idx] c = cx[layer_idx] if di == 0: xs_list = xs_next else: xs_list = reversed(xs_next) for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None c_rest = None if layer != 0: x = dropout.dropout(x, ratio=dropout_ratio, train=train) lstm_in = linear.linear(x, xws[layer_idx], xbs[layer_idx]) + \ linear.linear(h, hws[layer_idx], hbs[layer_idx]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_list.append(h_bar) c_list.append(c_bar) return h, c, h_list, c_list h, c, h_forward, c_forward = _one_directional_loop(di=0) hy.append(h) cy.append(c) if use_bi_direction: # BiLSTM h, c, h_backward, c_backward = _one_directional_loop(di=1) hy.append(h) cy.append(c) h_backward.reverse() # concat xs_next = [ concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in zip(h_forward, h_backward) ] else: # Uni-directional RNN xs_next = h_forward ys = xs_next hy = stack.stack(hy) cy = stack.stack(cy) return hy, cy, tuple(ys)
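# A hedged usage sketch of the bidirectional, non-cuDNN path above with
# hypothetical sizes and the v1-era train/use_cudnn arguments: hx/cx stack
# n_layers * 2 states, and ws/bs carry one eight-matrix set per direction.
import numpy as np
import chainer

in_size, units, batch = 3, 4, 2
xs = [chainer.Variable(np.random.randn(batch, in_size).astype('f')) for _ in range(5)]
hx = chainer.Variable(np.zeros((2, batch, units), dtype='f'))
cx = chainer.Variable(np.zeros((2, batch, units), dtype='f'))


def lstm_params():
    w = [chainer.Variable(0.1 * np.random.randn(units, in_size if j < 4 else units).astype('f'))
         for j in range(8)]
    b = [chainer.Variable(np.zeros(units, dtype='f')) for _ in range(8)]
    return w, b


(wf, bf), (wb, bb) = lstm_params(), lstm_params()
hy, cy, ys = n_step_lstm_base(1, 0.0, hx, cx, [wf, wb], [bf, bb], xs,
                              train=False, use_cudnn=False, use_bi_direction=True)
# hy, cy: (2, batch, units); each ys[t]: (batch, 2 * units)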
def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction, **kwargs): """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction) Base function for Stack GRU/BiGRU functions. This function is used at :func:`chainer.functions.n_step_bigru` and :func:`chainer.functions.n_step_gru`. This function's behavior depends on argument ``use_bi_direction``. .. warning:: ``train`` and ``use_cudnn`` arguments are not supported anymore since v2. Instead, use ``chainer.using_config('train', train)`` and ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. See :func:`chainer.using_config`. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is number of layers and is equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is dimension of hidden units. Because of bi-direction, the first dimension length is ``2S``. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents weights for i-th layer. Each ``ws[i]`` is a list containing six matrices. ``ws[i][j]`` is corresponding with ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 3`` is ``(I, N)`` shape as they are multiplied with input variables. All other matrices has ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represnents biases for i-th layer. Each ``bs[i]`` is a list containing six vectors. ``bs[i][j]`` is corresponding with ``b_j`` in the equation. Shape of each matrix is ``(N,)`` where ``N`` is dimension of hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is mini-batch size for time ``t``, and ``I`` is size of input units. Note that this functions supports variable length sequences. When sequneces has different lengths, sort sequences in descending order by length, and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transpose a list of :func:`~chainer.Variable` holding sequence. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. activation (str): Activation function name. Please select ``tanh`` or ``relu``. use_bi_direction (bool): If ``True``, this function uses Bi-direction GRU. .. seealso:: :func:`chainer.functions.n_step_rnn` :func:`chainer.functions.n_step_birnn` """ # NOQA argument.check_unexpected_kwargs( kwargs, train='train argument is not supported anymore. ' 'Use chainer.using_config', use_cudnn='use_cudnn argument is not supported anymore. 
' 'Use chainer.using_config') argument.assert_kwargs_empty(kwargs) xp = cuda.get_array_module(hx, hx.data) if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000): states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple(itertools.chain( (hx, ), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) if use_bi_direction: rnn = NStepBiGRU(n_layers, states) else: rnn = NStepGRU(n_layers, states) ret = rnn(*inputs) hy, = ret[:1] ys = ret[1:] return hy, ys else: direction = 2 if use_bi_direction else 1 hx = split_axis.split_axis(hx, n_layers * direction, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] xws = [concat.concat([w[0], w[1], w[2]], axis=0) for w in ws] hws = [concat.concat([w[3], w[4], w[5]], axis=0) for w in ws] xbs = [concat.concat([b[0], b[1], b[2]], axis=0) for b in bs] hbs = [concat.concat([b[3], b[4], b[5]], axis=0) for b in bs] xs_next = xs hy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward GRU # di=1, backward GRU xs_list = xs_next if di == 0 else reversed(xs_next) layer_idx = direction * layer + di h = hx[layer_idx] h_list = [] for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) else: h_rest = None if layer > 0: x = dropout.dropout(x, ratio=dropout_ratio) gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx]) gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx]) W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1) U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1) r = sigmoid.sigmoid(W_r_x + U_r_h) z = sigmoid.sigmoid(W_z_x + U_z_h) h_bar = tanh.tanh(W_x + r * U_x) h_bar = (1 - z) * h_bar + z * h if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) else: h = h_bar h_list.append(h_bar) return h, h_list # Forward GRU h, h_forward = _one_directional_loop(di=0) hy.append(h) if use_bi_direction: # Backward GRU h, h_backward = _one_directional_loop(di=1) h_backward.reverse() # Concat xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in six.moves.zip(h_forward, h_backward)] hy.append(h) else: # Uni-directional GRU xs_next = h_forward ys = xs_next hy = stack.stack(hy) return hy, tuple(ys)
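# A hedged sketch of the bi-directional case of n_step_gru_base: the first
# dimension of hx is 2 * n_layers and ws/bs carry one six-matrix set per
# direction, so each output ys[t] concatenates forward and backward states.
# Sizes are hypothetical.
import numpy as np
import chainer

in_size, units, batch = 3, 4, 2
xs = [chainer.Variable(np.random.randn(batch, in_size).astype('f')) for _ in range(5)]
hx = chainer.Variable(np.zeros((2, batch, units), dtype='f'))


def gru_params():
    w = [chainer.Variable(0.1 * np.random.randn(units, in_size if j < 3 else units).astype('f'))
         for j in range(6)]
    b = [chainer.Variable(np.zeros(units, dtype='f')) for _ in range(6)]
    return w, b


(wf, bf), (wb, bb) = gru_params(), gru_params()
hy, ys = n_step_gru_base(1, 0.0, hx, [wf, wb], [bf, bb], xs, use_bi_direction=True)
# hy: (2, batch, units); each ys[t]: (batch, 2 * units)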
def n_step_lstm( n_layers, dropout_ratio, hx, cx, ws, bs, xs, train=True, use_cudnn=True): """Stacked Long Short-Term Memory function for sequence inputs. This function calculates stacked LSTM with sequences. This function gets an initial hidden state :math:`h_0`, an initial cell state :math:`c_0`, an input sequence :math:`x`, weight matrices :math:`W`, and bias vectors :math:`b`. This function calculates hidden states :math:`h_t` and :math:`c_t` for each time :math:`t` from input :math:`x_t`. .. math:: i_t &= \\sigma(W_0 x_t + W_4 h_{t-1} + b_0 + b_4) \\\\ f_t &= \\sigma(W_1 x_t + W_5 h_{t-1} + b_1 + b_5) \\\\ o_t &= \\sigma(W_2 x_t + W_6 h_{t-1} + b_2 + b_6) \\\\ a_t &= \\tanh(W_3 x_t + W_7 h_{t-1} + b_3 + b_7) \\\\ c_t &= f_t \\dot c_{t-1} + i_t \\dot a_t \\\\ h_t &= o_t \\dot \\tanh(c_t) As the function accepts a sequence, it calculates :math:`h_t` for all :math:`t` with one call. Eight weight matrices and eight bias vectors are required for each layers. So, when :math:`S` layers exists, you need to prepare :math:`8S` weigth matrices and :math:`8S` bias vectors. If the number of layers ``n_layers`` is greather than :math:`1`, input of ``k``-th layer is hidden state ``h_t`` of ``k-1``-th layer. Note that all input variables except first layer may have different shape from the first layer. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is number of layers and is equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is dimention of hidden units. cx (chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents weights for i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` is corresponding with ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 4`` is ``(I, N)`` shape as they are multiplied with input variables. All other matrices has ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represnents biases for i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` is corresponding with ``b_j`` in the equation. Shape of each matrix is ``(N,)`` where ``N`` is dimention of hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is mini-batch size for time ``t``, and ``I`` is size of input units. Note that this functions supports variable length sequences. When sequneces has different lengths, sort sequences in descending order by length, and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transpose a list of :func:`~chainer.Variable` holding sequence. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. train (bool): If ``True``, this function executes dropout. use_cudnn (bool): If ``True``, this function uses cuDNN if available. Returns: tuple: This functions returns a tuple concaining three elements, ``hy``, ``cy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is same as ``hx``. - ``cy`` is an updated cell states whose shape is same as ``cx``. - ``ys`` is a list of :class:`~chainer.Variable` . Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. 
Its shape is ``(B_t, N)`` where ``B_t`` is mini-batch size for time ``t``, and ``N`` is size of hidden units. Note that ``B_t`` is the same value as ``xs[t]``. .. seealso:: :func:`chainer.functions.lstm` """ xp = cuda.get_array_module(hx, hx.data) if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \ _cudnn_version >= 5000: states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple(itertools.chain( (hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) rnn = NStepLSTM(n_layers, states, train=train) ret = rnn(*inputs) hy, cy = ret[:2] ys = ret[2:] return hy, cy, ys else: hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] ys = [] for x in xs: batch = x.shape[0] h_next = [] c_next = [] for layer in six.moves.range(n_layers): h = hx[layer] c = cx[layer] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None x = dropout.dropout(x, ratio=dropout_ratio, train=train) h = dropout.dropout(h, ratio=dropout_ratio, train=train) lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \ linear.linear(h, hws[layer], hbs[layer]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_next.append(h) c_next.append(c) x = h_bar hx = h_next cx = c_next ys.append(x) hy = stack.stack(hx) cy = stack.stack(cx) return hy, cy, tuple(ys)
def n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation, use_bi_direction, **kwargs): """n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation, use_bi_direction) Base function for Stack RNN/BiRNN functions. This function is used at :func:`chainer.functions.n_step_birnn` and :func:`chainer.functions.n_step_rnn`. This function's behavior depends on following arguments, ``activation`` and ``use_bi_direction``. .. warning:: ``train`` and ``use_cudnn`` arguments are not supported anymore since v2. Instead, use ``chainer.using_config('train', train)`` and ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. See :func:`chainer.using_config`. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is number of layers and is equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is dimention of hidden units. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents weights for i-th layer. Each ``ws[i]`` is a list containing two matrices. ``ws[i][j]`` is corresponding with ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 1`` is ``(I, N)`` shape as they are multiplied with input variables. All other matrices has ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represnents biases for i-th layer. Each ``bs[i]`` is a list containing two vectors. ``bs[i][j]`` is corresponding with ``b_j`` in the equation. Shape of each matrix is ``(N,)`` where ``N`` is dimention of hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is mini-batch size for time ``t``, and ``I`` is size of input units. Note that this functions supports variable length sequences. When sequneces has different lengths, sort sequences in descending order by length, and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transpose a list of :func:`~chainer.Variable` holding sequence. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. activation (str): Activation function name. Please select ``tanh`` or ``relu``. use_bi_direction (bool): If ``True``, this function uses Bi-directional RNN. Returns: tuple: This functions returns a tuple concaining three elements, ``hy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is same as ``hx``. - ``ys`` is a list of :class:`~chainer.Variable` . Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is mini-batch size for time ``t``, and ``N`` is size of hidden units. Note that ``B_t`` is the same value as ``xs[t]``. .. seealso:: :func:`chainer.functions.n_step_rnn` :func:`chainer.functions.n_step_birnn` """ # NOQA argument.check_unexpected_kwargs( kwargs, train='train argument is not supported anymore. ' 'Use chainer.using_config', use_cudnn='use_cudnn argument is not supported anymore. ' 'Use chainer.using_config') argument.assert_kwargs_empty(kwargs) activation_list = ['tanh', 'relu'] if activation not in activation_list: candidate = ','.join(activation_list) raise ValueError('Invalid activation: "%s". 
Please select from [%s]' % (activation, candidate)) xp = cuda.get_array_module(hx) if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000): states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple( itertools.chain((hx, ), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) if use_bi_direction: # Bi-directional RNN if activation == 'tanh': rnn = NStepBiRNNTanh(n_layers, states) elif activation == 'relu': rnn = NStepBiRNNReLU(n_layers, states) else: # Uni-directional RNN if activation == 'tanh': rnn = NStepRNNTanh(n_layers, states) elif activation == 'relu': rnn = NStepRNNReLU(n_layers, states) ret = rnn(*inputs) hy, = ret[:1] ys = ret[1:] return hy, ys else: direction = 2 if use_bi_direction else 1 hx = split_axis.split_axis(hx, n_layers * direction, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] xws = [_stack_weight([w[0]]) for w in ws] hws = [_stack_weight([w[1]]) for w in ws] xbs = [_stack_weight([b[0]]) for b in bs] hbs = [_stack_weight([b[1]]) for b in bs] xs_next = xs hy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward RNN # di=1, backward RNN xs_list = xs_next if di == 0 else reversed(xs_next) layer_idx = direction * layer + di h = hx[layer_idx] h_list = [] for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) else: h_rest = None if layer > 0: x = dropout.dropout(x, ratio=dropout_ratio) rnn_in = ( linear.linear(x, xws[layer_idx], xbs[layer_idx]) + linear.linear(h, hws[layer_idx], hbs[layer_idx])) if activation == 'tanh': h_bar = tanh.tanh(rnn_in) elif activation == 'relu': h_bar = relu.relu(rnn_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) else: h = h_bar h_list.append(h_bar) return h, h_list # Forward RNN h, h_forward = _one_directional_loop(di=0) hy.append(h) if use_bi_direction: # Backward RNN h, h_backward = _one_directional_loop(di=1) h_backward.reverse() # Concat xs_next = [ concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in six.moves.zip(h_forward, h_backward) ] hy.append(h) else: # Uni-directional RNN xs_next = h_forward ys = xs_next hy = stack.stack(hy) return hy, tuple(ys)