def n_step_rnn_impl(
        f, n_layers, dropout_ratio, hx, cx, ws, bs, xs, use_bi_direction):
    direction = 2 if use_bi_direction else 1
    hx = chainer.functions.separate(hx)
    use_cell = cx is not None
    if use_cell:
        cx = chainer.functions.separate(cx)
    else:
        cx = [None] * len(hx)

    xs_next = xs
    hy = []
    cy = []
    for layer in six.moves.range(n_layers):
        # Forward RNN
        if layer == 0:
            xs = xs_next
        else:
            xs = _dropout_sequence(xs_next, dropout_ratio)
        idx = direction * layer
        h, c, h_forward = _one_directional_loop(
            f, xs, hx[idx], cx[idx], ws[idx], bs[idx])
        hy.append(h)
        cy.append(c)

        if use_bi_direction:
            # Backward RNN
            idx = direction * layer + 1
            if layer == 0:
                xs = xs_next
            else:
                xs = _dropout_sequence(xs_next, dropout_ratio)
            h, c, h_backward = _one_directional_loop(
                f, reversed(xs), hx[idx], cx[idx], ws[idx], bs[idx])
            h_backward.reverse()
            # Concat
            xs_next = [concat.concat([hfi, hbi], axis=1)
                       for hfi, hbi in six.moves.zip(h_forward, h_backward)]
            hy.append(h)
            cy.append(c)
        else:
            # Uni-directional RNN
            xs_next = h_forward

    ys = xs_next
    hy = stack.stack(hy)
    if use_cell:
        cy = stack.stack(cy)
    else:
        cy = None
    return hy, cy, tuple(ys)
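# For reference, a plausible sketch of the _dropout_sequence helper used above
# (hypothetical; the real helper is defined elsewhere in this module): apply
# dropout independently to every per-timestep mini-batch.
import chainer.functions as F

def _dropout_sequence_sketch(xs, dropout_ratio):
    return [F.dropout(x, ratio=dropout_ratio) for x in xs]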
def n_step_lstm_base( n_layers, dropout_ratio, hx, cx, ws, bs, xs, train, use_cudnn, use_bi_direction): """Base function for Stack LSTM/BiLSTM functions. This function is used at :func:`chainer.functions.n_step_lstm` and :func:`chainer.functions.n_step_bilstm`. This function's behavior depends on following arguments, ``activation`` and ``use_bi_direction``. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is number of layers and is equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is dimention of hidden units. cx (chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents weights for i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` is corresponding with ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 4`` is ``(I, N)`` shape as they are multiplied with input variables. All other matrices has ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represnents biases for i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` is corresponding with ``b_j`` in the equation. Shape of each matrix is ``(N,)`` where ``N`` is dimention of hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is mini-batch size for time ``t``, and ``I`` is size of input units. Note that this functions supports variable length sequences. When sequneces has different lengths, sort sequences in descending order by length, and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transpose a list of :func:`~chainer.Variable` holding sequence. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. train (bool): If ``True``, this function executes dropout. use_cudnn (bool): If ``True``, this function uses cuDNN if available. use_bi_direction (bool): If ``True``, this function uses Bi-directional LSTM. Returns: tuple: This functions returns a tuple concaining three elements, ``hy``, ``cy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is same as ``hx``. - ``cy`` is an updated cell states whose shape is same as ``cx``. - ``ys`` is a list of :class:`~chainer.Variable` . Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is mini-batch size for time ``t``, and ``N`` is size of hidden units. Note that ``B_t`` is the same value as ``xs[t]``. .. 
seealso:: :func:`chainer.functions.n_step_lstm` :func:`chainer.functions.n_step_bilstm` """ xp = cuda.get_array_module(hx, hx.data) if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \ _cudnn_version >= 5000: states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple(itertools.chain( (hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) if use_bi_direction: rnn = NStepBiLSTM(n_layers, states, train=train) else: rnn = NStepLSTM(n_layers, states, train=train) ret = rnn(*inputs) hy, cy = ret[:2] ys = ret[2:] return hy, cy, ys else: direction = 2 if use_bi_direction else 1 split_size = n_layers * direction hx = split_axis.split_axis(hx, split_size, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, split_size, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] xs_next = xs hy = [] cy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward LSTM # di=1, backward LSTM h_list = [] c_list = [] layer_idx = direction * layer + di h = hx[layer_idx] c = cx[layer_idx] if di == 0: xs_list = xs_next else: xs_list = reversed(xs_next) for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None c_rest = None if layer != 0: x = dropout.dropout(x, ratio=dropout_ratio, train=train) lstm_in = linear.linear(x, xws[layer_idx], xbs[layer_idx]) + \ linear.linear(h, hws[layer_idx], hbs[layer_idx]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_list.append(h_bar) c_list.append(c_bar) return h, c, h_list, c_list h, c, h_forward, c_forward = _one_directional_loop(di=0) hy.append(h) cy.append(c) if use_bi_direction: # BiLSTM h, c, h_backward, c_backward = _one_directional_loop(di=1) hy.append(h) cy.append(c) h_backward.reverse() # concat xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in zip(h_forward, h_backward)] else: # Uni-directional RNN xs_next = h_forward ys = xs_next hy = stack.stack(hy) cy = stack.stack(cy) return hy, cy, tuple(ys)
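# Shape sketch (made-up sizes) for the bidirectional branch above: each output
# step concatenates the forward and backward hidden states along the feature
# axis, so ys[t] has direction * N columns.
import numpy as np
import chainer.functions as F

B_t, N = 3, 8
h_forward_t = np.zeros((B_t, N), np.float32)
h_backward_t = np.zeros((B_t, N), np.float32)
y_t = F.concat([h_forward_t, h_backward_t], axis=1)
assert y_t.shape == (B_t, 2 * N)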
def n_step_lstm_base( n_layers, dropout_ratio, hx, cx, ws, bs, xs, use_bi_direction, **kwargs): """Base function for Stack LSTM/BiLSTM functions. This function is used at :func:`chainer.functions.n_step_lstm` and :func:`chainer.functions.n_step_bilstm`. This function's behavior depends on following arguments, ``activation`` and ``use_bi_direction``. Args: n_layers(int): The number of layers. dropout_ratio(float): Dropout ratio. hx (~chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is the number of layers and is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N`` is the dimension of the hidden units. cx (~chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``. ws (list of list of :class:`~chainer.Variable`): Weight matrices. ``ws[i]`` represents the weights for the i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` corresponds to :math:`W_j` in the equation. Only ``ws[0][j]`` where ``0 <= j < 4`` are ``(I, N)``-shape as they are multiplied with input variables, where ``I`` is the size of the input and ``N`` is the dimension of the hidden units. All other matrices are ``(N, N)``-shaped. bs (list of list of :class:`~chainer.Variable`): Bias vectors. ``bs[i]`` represents the biases for the i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` corresponds to :math:`b_j` in the equation. The shape of each matrix is ``(N,)``. xs (list of :class:`~chainer.Variable`): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the mini-batch size for time ``t``. The sequences must be transposed. :func:`~chainer.functions.transpose_sequence` can be used to transpose a list of :class:`~chainer.Variable`\\ s each representing a sequence. When sequences has different lengths, they must be sorted in descending order of their lengths before transposing. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. use_bi_direction (bool): If ``True``, this function uses Bi-directional LSTM. Returns: tuple: This functions returns a tuple concaining three elements, ``hy``, ``cy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is the same as ``hx``. - ``cy`` is an updated cell states whose shape is the same as ``cx``. - ``ys`` is a list of :class:`~chainer.Variable` . Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is the mini-batch size for time ``t``. Note that ``B_t`` is the same value as ``xs[t]``. .. seealso:: :func:`chainer.functions.n_step_lstm` :func:`chainer.functions.n_step_bilstm` """ argument.check_unexpected_kwargs( kwargs, train='train argument is not supported anymore. ' 'Use chainer.using_config', use_cudnn='use_cudnn argument is not supported anymore. 
' 'Use chainer.using_config') argument.assert_kwargs_empty(kwargs) xp = cuda.get_array_module(hx, hx.data) if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000): states = get_random_state().create_dropout_states(dropout_ratio) lengths = [len(x) for x in xs] xs = chainer.functions.concat(xs, axis=0) # flatten all input variables inputs = tuple(itertools.chain( (hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), (xs,))) if use_bi_direction: rnn = NStepBiLSTM else: rnn = NStepLSTM hy, cy, ys = rnn(n_layers, states, lengths)(*inputs) sections = numpy.cumsum(lengths[:-1]) ys = chainer.functions.split_axis(ys, sections, 0) return hy, cy, ys else: direction = 2 if use_bi_direction else 1 split_size = n_layers * direction hx = split_axis.split_axis(hx, split_size, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, split_size, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] xs_next = xs hy = [] cy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward LSTM # di=1, backward LSTM h_list = [] c_list = [] layer_idx = direction * layer + di h = hx[layer_idx] c = cx[layer_idx] if di == 0: xs_list = xs_next else: xs_list = reversed(xs_next) for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None c_rest = None if layer != 0: x = dropout.dropout(x, ratio=dropout_ratio) lstm_in = linear.linear(x, xws[layer_idx], xbs[layer_idx]) + \ linear.linear(h, hws[layer_idx], hbs[layer_idx]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_list.append(h_bar) c_list.append(c_bar) return h, c, h_list, c_list h, c, h_forward, c_forward = _one_directional_loop(di=0) hy.append(h) cy.append(c) if use_bi_direction: # BiLSTM h, c, h_backward, c_backward = _one_directional_loop(di=1) hy.append(h) cy.append(c) h_backward.reverse() # concat xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in zip(h_forward, h_backward)] else: # Uni-directional RNN xs_next = h_forward ys = xs_next hy = stack.stack(hy) cy = stack.stack(cy) return hy, cy, tuple(ys)
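# A minimal usage sketch of the pure-Python (non-cuDNN) branch above, assuming
# the n_step_lstm_base defined here is in scope. Sizes are made up; dropout and
# cuDNN use are controlled through the global config in this version.
import numpy as np
import chainer
import chainer.functions as F

n_layers, I, N = 1, 4, 6
rng = np.random.RandomState(0)

# Two sequences of lengths 3 and 2, sorted by length and transposed so that
# xs[t] is the mini-batch for time t (xs[t].shape[0] >= xs[t + 1].shape[0]).
seqs = [rng.randn(3, I).astype(np.float32), rng.randn(2, I).astype(np.float32)]
xs = F.transpose_sequence(seqs)

hx = chainer.Variable(np.zeros((n_layers, len(seqs), N), np.float32))
cx = chainer.Variable(np.zeros((n_layers, len(seqs), N), np.float32))
# Eight weights and biases per layer; chainer.functions.linear, used in the
# pure-Python branch, consumes each weight as an (out_size, in_size) matrix.
ws = [[chainer.Variable(0.1 * rng.randn(N, I if j < 4 else N).astype(np.float32))
       for j in range(8)]]
bs = [[chainer.Variable(np.zeros(N, np.float32)) for _ in range(8)]]

with chainer.using_config('train', False):
    hy, cy, ys = n_step_lstm_base(
        n_layers, 0.0, hx, cx, ws, bs, xs, use_bi_direction=False)
print(hy.shape, cy.shape, [y.shape for y in ys])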
def _stack_weight(ws):
    # TODO(unno): Input of the current LSTM implementation is shuffled
    w = stack.stack(ws, axis=1)
    shape = w.shape
    return reshape.reshape(w, (shape[0] * shape[1],) + shape[2:])
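# A small sketch (made-up sizes) of what _stack_weight produces: four per-gate
# matrices of shape (N, I) are interleaved per hidden unit into one (4 * N, I)
# matrix, which is why they are stacked along axis 1 instead of concatenated.
import numpy as np
import chainer.functions as F

N, I = 3, 5
gates = [np.full((N, I), j, np.float32) for j in range(4)]
w = F.stack(gates, axis=1)          # (N, 4, I)
w = F.reshape(w, (N * 4, I))        # rows: unit0 gates 0..3, unit1 gates 0..3, ...
print(w.shape)                      # (12, 5)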
def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction, **kwargs): """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction) Base function for Stack GRU/BiGRU functions. This function is used at :func:`chainer.functions.n_step_bigru` and :func:`chainer.functions.n_step_gru`. This function's behavior depends on argument ``use_bi_direction``. .. warning:: ``train`` and ``use_cudnn`` arguments are not supported anymore since v2. Instead, use ``chainer.using_config('train', train)`` and ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. See :func:`chainer.using_config`. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is number of layers and is equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is dimension of hidden units. Because of bi-direction, the first dimension length is ``2S``. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents weights for i-th layer. Each ``ws[i]`` is a list containing six matrices. ``ws[i][j]`` is corresponding with ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 3`` is ``(I, N)`` shape as they are multiplied with input variables. All other matrices has ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represnents biases for i-th layer. Each ``bs[i]`` is a list containing six vectors. ``bs[i][j]`` is corresponding with ``b_j`` in the equation. Shape of each matrix is ``(N,)`` where ``N`` is dimension of hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is mini-batch size for time ``t``, and ``I`` is size of input units. Note that this functions supports variable length sequences. When sequneces has different lengths, sort sequences in descending order by length, and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transpose a list of :func:`~chainer.Variable` holding sequence. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. activation (str): Activation function name. Please select ``tanh`` or ``relu``. use_bi_direction (bool): If ``True``, this function uses Bi-direction GRU. .. seealso:: :func:`chainer.functions.n_step_rnn` :func:`chainer.functions.n_step_birnn` """ # NOQA argument.check_unexpected_kwargs( kwargs, train='train argument is not supported anymore. ' 'Use chainer.using_config', use_cudnn='use_cudnn argument is not supported anymore. 
' 'Use chainer.using_config') argument.assert_kwargs_empty(kwargs) xp = cuda.get_array_module(hx, hx.data) if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000): states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple(itertools.chain( (hx, ), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) if use_bi_direction: rnn = NStepBiGRU(n_layers, states) else: rnn = NStepGRU(n_layers, states) ret = rnn(*inputs) hy, = ret[:1] ys = ret[1:] return hy, ys else: direction = 2 if use_bi_direction else 1 hx = split_axis.split_axis(hx, n_layers * direction, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] xws = [concat.concat([w[0], w[1], w[2]], axis=0) for w in ws] hws = [concat.concat([w[3], w[4], w[5]], axis=0) for w in ws] xbs = [concat.concat([b[0], b[1], b[2]], axis=0) for b in bs] hbs = [concat.concat([b[3], b[4], b[5]], axis=0) for b in bs] xs_next = xs hy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward GRU # di=1, backward GRU xs_list = xs_next if di == 0 else reversed(xs_next) layer_idx = direction * layer + di h = hx[layer_idx] h_list = [] for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) else: h_rest = None if layer > 0: x = dropout.dropout(x, ratio=dropout_ratio) gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx]) gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx]) W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1) U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1) r = sigmoid.sigmoid(W_r_x + U_r_h) z = sigmoid.sigmoid(W_z_x + U_z_h) h_bar = tanh.tanh(W_x + r * U_x) h_bar = (1 - z) * h_bar + z * h if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) else: h = h_bar h_list.append(h_bar) return h, h_list # Forward GRU h, h_forward = _one_directional_loop(di=0) hy.append(h) if use_bi_direction: # Backward GRU h, h_backward = _one_directional_loop(di=1) h_backward.reverse() # Concat xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in six.moves.zip(h_forward, h_backward)] hy.append(h) else: # Uni-directional GRU xs_next = h_forward ys = xs_next hy = stack.stack(hy) return hy, tuple(ys)
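# A standalone sketch (made-up sizes) of the single GRU step computed in the
# pure-Python branch above: gates r and z, candidate h_bar, and the convex
# combination with the previous hidden state.
import numpy as np
import chainer.functions as F

B, I, N = 2, 4, 3
rng = np.random.RandomState(0)
x = rng.randn(B, I).astype(np.float32)
h = rng.randn(B, N).astype(np.float32)
xw = rng.randn(3 * N, I).astype(np.float32)   # stacked W_0..W_2
hw = rng.randn(3 * N, N).astype(np.float32)   # stacked W_3..W_5
xb = np.zeros(3 * N, np.float32)
hb = np.zeros(3 * N, np.float32)

gru_x = F.linear(x, xw, xb)
gru_h = F.linear(h, hw, hb)
W_r_x, W_z_x, W_x = F.split_axis(gru_x, 3, axis=1)
U_r_h, U_z_h, U_x = F.split_axis(gru_h, 3, axis=1)
r = F.sigmoid(W_r_x + U_r_h)
z = F.sigmoid(W_z_x + U_z_h)
h_bar = F.tanh(W_x + r * U_x)
h_new = (1 - z) * h_bar + z * h               # shape (B, N)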
def n_step_lstm(n_layers, dropout_ratio, hx, cx, ws, bs, xs, train=True, use_cudnn=True): """Stacked Long Short-Term Memory function for sequence inputs. This function calculates stacked LSTM with sequences. This function gets an initial hidden state :math:`h_0`, an initial cell state :math:`c_0`, an input sequence :math:`x`, weight matrices :math:`W`, and bias vectors :math:`b`. This function calculates hidden states :math:`h_t` and :math:`c_t` for each time :math:`t` from input :math:`x_t`. .. math:: i_t = \sigma(W_0 x_t + W_4 h_{t-1} + b_0 + b_4) f_t = \sigma(W_1 x_t + W_5 h_{t-1} + b_1 + b_5) o_t = \sigma(W_2 x_t + W_6 h_{t-1} + b_2 + b_6) a_t = \tanh(W_3 x_t + W_7 h_{t-1} + b_3 + b_7) c_t = f_t \dot c_{t-1} + i_t \dot a_t h_t = o_t \dot \tanh(c_t) As the function accepts a sequence, it calculates :math:`h_t` for all :math:`t` with one call. Eight weight matrices and eight bias vectors are required for each layers. So, when :math:`S` layers exists, you need to prepare :math:`8S` weigth matrices and :math:`8S` bias vectors. If the number of layers ``n_layers`` is greather than :math:`1`, input of ``k``-th layer is hidden state ``h_t`` of ``k-1``-th layer. Note that all input variables except first layer may have different shape from the first layer. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is number of layers and is equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is dimention of hidden units. cx (chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents weights for i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` is corresponding with ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 4`` is ``(I, N)`` shape as they are multiplied with input variables. All other matrices has ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represnents biases for i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` is corresponding with ``b_j`` in the equation. Shape of each matrix is ``(N,)`` where ``N`` is dimention of hidden units. xs (list of chainer.Variable): A list of :class:`chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is mini-batch size for time ``t``, and ``I`` is size of input units. Note that this functions supports variable length sequences. When sequneces has different lengths, sort sequences in descending order by length, and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transpose a list of :func:`~chainer.Variable` holding sequence. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. train (bool): If ``True``, this function executes dropout. use_cudnn (bool): If ``True``, this function uses cuDNN if available. Returns: tuple: This functions returns a tuple concaining three elements, ``hy``, ``cy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is same as ``hx``. - ``cy`` is an updated cell states whose shape is same as ``cx``. - ``ys`` is a list of :class:~chainer.Variable. Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is mini-batch size for time ``t``, and ``N`` is size of hidden units. 
Note that ``B_t`` is the same value as ``xs[t]``. .. seealso:: :func:`chainer.functions.lstm` """ xp = cuda.get_array_module(hx, hx.data) if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \ _cudnn_version >= 5000: states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple( itertools.chain((hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) rnn = NStepLSTM(n_layers, states, train=train) ret = rnn(*inputs) hy, cy = ret[:2] ys = ret[2:] return hy, cy, ys else: hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] ys = [] for x in xs: batch = x.shape[0] h_next = [] c_next = [] for layer in six.moves.range(n_layers): h = hx[layer] c = cx[layer] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None x = dropout.dropout(x, ratio=dropout_ratio, train=train) h = dropout.dropout(h, ratio=dropout_ratio, train=train) lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \ linear.linear(h, hws[layer], hbs[layer]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_next.append(h) c_next.append(c) x = h_bar hx = h_next cx = c_next ys.append(x) hy = stack.stack(hx) cy = stack.stack(cx) return hy, cy, tuple(ys)
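# A minimal sketch of preparing variable-length inputs for xs as described in
# the docstring above: sort the sequences by length in descending order, then
# transpose them so that xs[t] is the mini-batch for time t.
import numpy as np
import chainer.functions as F

seqs = [np.random.rand(3, 10).astype(np.float32),   # length 3
        np.random.rand(5, 10).astype(np.float32)]   # length 5
seqs.sort(key=len, reverse=True)                    # longest first
xs = F.transpose_sequence(seqs)
assert all(xs[t].shape[0] >= xs[t + 1].shape[0] for t in range(len(xs) - 1))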
def n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation, use_bi_direction, **kwargs): """n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation, use_bi_direction) Base function for Stack RNN/BiRNN functions. This function is used at :func:`chainer.functions.n_step_birnn` and :func:`chainer.functions.n_step_rnn`. This function's behavior depends on following arguments, ``activation`` and ``use_bi_direction``. .. warning:: ``train`` and ``use_cudnn`` arguments are not supported anymore since v2. Instead, use ``chainer.using_config('train', train)`` and ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. See :func:`chainer.using_config`. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is number of layers and is equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is dimention of hidden units. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents weights for i-th layer. Each ``ws[i]`` is a list containing two matrices. ``ws[i][j]`` is corresponding with ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 1`` is ``(I, N)`` shape as they are multiplied with input variables. All other matrices has ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represnents biases for i-th layer. Each ``bs[i]`` is a list containing two vectors. ``bs[i][j]`` is corresponding with ``b_j`` in the equation. Shape of each matrix is ``(N,)`` where ``N`` is dimention of hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is mini-batch size for time ``t``, and ``I`` is size of input units. Note that this functions supports variable length sequences. When sequneces has different lengths, sort sequences in descending order by length, and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transpose a list of :func:`~chainer.Variable` holding sequence. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. activation (str): Activation function name. Please select ``tanh`` or ``relu``. use_bi_direction (bool): If ``True``, this function uses Bi-directional RNN. Returns: tuple: This functions returns a tuple concaining three elements, ``hy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is same as ``hx``. - ``ys`` is a list of :class:`~chainer.Variable` . Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is mini-batch size for time ``t``, and ``N`` is size of hidden units. Note that ``B_t`` is the same value as ``xs[t]``. .. seealso:: :func:`chainer.functions.n_step_rnn` :func:`chainer.functions.n_step_birnn` """ # NOQA argument.check_unexpected_kwargs( kwargs, train='train argument is not supported anymore. ' 'Use chainer.using_config', use_cudnn='use_cudnn argument is not supported anymore. ' 'Use chainer.using_config') argument.assert_kwargs_empty(kwargs) activation_list = ['tanh', 'relu'] if activation not in activation_list: candidate = ','.join(activation_list) raise ValueError('Invalid activation: "%s". 
Please select from [%s]' % (activation, candidate)) xp = cuda.get_array_module(hx) if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000): states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple( itertools.chain((hx, ), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) if use_bi_direction: # Bi-directional RNN if activation == 'tanh': rnn = NStepBiRNNTanh(n_layers, states) elif activation == 'relu': rnn = NStepBiRNNReLU(n_layers, states) else: # Uni-directional RNN if activation == 'tanh': rnn = NStepRNNTanh(n_layers, states) elif activation == 'relu': rnn = NStepRNNReLU(n_layers, states) ret = rnn(*inputs) hy, = ret[:1] ys = ret[1:] return hy, ys else: direction = 2 if use_bi_direction else 1 hx = split_axis.split_axis(hx, n_layers * direction, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] xws = [_stack_weight([w[0]]) for w in ws] hws = [_stack_weight([w[1]]) for w in ws] xbs = [_stack_weight([b[0]]) for b in bs] hbs = [_stack_weight([b[1]]) for b in bs] xs_next = xs hy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward RNN # di=1, backward RNN xs_list = xs_next if di == 0 else reversed(xs_next) layer_idx = direction * layer + di h = hx[layer_idx] h_list = [] for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) else: h_rest = None if layer > 0: x = dropout.dropout(x, ratio=dropout_ratio) rnn_in = ( linear.linear(x, xws[layer_idx], xbs[layer_idx]) + linear.linear(h, hws[layer_idx], hbs[layer_idx])) if activation == 'tanh': h_bar = tanh.tanh(rnn_in) elif activation == 'relu': h_bar = relu.relu(rnn_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) else: h = h_bar h_list.append(h_bar) return h, h_list # Forward RNN h, h_forward = _one_directional_loop(di=0) hy.append(h) if use_bi_direction: # Backward RNN h, h_backward = _one_directional_loop(di=1) h_backward.reverse() # Concat xs_next = [ concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in six.moves.zip(h_forward, h_backward) ] hy.append(h) else: # Uni-directional RNN xs_next = h_forward ys = xs_next hy = stack.stack(hy) return hy, tuple(ys)
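# A standalone sketch (made-up sizes) of the single RNN step computed in the
# pure-Python branch above: a linear transform of the input plus a linear
# transform of the previous hidden state, followed by tanh or relu.
import numpy as np
import chainer.functions as F

B, I, N = 2, 4, 3
rng = np.random.RandomState(0)
x = rng.randn(B, I).astype(np.float32)
h = rng.randn(B, N).astype(np.float32)
xw, xb = rng.randn(N, I).astype(np.float32), np.zeros(N, np.float32)
hw, hb = rng.randn(N, N).astype(np.float32), np.zeros(N, np.float32)

rnn_in = F.linear(x, xw, xb) + F.linear(h, hw, hb)
h_bar_tanh = F.tanh(rnn_in)     # activation == 'tanh'
h_bar_relu = F.relu(rnn_in)     # activation == 'relu'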
def fixed_length_n_step_lstm( n_layers, dropout_ratio, hx, cx, ws, bs, xs, train=True, ): xp = cuda.get_array_module(hx, hx.data) if xp is not numpy and cuda.cudnn_enabled and _cudnn_version >= 5000: states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple( itertools.chain((hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), (xs, ))) rnn = FixedLengthNStepLSTMFunction(n_layers, states, train=train) ret = rnn(*inputs) hy, cy, ys = ret _, batch_size, dim = hy.shape ys_reshape = F.reshape(ys, (-1, batch_size, dim)) # (length, batch, dim) return hy, cy, ys_reshape else: hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] ys = [] for x in xs: batch = x.shape[0] h_next = [] c_next = [] for layer in six.moves.range(n_layers): h = hx[layer] c = cx[layer] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None x = dropout.dropout(x, ratio=dropout_ratio) h = dropout.dropout(h, ratio=dropout_ratio) lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \ linear.linear(h, hws[layer], hbs[layer]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_next.append(h) c_next.append(c) x = h_bar hx = h_next cx = c_next ys.append(x) hy = stack.stack(hx) cy = stack.stack(cx) #return hy, cy, tuple(ys) ys_concat = F.concat(ys, axis=0) ys_reshape = F.reshape( ys_concat, (-1, ys[0].shape[0], ys[0].shape[1])) # (length, batch, dim) return hy, cy, ys_reshape
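# Sketch of the output layout (assumed sizes): unlike n_step_lstm, which
# returns ys as a tuple of per-step (B, N) matrices, this fixed-length variant
# returns one (length, batch, dim) array, equivalent to stacking those steps.
import numpy as np

length, batch, dim = 3, 2, 5
ys_steps = [np.zeros((batch, dim), np.float32) for _ in range(length)]
ys_reshape = np.concatenate(ys_steps, axis=0).reshape(length, batch, dim)
assert ys_reshape.shape == (length, batch, dim)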
def _batch_triangular_inv(x, lower=True):
    n = len(x)
    y = []
    for i in range(n):
        y.append(_triangular_inv(x[i]))
    return stack.stack(y)
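# A reference sketch (NumPy only, made-up data) of what the batched helper
# above presumably computes: the inverse of each triangular matrix in a batch,
# stacked back into a single array.
import numpy as np

x = np.tril(np.random.rand(4, 3, 3) + np.eye(3))    # batch of lower-triangular
y = np.stack([np.linalg.inv(xi) for xi in x])
assert np.allclose(y @ x, np.eye(3), atol=1e-5)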
def backward(self, indexes, grad_outputs):
    grad_outputs = [
        self._xp.zeros(self._shape, dtype=self._dtype)
        if g is None else g
        for g in grad_outputs]
    return stack.stack(grad_outputs, self.axis),
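# Sketch (made-up shapes) of the gradient padding done above: outputs whose
# gradient is None are treated as zeros before restacking along the axis the
# forward pass separated on (axis 0 is used here for illustration).
import numpy as np

shape, dtype = (2, 3), np.float32
grads = [np.ones(shape, dtype), None, np.ones(shape, dtype)]
filled = [np.zeros(shape, dtype) if g is None else g for g in grads]
gx = np.stack(filled, axis=0)       # gradient w.r.t. the stacked input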
def n_step_lstm( n_layers, dropout_ratio, hx, cx, ws, bs, xs, train=True, use_cudnn=True): """Stacked Long Short-Term Memory function for sequence inputs. This function calculates stacked LSTM with sequences. This function gets an initial hidden state :math:`h_0`, an initial cell state :math:`c_0`, an input sequence :math:`x`, weight matrices :math:`W`, and bias vectors :math:`b`. This function calculates hidden states :math:`h_t` and :math:`c_t` for each time :math:`t` from input :math:`x_t`. .. math:: i_t &= \\sigma(W_0 x_t + W_4 h_{t-1} + b_0 + b_4) \\\\ f_t &= \\sigma(W_1 x_t + W_5 h_{t-1} + b_1 + b_5) \\\\ o_t &= \\sigma(W_2 x_t + W_6 h_{t-1} + b_2 + b_6) \\\\ a_t &= \\tanh(W_3 x_t + W_7 h_{t-1} + b_3 + b_7) \\\\ c_t &= f_t \\dot c_{t-1} + i_t \\dot a_t \\\\ h_t &= o_t \\dot \\tanh(c_t) As the function accepts a sequence, it calculates :math:`h_t` for all :math:`t` with one call. Eight weight matrices and eight bias vectors are required for each layers. So, when :math:`S` layers exists, you need to prepare :math:`8S` weigth matrices and :math:`8S` bias vectors. If the number of layers ``n_layers`` is greather than :math:`1`, input of ``k``-th layer is hidden state ``h_t`` of ``k-1``-th layer. Note that all input variables except first layer may have different shape from the first layer. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is number of layers and is equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is dimention of hidden units. cx (chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents weights for i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` is corresponding with ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 4`` is ``(I, N)`` shape as they are multiplied with input variables. All other matrices has ``(N, N)`` shape. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represnents biases for i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` is corresponding with ``b_j`` in the equation. Shape of each matrix is ``(N,)`` where ``N`` is dimention of hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is mini-batch size for time ``t``, and ``I`` is size of input units. Note that this functions supports variable length sequences. When sequneces has different lengths, sort sequences in descending order by length, and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transpose a list of :func:`~chainer.Variable` holding sequence. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. train (bool): If ``True``, this function executes dropout. use_cudnn (bool): If ``True``, this function uses cuDNN if available. Returns: tuple: This functions returns a tuple concaining three elements, ``hy``, ``cy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is same as ``hx``. - ``cy`` is an updated cell states whose shape is same as ``cx``. - ``ys`` is a list of :class:`~chainer.Variable` . Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. 
Its shape is ``(B_t, N)`` where ``B_t`` is mini-batch size for time ``t``, and ``N`` is size of hidden units. Note that ``B_t`` is the same value as ``xs[t]``. .. seealso:: :func:`chainer.functions.lstm` """ xp = cuda.get_array_module(hx, hx.data) if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \ _cudnn_version >= 5000: states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple(itertools.chain( (hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) rnn = NStepLSTM(n_layers, states, train=train) ret = rnn(*inputs) hy, cy = ret[:2] ys = ret[2:] return hy, cy, ys else: hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] ys = [] for x in xs: batch = x.shape[0] h_next = [] c_next = [] for layer in six.moves.range(n_layers): h = hx[layer] c = cx[layer] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None x = dropout.dropout(x, ratio=dropout_ratio, train=train) h = dropout.dropout(h, ratio=dropout_ratio, train=train) lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \ linear.linear(h, hws[layer], hbs[layer]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_next.append(h) c_next.append(c) x = h_bar hx = h_next cx = c_next ys.append(x) hy = stack.stack(hx) cy = stack.stack(cx) return hy, cy, tuple(ys)
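# A one-step sketch (made-up sizes) of the cell update in the equations above,
# using chainer.functions.lstm on a random pre-activation that stands in for
# the lstm_in computed from x and h in the pure-Python branch.
import numpy as np
import chainer.functions as F

B, N = 2, 3
rng = np.random.RandomState(0)
c_prev = rng.randn(B, N).astype(np.float32)
lstm_in = rng.randn(B, 4 * N).astype(np.float32)
c_new, h_new = F.lstm(c_prev, lstm_in)
print(c_new.shape, h_new.shape)     # (2, 3) (2, 3)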