def functions(self):
    return collections.OrderedDict([
        ('conv1_1', [self.conv1_1, relu]),
        ('conv1_2', [self.conv1_2, relu]),
        ('pool1', [_max_pooling_2d]),
        ('conv2_1', [self.conv2_1, relu]),
        ('conv2_2', [self.conv2_2, relu]),
        ('pool2', [_max_pooling_2d]),
        ('conv3_1', [self.conv3_1, relu]),
        ('conv3_2', [self.conv3_2, relu]),
        ('conv3_3', [self.conv3_3, relu]),
        ('conv3_4', [self.conv3_4, relu]),
        ('pool3', [_max_pooling_2d]),
        ('conv4_1', [self.conv4_1, relu]),
        ('conv4_2', [self.conv4_2, relu]),
        ('conv4_3', [self.conv4_3, relu]),
        ('conv4_4', [self.conv4_4, relu]),
        ('pool4', [_max_pooling_2d]),
        ('conv5_1', [self.conv5_1, relu]),
        ('conv5_2', [self.conv5_2, relu]),
        ('conv5_3', [self.conv5_3, relu]),
        ('conv5_4', [self.conv5_4, relu]),
        ('pool5', [_max_pooling_2d]),
        # ('fc6', [self.fc6, relu, dropout]),
        ('fc6', [self.fc6, relu, lambda x: dropout(x, ratio=0.0)]),
        # ('fc7', [self.fc7, relu, dropout]),
        ('fc7', [self.fc7, relu, lambda x: dropout(x, ratio=0.0)]),
        ('fc8', [self.fc8]),
        ('prob', [softmax]),
    ])
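# The ordered ('name', [callables]) tables above and in the GoogLeNet __init__
# further below are normally consumed by folding an input through each entry in
# order. A minimal sketch of that pattern, assuming the table is reachable as
# model.functions (attribute or property); extract_until is an illustrative
# helper, not part of the original code.
def extract_until(model, x, target_layer):
    # Thread x through the ordered (name -> [callables]) table and stop once
    # the requested layer has been applied.
    h = x
    for name, funcs in model.functions.items():
        for f in funcs:
            h = f(h)
        if name == target_layer:
            return h
    raise ValueError('unknown layer: {}'.format(target_layer))
# e.g. pool5_feature = extract_until(vgg, x, 'pool5')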
def __call__(self, x):
    h = x
    h = relu(self.conv1_1(h))
    h = relu(self.conv1_2(h))
    h = _max_pooling_2d(h)
    h = relu(self.conv2_1(h))
    h = relu(self.conv2_2(h))
    h = _max_pooling_2d(h)
    h = relu(self.conv3_1(h))
    h = relu(self.conv3_2(h))
    h = relu(self.conv3_3(h))
    h = _max_pooling_2d(h)
    h = relu(self.conv4_1(h))
    h = relu(self.conv4_2(h))
    h = relu(self.conv4_3(h))
    h = _max_pooling_2d(h)
    h = relu(self.conv5_1(h))
    h = relu(self.conv5_2(h))
    h = relu(self.bn1(self.conv5_3(h)))
    h = _max_pooling_2d(h)
    h = dropout(relu(self.bn2(self.fc6(h))))
    h = dropout(relu(self.bn3(self.fc7(h))))
    h = self.fc8(h)
    return h
def feed_lstm(self, word, embed_layer, lstm_layer_list, train):
    # get embedding for word
    embed_id = N.dropout(embed_layer(word), ratio=DROPOUT_RATIO, train=train)
    # feed into first LSTM layer
    hs = N.dropout(self[lstm_layer_list[0]](embed_id), ratio=DROPOUT_RATIO,
                   train=train)
    # feed into remaining LSTM layers
    for lstm_layer in lstm_layer_list[1:]:
        hs = N.dropout(self[lstm_layer](hs), ratio=DROPOUT_RATIO, train=train)
def __call__(self, x, **kwargs):
    """Applies the lstm layer.

    Args:
        x (~chainer.Variable): Time-Batch of input vectors.

    Returns:
        ~chainer.Variable: Output of the lstm layer.

    """
    dropout_rate = kwargs.get('dropout', 0.)
    dropout_rate_hidden_hidden = kwargs.get('dropout_hidden_hidden', 0.)
    x = dropout(x, dropout_rate)
    lstm_in = sequence_linear_function(x, self.W_x, self.b)
    if self.normalized:
        lstm_in = sequence_batch_normalization_function(
            lstm_in, self.gamma, self.beta)
    if self.stateful:
        c_prev = self.c_prev
        h_prev = self.h_prev
    else:
        c_prev = None
        h_prev = None
    lstm_out, self.h_prev, self.c_prev = \
        sequence_lstm_function(lstm_in, self.W_h, c_prev, h_prev,
                               self.reverse, dropout_rate_hidden_hidden)
    return lstm_out
def _one_directional_loop(di):
    # di=0, forward RNN
    # di=1, backward RNN
    xs_list = xs_next if di == 0 else reversed(xs_next)
    layer_idx = direction * layer + di
    h = hx[layer_idx]
    h_list = []
    for x in xs_list:
        batch = x.shape[0]
        if h.shape[0] > batch:
            h, h_rest = split_axis.split_axis(h, [batch], axis=0)
        else:
            h_rest = None
        if layer > 0:
            x = dropout.dropout(x, ratio=dropout_ratio)
        rnn_in = (
            linear.linear(x, xws[layer_idx], xbs[layer_idx]) +
            linear.linear(h, hws[layer_idx], hbs[layer_idx]))
        if activation == 'tanh':
            h_bar = tanh.tanh(rnn_in)
        elif activation == 'relu':
            h_bar = relu.relu(rnn_in)
        if h_rest is not None:
            h = concat.concat([h_bar, h_rest], axis=0)
        else:
            h = h_bar
        h_list.append(h_bar)
    return h, h_list
def _one_directional_loop(di):
    # di=0, forward GRU
    # di=1, backward GRU
    xs_list = xs_next if di == 0 else reversed(xs_next)
    layer_idx = direction * layer + di
    h = hx[layer_idx]
    h_list = []
    for x in xs_list:
        batch = x.shape[0]
        if h.shape[0] > batch:
            h, h_rest = split_axis.split_axis(h, [batch], axis=0)
        else:
            h_rest = None
        if layer > 0:
            x = dropout.dropout(x, ratio=dropout_ratio)
        gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx])
        gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx])
        W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1)
        U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1)
        r = sigmoid.sigmoid(W_r_x + U_r_h)
        z = sigmoid.sigmoid(W_z_x + U_z_h)
        h_bar = tanh.tanh(W_x + r * U_x)
        h_bar = (1 - z) * h_bar + z * h
        if h_rest is not None:
            h = concat.concat([h_bar, h_rest], axis=0)
        else:
            h = h_bar
        h_list.append(h_bar)
    return h, h_list
def __call__(self, x, **kwargs):
    """Applies the lstm layer.

    Args:
        x (~chainer.Variable): Time-Batch of input vectors.

    Returns:
        ~chainer.Variable: Output of the lstm layer.

    """
    dropout_rate = kwargs.get('dropout', 0.)
    dropout_rate_hidden_hidden = kwargs.get('dropout_hidden_hidden', 0.)
    x = dropout(x, dropout_rate)
    lstm_in = sequence_linear_function(x, self.W_x, self.b)
    if self.normalized:
        lstm_in = sequence_batch_normalization_function(lstm_in, self.gamma,
                                                        self.beta)
    if self.stateful:
        c_prev = self.c_prev
        h_prev = self.h_prev
    else:
        c_prev = None
        h_prev = None
    lstm_out, self.h_prev, self.c_prev = \
        sequence_lstm_function(lstm_in, self.W_h, c_prev, h_prev,
                               self.reverse, dropout_rate_hidden_hidden)
    return lstm_out
def _one_directional_loop(di):
    # di=0, forward RNN
    # di=1, backward RNN
    xs_list = xs_next if di == 0 else reversed(xs_next)
    layer_idx = direction * layer + di
    h = hx[layer_idx]
    h_list = []
    for x in xs_list:
        batch = x.shape[0]
        if h.shape[0] > batch:
            h, h_rest = split_axis.split_axis(h, [batch], axis=0)
        else:
            h_rest = None
        if layer > 0:
            x = dropout.dropout(x, ratio=dropout_ratio)
        rnn_in = (linear.linear(x, xws[layer_idx], xbs[layer_idx]) +
                  linear.linear(h, hws[layer_idx], hbs[layer_idx]))
        if activation == 'tanh':
            h_bar = tanh.tanh(rnn_in)
        elif activation == 'relu':
            h_bar = relu.relu(rnn_in)
        if h_rest is not None:
            h = concat.concat([h_bar, h_rest], axis=0)
        else:
            h = h_bar
        h_list.append(h_bar)
    return h, h_list
def __call__(self, frame, prev_word, state, dropout_flag, dropout_ratio):
    i1 = self.xi1(dropout(frame, dropout_ratio, dropout_flag))
    c1, h1 = lstm(state['c1'], self.ih1(i1) + self.hh1(state['h1']))
    i2 = self.xi2(prev_word)
    concat = array.concat.concat((i2, h1))
    c2, h2 = lstm(state['c2'], self.ih2(concat) + self.hh2(state['h2']))
    state = {'c1': c1, 'h1': h1, 'c2': c2, 'h2': h2}
    return state
def __call__(self, x):
    x = self.embed(x)
    xs = split_axis.split_axis(x, x.data.shape[1], 1)
    ret = []
    for x in xs:
        for l in self.rnns:
            x = l(x)
            x = dropout.dropout(x, 0.25, self.train)
        for l in self.linears:
            x = l(x)
        x = reshape.reshape(x, x.data.shape + (-1, ))
        ret.append(x)
    ret = concat.concat(ret, axis=2)
    return ret
def ASPP(x):
    y = [
        F.tile(
            self.ASPP[0](F.average_pooling_2d(x, ksize=x.shape[-2:])),
            x.shape[-2:])
    ]
    y.extend([self.ASPP[i](x) for i in range(1, len(self.ASPP) - 1)])
    y = F.concat(y, axis=1)
    y = dropout.dropout(y, ratio=0.5)
    y = self.ASPP[-1](y)
    return y
def __init__(self, pretrained_model='auto'):
    super(GoogLeNet, self).__init__(
        conv1=Convolution2D(3, 64, 7, stride=2, pad=3),
        conv2_reduce=Convolution2D(64, 64, 1),
        conv2=Convolution2D(64, 192, 3, stride=1, pad=1),
        inc3a=Inception(192, 64, 96, 128, 16, 32, 32),
        inc3b=Inception(256, 128, 128, 192, 32, 96, 64),
        inc4a=Inception(480, 192, 96, 208, 16, 48, 64),
        inc4b=Inception(512, 160, 112, 224, 24, 64, 64),
        inc4c=Inception(512, 128, 128, 256, 24, 64, 64),
        inc4d=Inception(512, 112, 144, 288, 32, 64, 64),
        inc4e=Inception(528, 256, 160, 320, 32, 128, 128),
        inc5a=Inception(832, 256, 160, 320, 32, 128, 128),
        inc5b=Inception(832, 384, 192, 384, 48, 128, 128),
        loss3_fc=Linear(1024, 1000),
        loss1_conv=Convolution2D(512, 128, 1),
        loss1_fc1=Linear(4 * 4 * 128, 1024),
        loss1_fc2=Linear(1024, 1000),
        loss2_conv=Convolution2D(528, 128, 1),
        loss2_fc1=Linear(4 * 4 * 128, 1024),
        loss2_fc2=Linear(1024, 1000),
    )
    if pretrained_model == 'auto':
        _retrieve(
            'bvlc_googlenet.npz',
            'http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel',
            self)
    elif pretrained_model:
        npz.load_npz(pretrained_model, self)
    self.functions = OrderedDict([
        ('conv1', [self.conv1, relu]),
        ('pool1', [lambda x: max_pooling_2d(x, ksize=3, stride=2),
                   lambda x: local_response_normalization(x, n=5)]),
        ('conv2_reduce', [self.conv2_reduce, relu]),
        ('conv2', [self.conv2, relu]),
        ('pool2', [lambda x: local_response_normalization(x, n=5),
                   lambda x: max_pooling_2d(x, ksize=3, stride=2)]),
        ('inc3a', [self.inc3a]),
        ('inc3b', [self.inc3b]),
        ('pool3', [lambda x: max_pooling_2d(x, ksize=3, stride=2)]),
        ('inc4a', [self.inc4a]),
        ('inc4b', [self.inc4b]),
        ('inc4c', [self.inc4c]),
        ('inc4d', [self.inc4d]),
        ('inc4e', [self.inc4e]),
        ('pool4', [lambda x: max_pooling_2d(x, ksize=3, stride=2)]),
        ('inc5a', [self.inc5a]),
        ('inc5b', [self.inc5b]),
        ('pool6', [lambda x: average_pooling_2d(x, ksize=7, stride=1)]),
        ('prob', [lambda x: dropout(x, ratio=0.4), self.loss3_fc]),
    ])
def __call__(self, x, **kwargs):
    """Applies the linear layer.

    Args:
        x (~chainer.Variable): Time-Batch of input vectors.

    Returns:
        ~chainer.Variable: Output of the linear layer.

    """
    dropout_rate = kwargs.get('dropout', 0.)
    x = dropout(x, dropout_rate)
    x = sequence_linear_function(x, self.W, self.b)
    if self.normalized:
        x = sequence_batch_normalization_function(x, self.gamma, self.beta)
    return x
def _one_directional_loop(di):
    # di=0, forward LSTM
    # di=1, backward LSTM
    h_list = []
    c_list = []
    layer_idx = direction * layer + di
    h = hx[layer_idx]
    c = cx[layer_idx]
    if di == 0:
        xs_list = xs_next
    else:
        xs_list = reversed(xs_next)
    counter = 0
    for x in xs_list:
        counter += 1
        batch = x.shape[0]
        if h.shape[0] > batch:
            h, h_rest = split_axis.split_axis(h, [batch], axis=0)
            c, c_rest = split_axis.split_axis(c, [batch], axis=0)
        else:
            h_rest = None
            c_rest = None
        if layer != 0:
            x = dropout.dropout(x, ratio=dropout_ratio)
        if counter == 4:
            lstm_in = linear.linear(x, xws[layer_idx], xbs[layer_idx])
        else:
            lstm_in = linear.linear(x, xws[layer_idx], xbs[layer_idx]) + \
                linear.linear(h, hws[layer_idx], hbs[layer_idx])
        c_bar, h_bar = lstm.lstm(c, lstm_in)
        if h_rest is not None:
            h = concat.concat([h_bar, h_rest], axis=0)
            c = concat.concat([c_bar, c_rest], axis=0)
        else:
            h = h_bar
            c = c_bar
        h_list.append(h_bar)
        c_list.append(c_bar)
    return h, c, h_list, c_list
def _one_directional_loop(di):
    # di=0, forward LSTM
    # di=1, backward LSTM
    h_list = []
    c_list = []
    layer_idx = direction * layer + di
    h = hx[layer_idx]
    c = cx[layer_idx]
    if di == 0:
        xs_list = xs_next
    else:
        xs_list = reversed(xs_next)
    for x in xs_list:
        batch = x.shape[0]
        if h.shape[0] > batch:
            h, h_rest = split_axis.split_axis(h, [batch], axis=0)
            c, c_rest = split_axis.split_axis(c, [batch], axis=0)
        else:
            h_rest = None
            c_rest = None
        if layer != 0:
            x = dropout.dropout(x, ratio=dropout_ratio, train=train)
        lstm_in = linear.linear(x, xws[layer_idx], xbs[layer_idx]) + \
            linear.linear(h, hws[layer_idx], hbs[layer_idx])
        c_bar, h_bar = lstm.lstm(c, lstm_in)
        if h_rest is not None:
            h = concat.concat([h_bar, h_rest], axis=0)
            c = concat.concat([c_bar, c_rest], axis=0)
        else:
            h = h_bar
            c = c_bar
        h_list.append(h_bar)
        c_list.append(c_bar)
    return h, c, h_list, c_list
def classifier(x, train):
    x = A.average_pooling_2d(x, 8)
    x = dropout.dropout(x, train=train)
    x = self.linear(x)
    return x
def _dropout_sequence(xs, dropout_ratio): return [dropout.dropout(x, ratio=dropout_ratio) for x in xs]
def _dropout(x): return dropout(x, ratio=0.4)
def _one_directional_loop(di):
    # di=0, forward GRU
    # di=1, backward GRU
    xs_list = xs_next if di == 0 else reversed(xs_next)
    layer_idx = direction * layer + di
    h = h0[layer_idx]
    # h: d_bar_s_1
    # h_bar: d_s
    h_list = []
    h_bar_list = []
    c_s_list = []
    z_s_list = []
    for x in xs_list:
        batch = x.shape[0]
        if h.shape[0] > batch:
            h, h_rest = split_axis.split_axis(h, [batch], axis=0)
        else:
            h_rest = None
        if layer > 0:
            x = dropout.dropout(x, ratio=dropout_ratio)
        gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx])
        gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx])
        W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1)
        U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1)
        r = sigmoid.sigmoid(W_r_x + U_r_h)
        z = sigmoid.sigmoid(W_z_x + U_z_h)
        h_bar = tanh.tanh(W_x + r * U_x)
        h_bar = (1 - z) * h_bar + z * h
        phi_d = linear.linear(h_bar, W2, B2)
        # (4) unnormalized attention scores between each encoder state phi_ht
        #     and the decoder query phi_d
        u_st = list(
            map(
                lambda x, y: reshape.reshape(
                    (linear.linear(x, reshape.reshape(y, (1, len(y))))),
                    (len(x), )),
                phi_ht, phi_d))
        # (3) normalize the scores into attention weights
        sum_u = list(map(F.sum, u_st))
        alpha_st = list(
            map(lambda x, y: x / F.broadcast_to(y, x.shape), u_st, sum_u))
        z_s = list(map(F.argmax, alpha_st))
        z_s = list(map(lambda x: F.broadcast_to(x, (1, )), z_s))
        z_s = F.concat(z_s, axis=0)
        # (2) context vector: attention-weighted sum of the encoder states ht
        c_s = list(
            map(
                lambda x, y: F.sum(
                    F.broadcast_to(
                        reshape.reshape(x, (x.shape[0], 1)), y.shape) * y,
                    axis=0),
                alpha_st, ht))
        c_s_2d = list(map(lambda x: reshape.reshape(x, (1, len(x))), c_s))
        concat_c_s = F.concat(c_s_2d, axis=0)
        c_s = list(map(lambda x: F.broadcast_to(x, (1, len(x))), c_s))
        c_s = F.concat(c_s, axis=0)
        h = F.relu(
            linear.linear(F.concat([concat_c_s, h_bar], axis=1), W3, B3))
        h_list.append(h)
        h_bar_list.append(h_bar)
        c_s_list.append(c_s)
        z_s_list.append(z_s)
        # restore the full batch rows to handle sequences of different lengths
        if h_rest is not None:
            h = concat.concat([h, h_rest], axis=0)
            h_bar = concat.concat([h_bar, h_rest], axis=0)
    return h_list, h_bar_list, c_s_list, z_s_list
def _dropout(x, train): return dropout(x, ratio=0.4, train=train)
def n_step_lstm(n_layers, dropout_ratio, hx, cx, ws, bs, xs, train=True,
                use_cudnn=True):
    """Stacked Long Short-Term Memory function for sequence inputs.

    This function calculates stacked LSTM with sequences. This function gets an initial hidden state :math:`h_0`, an initial cell state :math:`c_0`, an input sequence :math:`x`, weight matrices :math:`W`, and bias vectors :math:`b`. This function calculates hidden states :math:`h_t` and :math:`c_t` for each time :math:`t` from input :math:`x_t`.

    .. math::

       i_t &= \\sigma(W_0 x_t + W_4 h_{t-1} + b_0 + b_4) \\\\
       f_t &= \\sigma(W_1 x_t + W_5 h_{t-1} + b_1 + b_5) \\\\
       o_t &= \\sigma(W_2 x_t + W_6 h_{t-1} + b_2 + b_6) \\\\
       a_t &= \\tanh(W_3 x_t + W_7 h_{t-1} + b_3 + b_7) \\\\
       c_t &= f_t \\cdot c_{t-1} + i_t \\cdot a_t \\\\
       h_t &= o_t \\cdot \\tanh(c_t)

    As the function accepts a sequence, it calculates :math:`h_t` for all :math:`t` with one call. Eight weight matrices and eight bias vectors are required for each layer. So, when :math:`S` layers exist, you need to prepare :math:`8S` weight matrices and :math:`8S` bias vectors.

    If the number of layers ``n_layers`` is greater than :math:`1`, the input of the ``k``-th layer is the hidden state ``h_t`` of the ``(k-1)``-th layer. Note that all input variables except those of the first layer may have a different shape from the first layer.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is the number of layers and is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N`` is the dimension of the hidden units.
        cx (chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents the weights for the i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` corresponds to ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 4`` is of ``(I, N)`` shape as they are multiplied with input variables. All other matrices have ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represents the biases for the i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` corresponds to ``b_j`` in the equation. The shape of each vector is ``(N,)`` where ``N`` is the dimension of the hidden units.
        xs (list of chainer.Variable): A list of :class:`chainer.Variable` holding input values. Each element ``xs[t]`` holds the input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the mini-batch size for time ``t`` and ``I`` is the size of the input units. Note that this function supports variable-length sequences. When sequences have different lengths, sort them in descending order by length and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transposes a list of :class:`~chainer.Variable` holding sequences. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        train (bool): If ``True``, this function executes dropout.
        use_cudnn (bool): If ``True``, this function uses cuDNN if available.

    Returns:
        tuple: This function returns a tuple containing three elements, ``hy``, ``cy`` and ``ys``.

            - ``hy`` is an updated hidden state whose shape is the same as ``hx``.
            - ``cy`` is an updated cell state whose shape is the same as ``cx``.
            - ``ys`` is a list of :class:`~chainer.Variable`. Each element ``ys[t]`` holds the hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is the mini-batch size for time ``t`` and ``N`` is the size of the hidden units. Note that ``B_t`` is the same as the mini-batch size of ``xs[t]``.

    .. seealso:: :func:`chainer.functions.lstm`

    """
    xp = cuda.get_array_module(hx, hx.data)

    if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \
            _cudnn_version >= 5000:
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(
            itertools.chain((hx, cx), itertools.chain.from_iterable(ws),
                            itertools.chain.from_iterable(bs), xs))
        rnn = NStepLSTM(n_layers, states, train=train)
        ret = rnn(*inputs)
        hy, cy = ret[:2]
        ys = ret[2:]
        return hy, cy, ys
    else:
        hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]
        cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True)
        cx = [reshape.reshape(c, c.shape[1:]) for c in cx]

        xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws]
        hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws]
        xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs]
        hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs]

        ys = []
        for x in xs:
            batch = x.shape[0]
            h_next = []
            c_next = []
            for layer in six.moves.range(n_layers):
                h = hx[layer]
                c = cx[layer]
                if h.shape[0] > batch:
                    h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    c, c_rest = split_axis.split_axis(c, [batch], axis=0)
                else:
                    h_rest = None
                x = dropout.dropout(x, ratio=dropout_ratio, train=train)
                h = dropout.dropout(h, ratio=dropout_ratio, train=train)
                lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \
                    linear.linear(h, hws[layer], hbs[layer])
                c_bar, h_bar = lstm.lstm(c, lstm_in)
                if h_rest is not None:
                    h = concat.concat([h_bar, h_rest], axis=0)
                    c = concat.concat([c_bar, c_rest], axis=0)
                else:
                    h = h_bar
                    c = c_bar
                h_next.append(h)
                c_next.append(c)
                x = h_bar
            hx = h_next
            cx = c_next
            ys.append(x)

        hy = stack.stack(hx)
        cy = stack.stack(cx)
        return hy, cy, tuple(ys)
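# A minimal usage sketch for the non-cuDNN path of the n_step_lstm above,
# assuming plain NumPy inputs wrapped in chainer.Variable. Weight shapes follow
# the linear.linear(x, W, b) convention used in the fallback branch, i.e. each
# matrix is (out_size, in_size); all names and sizes here are illustrative.
import numpy as np
import chainer

n_layers, batch, N, I = 1, 3, 4, 3
rng = np.random.RandomState(0)

hx = chainer.Variable(np.zeros((n_layers, batch, N), dtype=np.float32))
cx = chainer.Variable(np.zeros((n_layers, batch, N), dtype=np.float32))

# Eight weight matrices and eight bias vectors per layer: the first four act on
# the input ((N, I)-shaped here), the last four on the hidden state ((N, N)).
ws = [[chainer.Variable(rng.randn(N, I if j < 4 else N).astype(np.float32))
       for j in range(8)]]
bs = [[chainer.Variable(np.zeros(N, dtype=np.float32)) for _ in range(8)]]

# Transposed variable-length batch: xs[t] holds step t of every sequence that
# is still alive, so the per-step batch sizes are non-increasing (3, 2, 1).
xs = [chainer.Variable(rng.randn(b, I).astype(np.float32)) for b in (3, 2, 1)]

hy, cy, ys = n_step_lstm(n_layers, 0.0, hx, cx, ws, bs, xs, train=False,
                         use_cudnn=False)
print(hy.shape)               # (1, 3, 4)
print([y.shape for y in ys])  # [(3, 4), (2, 4), (1, 4)]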
def n_step_lstm(
        n_layers, dropout_ratio, hx, cx, ws, bs, xs, train=True,
        use_cudnn=True):
    """Stacked Long Short-Term Memory function for sequence inputs.

    This function calculates stacked LSTM with sequences. This function gets an initial hidden state :math:`h_0`, an initial cell state :math:`c_0`, an input sequence :math:`x`, weight matrices :math:`W`, and bias vectors :math:`b`. This function calculates hidden states :math:`h_t` and :math:`c_t` for each time :math:`t` from input :math:`x_t`.

    .. math::

       i_t &= \\sigma(W_0 x_t + W_4 h_{t-1} + b_0 + b_4) \\\\
       f_t &= \\sigma(W_1 x_t + W_5 h_{t-1} + b_1 + b_5) \\\\
       o_t &= \\sigma(W_2 x_t + W_6 h_{t-1} + b_2 + b_6) \\\\
       a_t &= \\tanh(W_3 x_t + W_7 h_{t-1} + b_3 + b_7) \\\\
       c_t &= f_t \\cdot c_{t-1} + i_t \\cdot a_t \\\\
       h_t &= o_t \\cdot \\tanh(c_t)

    As the function accepts a sequence, it calculates :math:`h_t` for all :math:`t` with one call. Eight weight matrices and eight bias vectors are required for each layer. So, when :math:`S` layers exist, you need to prepare :math:`8S` weight matrices and :math:`8S` bias vectors.

    If the number of layers ``n_layers`` is greater than :math:`1`, the input of the ``k``-th layer is the hidden state ``h_t`` of the ``(k-1)``-th layer. Note that all input variables except those of the first layer may have a different shape from the first layer.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is the number of layers and is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N`` is the dimension of the hidden units.
        cx (chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents the weights for the i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` corresponds to ``W_j`` in the equation. Only ``ws[0][j]`` where ``0 <= j < 4`` is of ``(I, N)`` shape as they are multiplied with input variables. All other matrices have ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represents the biases for the i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` corresponds to ``b_j`` in the equation. The shape of each vector is ``(N,)`` where ``N`` is the dimension of the hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds the input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the mini-batch size for time ``t`` and ``I`` is the size of the input units. Note that this function supports variable-length sequences. When sequences have different lengths, sort them in descending order by length and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transposes a list of :class:`~chainer.Variable` holding sequences. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        train (bool): If ``True``, this function executes dropout.
        use_cudnn (bool): If ``True``, this function uses cuDNN if available.

    Returns:
        tuple: This function returns a tuple containing three elements, ``hy``, ``cy`` and ``ys``.

            - ``hy`` is an updated hidden state whose shape is the same as ``hx``.
            - ``cy`` is an updated cell state whose shape is the same as ``cx``.
            - ``ys`` is a list of :class:`~chainer.Variable`. Each element ``ys[t]`` holds the hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is the mini-batch size for time ``t`` and ``N`` is the size of the hidden units. Note that ``B_t`` is the same as the mini-batch size of ``xs[t]``.

    .. seealso:: :func:`chainer.functions.lstm`

    """
    xp = cuda.get_array_module(hx, hx.data)

    if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \
            _cudnn_version >= 5000:
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, cx), itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs), xs))
        rnn = NStepLSTM(n_layers, states, train=train)
        ret = rnn(*inputs)
        hy, cy = ret[:2]
        ys = ret[2:]
        return hy, cy, ys
    else:
        hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]
        cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True)
        cx = [reshape.reshape(c, c.shape[1:]) for c in cx]

        xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws]
        hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws]
        xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs]
        hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs]

        ys = []
        for x in xs:
            batch = x.shape[0]
            h_next = []
            c_next = []
            for layer in six.moves.range(n_layers):
                h = hx[layer]
                c = cx[layer]
                if h.shape[0] > batch:
                    h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    c, c_rest = split_axis.split_axis(c, [batch], axis=0)
                else:
                    h_rest = None
                x = dropout.dropout(x, ratio=dropout_ratio, train=train)
                h = dropout.dropout(h, ratio=dropout_ratio, train=train)
                lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \
                    linear.linear(h, hws[layer], hbs[layer])
                c_bar, h_bar = lstm.lstm(c, lstm_in)
                if h_rest is not None:
                    h = concat.concat([h_bar, h_rest], axis=0)
                    c = concat.concat([c_bar, c_rest], axis=0)
                else:
                    h = h_bar
                    c = c_bar
                h_next.append(h)
                c_next.append(c)
                x = h_bar
            hx = h_next
            cx = c_next
            ys.append(x)

        hy = stack.stack(hx)
        cy = stack.stack(cx)
        return hy, cy, tuple(ys)
def fixed_length_n_step_lstm(
        n_layers, dropout_ratio, hx, cx, ws, bs, xs, train=True,
):
    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and cuda.cudnn_enabled and _cudnn_version >= 5000:
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(
            itertools.chain((hx, cx), itertools.chain.from_iterable(ws),
                            itertools.chain.from_iterable(bs), (xs, )))
        rnn = FixedLengthNStepLSTMFunction(n_layers, states, train=train)
        ret = rnn(*inputs)
        hy, cy, ys = ret
        _, batch_size, dim = hy.shape
        ys_reshape = F.reshape(ys, (-1, batch_size, dim))  # (length, batch, dim)
        return hy, cy, ys_reshape
    else:
        hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]
        cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True)
        cx = [reshape.reshape(c, c.shape[1:]) for c in cx]

        xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws]
        hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws]
        xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs]
        hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs]

        ys = []
        for x in xs:
            batch = x.shape[0]
            h_next = []
            c_next = []
            for layer in six.moves.range(n_layers):
                h = hx[layer]
                c = cx[layer]
                if h.shape[0] > batch:
                    h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    c, c_rest = split_axis.split_axis(c, [batch], axis=0)
                else:
                    h_rest = None
                x = dropout.dropout(x, ratio=dropout_ratio)
                h = dropout.dropout(h, ratio=dropout_ratio)
                lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \
                    linear.linear(h, hws[layer], hbs[layer])
                c_bar, h_bar = lstm.lstm(c, lstm_in)
                if h_rest is not None:
                    h = concat.concat([h_bar, h_rest], axis=0)
                    c = concat.concat([c_bar, c_rest], axis=0)
                else:
                    h = h_bar
                    c = c_bar
                h_next.append(h)
                c_next.append(c)
                x = h_bar
            hx = h_next
            cx = c_next
            ys.append(x)

        hy = stack.stack(hx)
        cy = stack.stack(cx)
        # return hy, cy, tuple(ys)
        ys_concat = F.concat(ys, axis=0)
        ys_reshape = F.reshape(
            ys_concat, (-1, ys[0].shape[0], ys[0].shape[1]))  # (length, batch, dim)
        return hy, cy, ys_reshape