def _gru(x, h, w, b, with_bias):
    """GRU cell.

    Args:
        x (:obj:`~nnabla.Variable`): Input data.
        h (:obj:`~nnabla.Variable`): Hidden state.
        w (:obj:`~nnabla.Variable`): Weight.
        b (:obj:`~nnabla.Variable`): Bias.
        with_bias (bool): Include the bias or not.

    """
    hidden_size = h.shape[1]
    xh = F.concatenate(*(x, h), axis=1)
    w0, w1, w2 = F.split(w, axis=0)
    b0 = b1 = b2 = b3 = None
    if with_bias:
        b0, b1, b2, b3 = F.split(b, axis=0)
    r_t = F.sigmoid(F.affine(xh, F.transpose(w0, (1, 0)), b0))
    z_t = F.sigmoid(F.affine(xh, F.transpose(w1, (1, 0)), b1))
    w2_0 = w2[:, :w2.shape[1] - hidden_size]
    w2_1 = w2[:, w2.shape[1] - hidden_size:]
    n_t = F.tanh(F.affine(x, F.transpose(w2_0, (1, 0)), b2) +
                 r_t * F.affine(h, F.transpose(w2_1, (1, 0)), b3))
    h_t = (1 - z_t) * n_t + z_t * h
    return h_t

def simple_rnn(inputs: nn.Variable, units: int, mask: Optional[nn.Variable] = None,
               return_sequences: bool = False, fix_parameters=False) -> nn.Variable:
    '''
    A vanilla recurrent neural network layer.

    Args:
        inputs (nnabla.Variable): A shape of [batch_size, length, embedding_size].
        units (int): Dimensionality of the output space.
        mask (nnabla.Variable): A shape of [batch_size, length, 1].
        return_sequences (bool): Whether to return the last output
            in the output sequence, or the full sequence.
        fix_parameters (bool): Fix parameters (Set need_grad=False).

    Returns:
        nn.Variable: A shape [batch_size, length, units] or
        nn.Variable: A shape [batch_size, units].
    '''
    hs = []
    batch_size, length, embedding_size = inputs.shape
    h0 = F.constant(0, shape=(batch_size, units))

    h = h0
    if mask is None:
        mask = F.constant(1, shape=(batch_size, length, 1))
    for x, cond in zip(F.split(inputs, axis=1), F.split(mask, axis=1)):
        h_t = F.tanh(PF.affine(F.concatenate(x, h, axis=1), units,
                               fix_parameters=fix_parameters))
        h = where(cond, h_t, h)
        hs.append(h)

    if return_sequences:
        hs = F.stack(*hs, axis=1)
        return hs
    else:
        return hs[-1]

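# `where` above is a masking helper that is not defined in these snippets. A
# minimal sketch under the assumption that `cond` is a {0, 1} mask of shape
# (batch_size, 1) selecting between the updated and the previous state:
import nnabla.functions as F

def where(cond, x_true, x_false):
    # Broadcast the mask over the hidden dimension, then blend the two states.
    cond = F.broadcast(cond, x_true.shape)
    return cond * x_true + (1.0 - cond) * x_false
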
def test_graph_more_than_2_outputs(seed, clear_buffer):
    count = 0

    def func_hook(f):
        nonlocal count
        if f.name == 'Split':
            count += 1

    nn.clear_parameters()

    # forward_all should execute the Split function only once for all 10 outputs.
    a = nn.Variable.from_numpy_array(np.ones((10, )))
    b = nn.Variable.from_numpy_array(np.ones((10, )))
    c = F.add2(a, b, inplace=True, outputs=[a.data])
    y = F.split(c, axis=0)
    nn.forward_all(y, function_pre_hook=func_hook)
    assert count == 1

    res = [x.d for x in y]
    assert_allclose(res, [2.0] * 10)

    # Forwarding each output separately re-executes the in-place Add2 once per
    # output, so the accumulated value becomes 1 + 10 * 1 = 11.
    a = nn.Variable.from_numpy_array(np.ones((10, )))
    b = nn.Variable.from_numpy_array(np.ones((10, )))
    c = F.add2(a, b, inplace=True, outputs=[a.data])
    y = F.split(c, axis=0)
    for yy in y:
        yy.forward()
    res = [x.d for x in y]
    assert_allclose(res, [11.0] * 10)

def get_loss(l1, l2, x, t, w_init, b_init, num_words, batch_size, state_size,
             dropout=False, dropout_rate=0.5, embed_name='embed',
             pred_name='pred'):
    e_list = [PF.embed(x_elm, num_words, state_size, name=embed_name)
              for x_elm in F.split(x, axis=1)]
    t_list = F.split(t, axis=1)
    loss = 0
    for i, (e_t, t_t) in enumerate(zip(e_list, t_list)):
        if dropout:
            h1 = l1(F.dropout(e_t, dropout_rate), w_init, b_init)
            h2 = l2(F.dropout(h1, dropout_rate), w_init, b_init)
            y = PF.affine(F.dropout(h2, dropout_rate),
                          num_words, name=pred_name)
        else:
            h1 = l1(e_t, w_init, b_init)
            h2 = l2(h1, w_init, b_init)
            y = PF.affine(h2, num_words, name=pred_name)
        t_t = F.reshape(t_t, [batch_size, 1])
        loss += F.mean(F.softmax_cross_entropy(y, t_t))
    loss /= float(i + 1)
    return loss

def lstm(inputs: nn.Variable, units: int, mask: Optional[nn.Variable] = None,
         initial_state: Tuple[nn.Variable, nn.Variable] = None,
         return_sequences: bool = False, return_state: bool = False,
         fix_parameters: bool = False) -> nn.Variable:
    '''
    A long short-term memory layer.

    Args:
        inputs (nnabla.Variable): A shape of [batch_size, length, embedding_size].
        units (int): Dimensionality of the output space.
        mask (nnabla.Variable): A shape of [batch_size, length, 1].
        initial_state ([nnabla.Variable, nnabla.Variable]): A tuple of an initial cell
            and an initial hidden state.
        return_sequences (bool): Whether to return the last output
            in the output sequence, or the full sequence.
        return_state (bool): Whether to return the last state, which consists of
            the cell and the hidden state.
        fix_parameters (bool): Fix parameters (Set need_grad=False).

    Returns:
        nn.Variable: A shape [batch_size, length, units] or
        nn.Variable: A shape [batch_size, units].
    '''
    batch_size, length, embedding_size = inputs.shape

    if initial_state is None:
        c0 = F.constant(0, shape=(batch_size, units))
        h0 = F.constant(0, shape=(batch_size, units))
    else:
        assert type(initial_state) is tuple or type(initial_state) is list, \
            'initial_state must be a tuple or a list.'
        assert len(initial_state) == 2, \
            'initial_state must have only two states.'

        c0, h0 = initial_state

        assert c0.shape == h0.shape, 'shapes of initial_state must be the same.'
        assert c0.shape[0] == batch_size, \
            'batch size of initial_state ({0}) is different from that of inputs ({1}).'.format(
                c0.shape[0], batch_size)
        assert c0.shape[1] == units, \
            'units size of initial_state ({0}) is different from that of units of args ({1}).'.format(
                c0.shape[1], units)

    cell = c0
    hidden = h0

    hs = []
    if mask is None:
        mask = F.constant(1, shape=(batch_size, length, 1))
    for x, cond in zip(F.split(inputs, axis=1), F.split(mask, axis=1)):
        cell_t, hidden_t = lstm_cell(x, cell, hidden)
        cell = where(cond, cell_t, cell)
        hidden = where(cond, hidden_t, hidden)
        hs.append(hidden)

    if return_sequences:
        ret = F.stack(*hs, axis=1)
    else:
        ret = hs[-1]

    if return_state:
        return ret, cell, hidden
    else:
        return ret

def top_k_error(target_action, target_action_type, target_action_mask,
                rule_prob, terminal_gen_action_prob, token_prob, copy_prob,
                k=5):
    batch_size, max_action_length, _ = target_action.shape
    _, _, rule_num = rule_prob.shape
    _, _, token_num = token_prob.shape
    _, _, max_query_length = copy_prob.shape

    # (batch_size, max_action_length)
    rule_mask, token_mask, copy_mask = F.split(target_action_type, axis=2)

    # (batch_size, max_action_length)
    target_rule, target_token, target_copy = F.split(target_action, axis=2)
    target_rule = F.reshape(target_rule, (batch_size, max_action_length, 1))

    # (batch_size, max_action_length)
    gen_token_prob, copy_token_prob = F.split(terminal_gen_action_prob, axis=2)
    gen_token_prob = F.reshape(gen_token_prob,
                               (batch_size, max_action_length, 1))
    gen_token_prob = F.broadcast(gen_token_prob,
                                 (batch_size, max_action_length, token_num))
    copy_token_prob = F.reshape(copy_token_prob,
                                (batch_size, max_action_length, 1))
    copy_token_prob = F.broadcast(
        copy_token_prob, (batch_size, max_action_length, max_query_length))

    # (batch_size, max_action_length, token_num)
    token_prob = gen_token_prob * token_prob
    # (batch_size, max_action_length, max_query_length)
    copy_prob = copy_token_prob * copy_prob
    # (batch_size, max_action_length, token_num + max_query_length)
    gen_or_copy = F.concatenate(token_prob, copy_prob, axis=2)

    # (batch_size, max_action_length)
    token_label = token_mask * target_token + \
        (copy_mask * (target_copy + token_num))
    token_label = F.reshape(token_label, (batch_size, max_action_length, 1))

    # (batch_size, max_action_length, 1)
    rule_err = F.top_n_error(rule_prob, target_rule, axis=2, n=k)
    rule_err = F.reshape(rule_err, (batch_size, max_action_length))
    # (batch_size, max_action_length, 1)
    token_err = F.top_n_error(gen_or_copy, token_label, axis=2, n=k)
    token_err = F.reshape(token_err, (batch_size, max_action_length))

    # (batch_size, max_action_length)
    err = rule_mask * rule_err + (token_mask + copy_mask) * token_err

    # (batch_size,)
    num = F.sum(rule_mask, axis=1) + F.sum(token_mask, axis=1) + \
        F.sum(copy_mask, axis=1)
    # (batch_size,)
    err = F.sum(err, axis=1)
    # (batch_size,)
    err = err / (num + 1e-7)
    return F.mean(err)

def TimeDistributedSoftmaxCrossEntropy(y_pred, y_true):
    '''
    A time distributed softmax cross entropy.

    Args:
        y_pred (nnabla.Variable): A shape of [B, SentenceLength, O]. # one-hot
        y_true (nnabla.Variable): A shape of [B, SentenceLength, 1]. # index

    Returns:
        nn.Variable: A shape [B, SentenceLength].
    '''
    ret = []
    for y_p, y_t in zip(F.split(y_pred, axis=1), F.split(y_true, axis=1)):
        ret.append(F.softmax_cross_entropy(y_p, y_t))
    return F.concatenate(*ret)

def time_distributed_softmax_cross_entropy(y_pred: nn.Variable,
                                           y_true: nn.Variable) -> nn.Variable:
    '''
    A time distributed softmax cross entropy.

    Args:
        y_pred (nnabla.Variable): A shape of (batch_size, length, number_of_outputs). # one-hot
        y_true (nnabla.Variable): A shape of (batch_size, length, 1). # index

    Returns:
        nn.Variable: A shape (batch_size, length).
    '''
    ret = []
    for y_p, y_t in zip(F.split(y_pred, axis=1), F.split(y_true, axis=1)):
        ret.append(F.softmax_cross_entropy(y_p, y_t))
    return F.concatenate(*ret)

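# A minimal usage sketch for the loss above; shapes follow the docstring and
# the values are arbitrary.
import numpy as np
import nnabla as nn
import nnabla.functions as F

y_pred = nn.Variable.from_numpy_array(
    np.random.randn(4, 7, 10).astype(np.float32))   # (batch_size, length, number_of_outputs)
y_true = nn.Variable.from_numpy_array(
    np.random.randint(0, 10, size=(4, 7, 1)))        # (batch_size, length, 1) indices
loss = F.mean(time_distributed_softmax_cross_entropy(y_pred, y_true))
loss.forward()  # loss.d holds the averaged cross entropy
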
def lab2rgb(input):
    input_trans = F.split(input, axis=1)
    L, a, b = F.split(input, axis=1)
    y = (L + 16.0) / 116.0
    x = (a / 500.0) + y
    z = y - (b / 200.0)
    neg_mask = F.less_scalar(z, 0).apply(need_grad=False)
    z = z * F.logical_not(neg_mask)
    mask_Y = F.greater_scalar(y, 0.2068966).apply(need_grad=False)
    mask_X = F.greater_scalar(x, 0.2068966).apply(need_grad=False)
    mask_Z = F.greater_scalar(z, 0.2068966).apply(need_grad=False)
    Y_1 = (y ** 3) * mask_Y
    Y_2 = L / (116. * 7.787) * F.logical_not(mask_Y)
    var_Y = Y_1 + Y_2
    X_1 = (x ** 3) * mask_X
    X_2 = (x - 16. / 116.) / 7.787 * F.logical_not(mask_X)
    var_X = X_1 + X_2
    Z_1 = (z ** 3) * mask_Z
    Z_2 = (z - 16. / 116.) / 7.787 * F.logical_not(mask_Z)
    var_Z = Z_1 + Z_2
    X = 0.95047 * var_X
    Y = 1.00000 * var_Y
    Z = 1.08883 * var_Z
    var_R = X * 3.2406 + Y * -1.5372 + Z * -0.4986
    var_G = X * -0.9689 + Y * 1.8758 + Z * 0.0415
    var_B = X * 0.0557 + Y * -0.2040 + Z * 1.0570
    mask_R = F.greater_scalar(var_R, 0.0031308).apply(need_grad=False)
    n_mask_R = F.logical_not(mask_R)
    R_1 = (1.055 * (F.maximum2(var_R, n_mask_R) ** (1 / 2.4)) - 0.055) * mask_R
    R_2 = (12.92 * var_R) * n_mask_R
    var_R = R_1 + R_2
    mask_G = F.greater_scalar(var_G, 0.0031308).apply(need_grad=False)
    n_mask_G = F.logical_not(mask_G)
    G_1 = (1.055 * (F.maximum2(var_G, n_mask_G) ** (1 / 2.4)) - 0.055) * mask_G
    G_2 = (12.92 * var_G) * n_mask_G
    var_G = G_1 + G_2
    mask_B = F.greater_scalar(var_B, 0.0031308).apply(need_grad=False)
    n_mask_B = F.logical_not(mask_B)
    B_1 = (1.055 * (F.maximum2(var_B, n_mask_B) ** (1 / 2.4)) - 0.055) * mask_B
    B_2 = (12.92 * var_B) * n_mask_B
    var_B = B_1 + B_2
    return F.stack(var_R, var_G, var_B, axis=1)

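# A minimal usage sketch for lab2rgb, assuming an NCHW tensor whose channel
# axis holds (L, a, b); the values here are arbitrary.
import numpy as np
import nnabla as nn

lab = nn.Variable.from_numpy_array(
    np.array([[[[50.0]], [[10.0]], [[-20.0]]]], dtype=np.float32))  # (1, 3, 1, 1)
rgb = lab2rgb(lab)
rgb.forward()  # rgb.d holds the converted R, G, B channels, shape (1, 3, 1, 1)
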
def lstm(x, h, c, w, b, with_bias):
    hidden_size = h.shape[1]
    xh = F.concatenate(*(x, h), axis=1)
    w0, w1, w2, w3 = F.split(w, axis=0)
    b0 = b1 = b2 = b3 = None
    if with_bias:
        b0, b1, b2, b3 = F.split(b, axis=0)
    i_t = F.affine(xh, F.transpose(w0, (1, 0)), b0)
    f_t = F.affine(xh, F.transpose(w1, (1, 0)), b1)
    g_t = F.affine(xh, F.transpose(w2, (1, 0)), b2)
    o_t = F.affine(xh, F.transpose(w3, (1, 0)), b3)
    c_t = F.sigmoid(f_t) * c + F.sigmoid(i_t) * F.tanh(g_t)
    h_t = F.sigmoid(o_t) * F.tanh(c_t)
    return h_t, c_t

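# A minimal usage sketch for the cell above. From the splits and affines, with
# input size I and hidden size H, `w` is expected to be (4, H, I + H) and `b`
# (4, H); the values here are random.
import numpy as np
import nnabla as nn

B, isize, hsize = 2, 5, 3
x = nn.Variable.from_numpy_array(np.random.randn(B, isize).astype(np.float32))
h = nn.Variable.from_numpy_array(np.zeros((B, hsize), dtype=np.float32))
c = nn.Variable.from_numpy_array(np.zeros((B, hsize), dtype=np.float32))
w = nn.Variable.from_numpy_array(
    np.random.randn(4, hsize, isize + hsize).astype(np.float32))
b = nn.Variable.from_numpy_array(np.zeros((4, hsize), dtype=np.float32))
h_t, c_t = lstm(x, h, c, w, b, with_bias=True)
h_t.forward()  # h_t.d and c_t.d both have shape (B, hsize)
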
def loss(target_action, target_action_type, target_action_mask, rule_prob,
         terminal_gen_action_prob, token_prob, copy_prob):
    batch_size, max_action_length, _ = target_action.shape
    _, _, rule_num = rule_prob.shape
    _, _, token_num = token_prob.shape
    _, _, max_query_length = copy_prob.shape

    # (batch_size, max_action_length)
    target_rule, target_token, target_copy = F.split(target_action, axis=2)

    target_rule = F.reshape(target_rule, (batch_size, max_action_length, 1))
    target_rule = F.one_hot(
        target_rule, (rule_num, ))  # (batch_size, max_action_length, rule_num)
    rule_tgt_prob = rule_prob * target_rule  # (batch_size, max_action_length, rule_num)
    rule_tgt_prob = F.sum(rule_tgt_prob, axis=2)  # (batch_size, max_action_length)

    target_token = F.reshape(target_token, (batch_size, max_action_length, 1))
    target_token = F.one_hot(
        target_token, (token_num, ))  # (batch_size, max_action_length, token_num)
    token_tgt_prob = token_prob * target_token  # (batch_size, max_action_length, token_num)
    token_tgt_prob = F.sum(token_tgt_prob, axis=2)  # (batch_size, max_action_length)

    target_copy = F.reshape(target_copy, (batch_size, max_action_length, 1))
    target_copy = F.one_hot(
        target_copy, (max_query_length, ))  # (batch_size, max_action_length, max_query_length)
    copy_tgt_prob = copy_prob * target_copy  # (batch_size, max_action_length, max_query_length)
    copy_tgt_prob = F.sum(copy_tgt_prob, axis=2)  # (batch_size, max_action_length)

    # (batch_size, max_action_length)
    gen_token_prob, copy_token_prob = F.split(terminal_gen_action_prob, axis=2)
    # (batch_size, max_action_length)
    rule_mask, token_mask, copy_mask = F.split(target_action_type, axis=2)

    # (batch_size, max_action_length)
    target_prob = rule_mask * rule_tgt_prob + \
        token_mask * gen_token_prob * token_tgt_prob + \
        copy_mask * copy_token_prob * copy_tgt_prob
    # (batch_size, max_action_length)
    likelihood = F.log(target_prob + 1e-7)
    loss = -likelihood * target_action_mask
    # (batch_size,)
    loss = F.sum(loss, axis=1)
    return F.mean(loss)

def simple_rnn(inputs, units, return_sequences=False, fix_parameters=False):
    '''
    A vanilla recurrent neural network layer.

    Args:
        inputs (nnabla.Variable): A shape of [B, SentenceLength, EmbeddingSize].
        units (int): Dimensionality of the output space.
        return_sequences (bool): Whether to return the last output
            in the output sequence, or the full sequence.
        fix_parameters (bool): Fix parameters (Set need_grad=False).

    Returns:
        nn.Variable: A shape [B, SentenceLength, units] or
        nn.Variable: A shape [B, units].
    '''
    hs = []
    batch_size = inputs.shape[0]
    sentence_length = inputs.shape[1]

    h0 = nn.Variable.from_numpy_array(np.zeros((batch_size, units)))

    inputs = F.split(inputs, axis=1)  # split in the direction of sequence

    h = h0
    for x in inputs:
        h = F.tanh(PF.affine(F.concatenate(x, h, axis=1), units,
                             fix_parameters=fix_parameters))
        hs.append(h)

    if return_sequences:
        hs = F.stack(*hs, axis=1)
        return hs
    else:
        return hs[-1]

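# A minimal usage sketch for the layer above on embedded input of shape
# (batch_size, sentence_length, embedding_size); parameters are created by
# PF.affine on the first call.
import numpy as np
import nnabla as nn

x = nn.Variable.from_numpy_array(np.random.randn(4, 7, 16).astype(np.float32))
h_seq = simple_rnn(x, units=32, return_sequences=True)  # (4, 7, 32)
h_seq.forward()
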
def yolov2_image_coordinate(t_xy, t_wh, biases):
    import numpy as np
    from nnabla.parameter import pop_parameter, set_parameter
    h, w = t_xy.shape[-2:]
    xs = pop_parameter('xs')
    ys = pop_parameter('ys')
    if xs is None or (w != xs.shape[-1]):
        xs = nn.Variable.from_numpy_array(np.arange(w).reshape(1, 1, 1, -1))
        xs.need_grad = False
        set_parameter('xs', xs)
    if ys is None or (h != ys.shape[-2]):
        ys = nn.Variable.from_numpy_array(np.arange(h).reshape(1, 1, -1, 1))
        ys.need_grad = False
        set_parameter('ys', ys)
    t_x, t_y = F.split(t_xy, axis=2)
    oshape = list(t_x.shape)
    oshape.insert(2, 1)
    t_x = F.reshape((t_x + xs) / w, oshape)
    t_y = F.reshape((t_y + ys) / h, oshape)
    pop_parameter('biases')
    biases = biases.reshape(1, biases.shape[0], biases.shape[1], 1, 1) / \
        np.array([w, h]).reshape(1, 1, 2, 1, 1)
    b = nn.Variable.from_numpy_array(biases)
    b.need_grad = False
    set_parameter('biases', b)
    t_wh = t_wh * b
    return t_x, t_y, t_wh

def make_symmetric_matrix(_x):
    # input
    #   _x : type=nn.Variable(), _x.shape=(batch_size, *, *, *)
    # output
    #   j_vector : type=nn.Variable(), j_vector.shape=(batch_size, batch_size - 1, *, *, *)
    batch_size = _x.shape[0]
    var_list = F.split(_x)
    concat_list = []
    # --- split & gather components ---
    for i in range(batch_size):
        tmp_list = []
        for j in range(batch_size):
            if i != j:
                tmp_list.append(
                    F.reshape(var_list[j], [1, ] + list(var_list[j].shape)))
        if len(tmp_list) > 1:
            concat_var = F.concatenate(*tmp_list, axis=0)
        else:
            concat_var = tmp_list[0]
        concat_list.append(
            F.reshape(concat_var, [1, ] + list(concat_var.shape)))
    # --- concatenate ---
    j_vector = F.concatenate(*concat_list, axis=0)
    return j_vector

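# Shape sketch for make_symmetric_matrix: for every sample i the output gathers
# all other samples j != i, so a (4, 2, 2, 2) input yields a (4, 3, 2, 2, 2)
# output; the values here are random.
import numpy as np
import nnabla as nn

x = nn.Variable.from_numpy_array(np.random.randn(4, 2, 2, 2).astype(np.float32))
j_vector = make_symmetric_matrix(x)
print(j_vector.shape)  # (4, 3, 2, 2, 2)
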
def create_fixed_length_rnn(xs0, h0, w0, w, b, num_layers, nonlinearity,
                            num_directions, with_bias):
    # xs : [T, B, I]
    # h0 : [L, D, B, H]
    # c0 : [L, D, B, H]
    # w0 : [D, H, I+H]
    # w : [L-1, D, H, D * H + H]
    # b : [L, D, H]

    batch_size = xs0.shape[1]
    hidden_size = h0.shape[3]

    if xs0.shape[0] == 1:
        xs = [xs0[0]]
    else:
        xs = F.split(xs0, axis=0)
    hn = []
    for i in range(num_layers):
        wi = w0
        if i > 0:
            wi = w[i - 1]
        # wi : [D, H, ?]
        # Forward direction
        hif = h0[i, 0]  # [B, H]
        wif = wi[0]
        bif = None
        if with_bias:
            bif = b[i, 0]
        hs = []
        for j, x in enumerate(xs):
            # x : [B, I]
            hif = rnn(x, hif, wif, bif, nonlinearity, with_bias)
            hs.append(hif)
        hn.append(hif)

        if num_directions == 1:
            xs = hs
            continue

        # Backward direction
        hib = h0[i, 1]  # [B, H]
        wib = wi[1]
        bib = None
        if with_bias:
            bib = b[i, 1]
        for k, x in enumerate(reversed(xs)):
            j = len(xs) - 1 - k
            # x : [B, I]
            hib = rnn(x, hib, wib, bib, nonlinearity, with_bias)
            hs[j] = F.concatenate(hs[j], hib, axis=1)
        hn.append(hib)
        xs = hs

    ys = xs  # list of [B, HD]
    ys = F.stack(*ys, axis=0)  # [T, B, HD]
    # LD list of [B, H] --> [L, D, B, H]
    hn = F.reshape(F.stack(*hn, axis=0),
                   (num_layers, num_directions, batch_size, hidden_size))
    return ys, hn

def gru(x, h, w, b, with_bias):
    hidden_size = h.shape[1]
    xh = F.concatenate(*(x, h), axis=1)
    w0, w1, w2 = F.split(w, axis=0)
    b0 = b1 = b2 = b3 = None
    if with_bias:
        b0, b1, b2, b3 = F.split(b, axis=0)
    r_t = F.sigmoid(F.affine(xh, F.transpose(w0, (1, 0)), b0))
    z_t = F.sigmoid(F.affine(xh, F.transpose(w1, (1, 0)), b1))
    w2_0 = w2[:, :w2.shape[1] - hidden_size]
    w2_1 = w2[:, w2.shape[1] - hidden_size:]
    n_t = F.tanh(F.affine(x, F.transpose(w2_0, (1, 0)), b2) +
                 r_t * F.affine(h, F.transpose(w2_1, (1, 0)), b3))
    h_t = (1 - z_t) * n_t + z_t * h
    return h_t

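# A minimal usage sketch for the GRU cell above. With input size I and hidden
# size H, `w` is expected to be (3, H, I + H) and `b` (4, H) (the candidate
# state uses two of the four biases); the values here are random.
import numpy as np
import nnabla as nn

B, isize, hsize = 2, 5, 3
x = nn.Variable.from_numpy_array(np.random.randn(B, isize).astype(np.float32))
h = nn.Variable.from_numpy_array(np.zeros((B, hsize), dtype=np.float32))
w = nn.Variable.from_numpy_array(
    np.random.randn(3, hsize, isize + hsize).astype(np.float32))
b = nn.Variable.from_numpy_array(np.zeros((4, hsize), dtype=np.float32))
h_t = gru(x, h, w, b, with_bias=True)
h_t.forward()  # (B, hsize)
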
def split(x, axis=0):
    if x.shape[axis] == 1:
        s = list(x.shape)
        s.pop(axis)
        x = F.broadcast(x, x.shape)
        return [F.reshape(x, s)]
    else:
        return F.split(x, axis=axis)

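# Usage sketch for the wrapper above: it always returns a Python list/tuple of
# variables with the split axis removed, even when that axis has length 1.
import numpy as np
import nnabla as nn

a = nn.Variable.from_numpy_array(np.ones((1, 3), dtype=np.float32))
b = nn.Variable.from_numpy_array(np.ones((2, 3), dtype=np.float32))
print(len(split(a, axis=0)))  # 1, the single element has shape (3,)
print(len(split(b, axis=0)))  # 2, each element has shape (3,)
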
def time_distributed_func(x, *args, **kwargs):
    ret = []
    batch_size = x.shape[0]
    for x_ in F.split(x, axis=1):
        value = func(x_, *args, **kwargs)
        _, output_dim = value.shape
        ret.append(F.reshape(value, (batch_size, 1, output_dim)))
    return F.concatenate(*ret, axis=1)

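# `func` above is a free variable, so this closure is presumably the body of a
# `time_distributed(func)` wrapper like the one applied to PF.affine in
# multihead_attention below. A minimal sketch of such a wrapper (an assumption;
# the actual definition is not shown in these snippets):
import nnabla.functions as F

def time_distributed(func):
    def _apply_per_step(x, *args, **kwargs):
        # Apply `func` independently to every step along axis 1, then restack.
        ret = []
        batch_size = x.shape[0]
        for x_ in F.split(x, axis=1):
            value = func(x_, *args, **kwargs)
            _, output_dim = value.shape
            ret.append(F.reshape(value, (batch_size, 1, output_dim)))
        return F.concatenate(*ret, axis=1)
    return _apply_per_step
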
def multihead_attention(query: nn.Variable, key: nn.Variable, value: nn.Variable,
                        h: int, mask=None, train: bool = True,
                        dropout_ratio: float = 0.1):
    batch_size, sentence_length_query, embedding_size = query.shape
    batch_size, sentence_length_memory, embedding_size = key.shape
    assert embedding_size % h == 0

    q = query
    k = key
    v = value

    dim = embedding_size // h
    with nn.parameter_scope('q_dense'):
        q = time_distributed(PF.affine)(q, embedding_size)
    with nn.parameter_scope('k_dense'):
        k = time_distributed(PF.affine)(k, embedding_size)
    with nn.parameter_scope('v_dense'):
        v = time_distributed(PF.affine)(v, embedding_size)

    q = F.reshape(q, shape=(batch_size, h, sentence_length_query, dim))
    k = F.reshape(k, shape=(batch_size, h, sentence_length_memory, dim))
    v = F.reshape(v, shape=(batch_size, h, sentence_length_memory, dim))

    ret = []
    # for h times
    for _q, _k, _v in zip(F.split(q, axis=1), F.split(k, axis=1), F.split(v, axis=1)):
        ret.append(attention(_q, _k, _v, mask=mask,
                             train=train, dropout_ratio=dropout_ratio))
    x = F.concatenate(*ret, axis=2)

    with nn.parameter_scope('concat_dense'):
        x = time_distributed(PF.affine)(x, embedding_size)
    return x

def guided_filter(img, r, eps):
    """Edge preserving filter."""
    img2 = F.concatenate(img, img * img, axis=3)
    img2 = box_filter(img2, r)
    mean = F.split(img2, axis=3)
    mean_i = F.stack(mean[0], mean[1], mean[2], axis=3)
    mean_ii = F.stack(mean[3], mean[4], mean[5], axis=3)
    var_i = mean_ii - mean_i * mean_i
    a = var_i / (var_i + eps)
    b = mean_i - a * mean_i
    ab = F.concatenate(a, b, axis=3)
    ab = box_filter(ab, r)
    mean_ab = F.split(ab, axis=3)
    mean_a = F.stack(mean_ab[0], mean_ab[1], mean_ab[2], axis=3)
    mean_b = F.stack(mean_ab[3], mean_ab[4], mean_ab[5], axis=3)
    q = mean_a * img + mean_b
    return q

def lstm(inputs, units, initial_state=None, return_sequences=False,
         return_state=False, fix_parameters=False):
    '''
    A long short-term memory layer.

    Args:
        inputs (nnabla.Variable): A shape of [B, SentenceLength, EmbeddingSize].
        units (int): Dimensionality of the output space.
        initial_state ([nnabla.Variable, nnabla.Variable]): A tuple of an initial cell
            and an initial hidden state.
        return_sequences (bool): Whether to return the last output
            in the output sequence, or the full sequence.
        return_state (bool): Whether to return the last state, which consists of
            the cell and the hidden state.
        fix_parameters (bool): Fix parameters (Set need_grad=False).

    Returns:
        nn.Variable: A shape [B, SentenceLength, units] or
        nn.Variable: A shape [B, units].
    '''
    batch_size = inputs.shape[0]

    if initial_state is None:
        c0 = nn.Variable.from_numpy_array(np.zeros((batch_size, units)))
        h0 = nn.Variable.from_numpy_array(np.zeros((batch_size, units)))
    else:
        assert type(initial_state) is tuple or type(initial_state) is list, \
            'initial_state must be a tuple or a list.'
        assert len(initial_state) == 2, \
            'initial_state must have only two states.'

        c0, h0 = initial_state

        assert c0.shape == h0.shape, 'shapes of initial_state must be the same.'
        assert c0.shape[0] == batch_size, \
            'batch size of initial_state ({0}) is different from that of inputs ({1}).'.format(
                c0.shape[0], batch_size)
        assert c0.shape[1] == units, \
            'units size of initial_state ({0}) is different from that of units of args ({1}).'.format(
                c0.shape[1], units)

    cell = c0
    hidden = h0

    hs = []
    for x in F.split(inputs, axis=1):
        cell, hidden = lstm_cell(x, cell, hidden)
        hs.append(hidden)

    if return_sequences:
        ret = F.stack(*hs, axis=1)
    else:
        ret = hs[-1]

    if return_state:
        return ret, cell, hidden
    else:
        return ret

def network(self, x_in, name='LSTM', n_hidden=32):
    hlist = []
    for x_i in F.split(x_in, axis=1):
        self._h, self._c = self._lstm_cell(name, n_hidden, x_i, self._h, self._c)
        with nn.parameter_scope(name + '_Affine_2'):
            self._h = PF.affine(self._h, (self._cols_size,))
        hlist.append(self._h)
    h = F.stack(*hlist, axis=1)
    h = F.slice(h,
                start=[0, h.shape[1] - self._x_output_length, 0],
                stop=[self._batch_size, h.shape[1], self._cols_size],
                step=[1, 1, 1])
    return h

def LSTM(inputs, units, return_sequences=False, name='lstm'):
    '''
    A long short-term memory layer.

    Args:
        inputs (nnabla.Variable): A shape of [B, SentenceLength, EmbeddingSize].
        units (int): Dimensionality of the output space.
        return_sequences (bool): Whether to return the last output
            in the output sequence, or the full sequence.

    Returns:
        nn.Variable: A shape [B, SentenceLength, units] or
        nn.Variable: A shape [B, units].
    '''
    hs = []
    batch_size = inputs.shape[0]
    sentence_length = inputs.shape[1]

    c0 = nn.Variable.from_numpy_array(np.zeros((batch_size, units)))
    h0 = nn.Variable.from_numpy_array(np.zeros((batch_size, units)))

    inputs = F.split(inputs, axis=1)

    cell = c0
    hidden = h0

    with nn.parameter_scope(name):
        for x in inputs:
            a = F.tanh(PF.affine(x, units, with_bias=False, name='Wa') +
                       PF.affine(hidden, units, name='Ra'))
            input_gate = F.sigmoid(PF.affine(x, units, with_bias=False, name='Wi') +
                                   PF.affine(hidden, units, name='Ri'))
            forget_gate = F.sigmoid(PF.affine(x, units, with_bias=False, name='Wf') +
                                    PF.affine(hidden, units, name='Rf'))
            cell = input_gate * a + forget_gate * cell
            output_gate = F.sigmoid(PF.affine(x, units, with_bias=False, name='Wo') +
                                    PF.affine(hidden, units, name='Ro'))
            hidden = output_gate * F.tanh(cell)
            if return_sequences:
                hidden = F.reshape(hidden, (batch_size, 1, units))
            hs.append(hidden)

    if return_sequences:
        hs = F.concatenate(*hs, axis=1)
        hs = F.reshape(hs, (batch_size, sentence_length, units))
        return hs
    else:
        return hs[-1]

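# A minimal usage sketch for the layer above on embedded input of shape
# (batch_size, sentence_length, embedding_size); parameters are created under
# the `name` scope on the first call.
import numpy as np
import nnabla as nn

x = nn.Variable.from_numpy_array(np.random.randn(4, 7, 16).astype(np.float32))
h_seq = LSTM(x, units=32, return_sequences=True)  # (4, 7, 32)
h_seq.forward()
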
def conv_bn_relu(h, i, name, skip=True):
    s = h
    imaps = h.shape[1]
    with nn.parameter_scope(name):
        h = PF.convolution(h, imaps, (3, 3), pad=(1, 1))
        h = PF.batch_normalization(h)
        h = F.relu(h)
    if not skip:
        return F.concatenate(*[h, s], axis=1) if i % 2 == 0 else h + s
    h = F.split(h, axis=1)
    h = [h_.reshape(h_.shape[:1] + (1, ) + h_.shape[1:]) for h_ in h]
    h = F.concatenate(*h, axis=1)
    return h

def _lstm(x, h, c, w, b, with_bias):
    """LSTM cell.

    Args:
        x (:obj:`~nnabla.Variable`): Input data.
        h (:obj:`~nnabla.Variable`): Short-term state.
        c (:obj:`~nnabla.Variable`): Long-term state.
        w (:obj:`~nnabla.Variable`): Weight.
        b (:obj:`~nnabla.Variable`): Bias.
        with_bias (bool): Include the bias or not.

    """
    hidden_size = h.shape[1]
    xh = F.concatenate(*(x, h), axis=1)
    w0, w1, w2, w3 = F.split(w, axis=0)
    b0 = b1 = b2 = b3 = None
    if with_bias:
        b0, b1, b2, b3 = F.split(b, axis=0)
    i_t = F.affine(xh, F.transpose(w0, (1, 0)), b0)
    f_t = F.affine(xh, F.transpose(w1, (1, 0)), b1)
    g_t = F.affine(xh, F.transpose(w2, (1, 0)), b2)
    o_t = F.affine(xh, F.transpose(w3, (1, 0)), b3)
    c_t = F.sigmoid(f_t) * c + F.sigmoid(i_t) * F.tanh(g_t)
    h_t = F.sigmoid(o_t) * F.tanh(c_t)
    return h_t, c_t

def LSTM(inputs, units, initial_state=None, return_sequences=False,
         return_state=False, name='lstm'):
    batch_size = inputs.shape[0]

    if initial_state is None:
        c0 = nn.Variable.from_numpy_array(
            np.zeros((batch_size, units)), need_grad=True)
        h0 = nn.Variable.from_numpy_array(
            np.zeros((batch_size, units)), need_grad=True)
    else:
        assert type(initial_state) is tuple or type(initial_state) is list, \
            'initial_state must be a tuple or a list.'
        assert len(initial_state) == 2, \
            'initial_state must have only two states.'

        c0, h0 = initial_state

        assert c0.shape == h0.shape, 'shapes of initial_state must be the same.'
        assert c0.shape[0] == batch_size, \
            'batch size of initial_state ({0}) is different from that of inputs ({1}).'.format(
                c0.shape[0], batch_size)
        assert c0.shape[1] == units, \
            'units size of initial_state ({0}) is different from that of units of args ({1}).'.format(
                c0.shape[1], units)

    cell = c0
    hidden = h0

    hs = []
    for x in F.split(inputs, axis=1):
        with nn.parameter_scope(name):
            cell, hidden = LSTMCell(x, cell, hidden)
        hs.append(hidden)

    if return_sequences:
        ret = F.stack(*hs, axis=1)
    else:
        ret = hs[-1]

    if return_state:
        return ret, cell, hidden
    else:
        return ret

def stack_backward(inputs, axis=0):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    yshape = dy.shape
    if yshape[axis] == 1:
        reshape = yshape[:axis] + yshape[axis + 1:]
        return F.reshape(dy, reshape, inplace=False)
    dx_list = F.split(dy, axis=axis)
    return dx_list

def create_network(batchsize, imheight, imwidth, args):
    import gc
    gc.collect()
    nnabla_ext.cuda.clear_memory_cache()

    anchors = args.num_anchors
    classes = args.num_classes
    yolo_x = nn.Variable((batchsize, 3, imheight, imwidth))
    yolo_features = yolov2.yolov2(yolo_x, anchors, classes, test=False)

    nB = yolo_features.shape[0]
    nA = args.num_anchors
    nC = args.num_classes
    nH = yolo_features.shape[2]
    nW = yolo_features.shape[3]

    output = yolo_features.get_unlinked_variable(need_grad=True)
    # TODO: Workaround until v1.0.2.
    # Explicitly enable grad since need_grad option above didn't work.
    output.need_grad = True

    output = F.reshape(output, (nB, nA, (5 + nC), nH, nW))
    output_splitted = F.split(output, 2)
    x, y, w, h, conf = [v.reshape((nB, nA, nH, nW))
                        for v in output_splitted[0:5]]
    x, y, conf = map(F.sigmoid, [x, y, conf])
    cls = F.stack(*output_splitted[5:], axis=2)
    cls = cls.reshape((nB * nA, nC, nH * nW))
    cls = F.transpose(cls, [0, 2, 1]).reshape((nB * nA * nH * nW, nC))

    tx, ty, tw, th, tconf, coord_mask, conf_mask_sq = [
        nn.Variable(v.shape) for v in [x, y, w, h, conf, x, conf]]
    cls_ones, cls_mask = [nn.Variable(cls.shape) for _ in range(2)]
    tcls, cls_mask_bb = [nn.Variable((cls.shape[0], 1)) for _ in range(2)]

    coord_mask_sq = F.pow_scalar(coord_mask, 2)
    loss_x = args.coord_scale * F.sum(F.squared_error(x, tx) * coord_mask_sq)
    loss_y = args.coord_scale * F.sum(F.squared_error(y, ty) * coord_mask_sq)
    loss_w = args.coord_scale * F.sum(F.squared_error(w, tw) * coord_mask_sq)
    loss_h = args.coord_scale * F.sum(F.squared_error(h, th) * coord_mask_sq)
    loss_conf = F.sum(F.squared_error(conf, tconf) * conf_mask_sq)
    loss_cls = args.class_scale * \
        F.sum(cls_mask_bb * F.softmax_cross_entropy(cls + cls_ones - cls_mask, tcls))
    loss_nnabla = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    return yolo_x, yolo_features, (x, y, w, h, conf, cls), \
        (tx, ty, tw, th, tconf, coord_mask, conf_mask_sq,
         cls_ones, cls_mask, tcls, cls_mask_bb), loss_nnabla

def time_distributed_func(x, *args, **kwargs):
    ret = []
    batch_size = x.shape[0]
    length = x.shape[1]
    dim = x.shape[2] if x.ndim > 2 else 1
    if length > 1:
        xs = F.split(x, axis=1)
    else:
        xs = [F.reshape(x, (batch_size, dim))]
    for x_ in xs:
        value = func(x_, *args, **kwargs)
        _, output_dim = value.shape
        ret.append(F.reshape(value, (batch_size, 1, output_dim)))
    if length > 1:
        return F.concatenate(*ret, axis=1)
    else:
        return ret[0]

def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Args
    axis = self.forward_func.info.args["axis"]

    # Compute
    # w.r.t. dy_{0}, ..., dy_{N-1}
    g_dx = outputs[0].grad
    g_dy_list = list(F.split(g_dx, axis))
    g_dy_list.reverse()
    for i in range(len(inputs[1:])):
        g_dy = inputs[i + 1].grad
        g_dy_ = g_dy_list[i]
        if prop_down[i + 1]:
            if accum[i + 1]:
                g_dy += g_dy_
            else:
                g_dy.copy_from(g_dy_)
