def min_backward(inputs, axes=None, keep_dims=False, with_index=False, only_index=False):
    """
    Args:
        inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
        kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
        list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]
    y0 = get_output(x0, "Min")
    if keep_dims:
        y0 = F.broadcast(y0, x0.shape)
        dy = F.broadcast(dy, x0.shape)
    else:
        axes = [i for i in range(x0.ndim)] if axes is None else force_list(axes)
        shape = [1 if i in axes else s for i, s in enumerate(x0.shape)]
        y0 = F.broadcast(F.reshape(y0, shape, inplace=False), x0.shape)
        dy = F.broadcast(F.reshape(dy, shape, inplace=False), x0.shape)
    # Gradient flows only to the elements equal to the (broadcast) minimum.
    m0 = F.equal(x0, y0)
    m0 = no_grad(m0)
    dx0 = dy * m0
    if not with_index and not only_index:
        return dx0
    elif with_index:
        return dx0, None
    elif only_index:
        return None
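For intuition, here is a minimal sketch (assuming nnabla is installed; the array values are made up) showing how the F.equal mask routes the incoming gradient only to the positions that attain the minimum:

import numpy as np
import nnabla as nn
import nnabla.functions as F

x = nn.Variable.from_numpy_array(np.array([[3.0, 1.0, 2.0]], dtype=np.float32))
x.need_grad = True
y = F.min(x, axis=1, keepdims=True)

y.forward()
x.grad.zero()
y.backward()
print(x.g)  # [[0. 1. 0.]]: only the argmin position receives the gradient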
def transformer(train=True, dropout_ratio=0.1):
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    with nn.parameter_scope('embedding_layer'):
        # h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
        h = token_embedding(x, vocab_size, embedding_size)
    h = position_encoding(h)
    if train:
        h = F.dropout(h, p=dropout_ratio)
    for i in range(hopping_num):
        with nn.parameter_scope(f'encoder_hopping_{i}'):
            h = residual_normalization_wrapper(multihead_self_attention)(
                h, head_num, mask=mask, train=train, dropout_ratio=dropout_ratio)
            h = residual_normalization_wrapper(positionwise_feed_forward)(
                h, train=train, dropout_ratio=dropout_ratio)
    with nn.parameter_scope('output_layer'):
        y = F.sigmoid(PF.affine(h[:, 0, :], 1))
    accuracy = F.mean(F.equal(F.round(y), t))
    loss = F.mean(F.binary_cross_entropy(y, t))
    return x, y, t, accuracy, loss
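The residual_normalization_wrapper helper is defined elsewhere in the original code. A plausible sketch of it, following the standard pre-layer-norm residual pattern, might look like the following (hypothetical, not the author's actual definition):

def residual_normalization_wrapper(layer):
    # Hypothetical sketch: layer-normalize the input, apply the wrapped
    # sublayer, and add the residual connection around it.
    def wrapper(x, *args, **kwargs):
        with nn.parameter_scope('layer_normalization'):
            h = PF.layer_normalization(x, batch_axis=(0, 1))
        h = layer(h, *args, **kwargs)
        return x + h
    return wrapper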
def build_self_attention_model(train=True):
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    # Padded positions get a large negative bias so softmax ignores them.
    attention_mask = (F.constant(1, shape=mask.shape) - mask) * F.constant(
        np.finfo(np.float32).min, shape=mask.shape)
    with nn.parameter_scope('embedding'):
        h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
    with nn.parameter_scope('forward'):
        h_f = lstm(h, hidden_size, mask=mask, return_sequences=True,
                   return_state=False)
    with nn.parameter_scope('backward'):
        h_b = lstm(h[:, ::-1], hidden_size, mask=mask, return_sequences=True,
                   return_state=False)[:, ::-1]
    h = F.concatenate(h_f, h_b, axis=2)
    if train:
        h = F.dropout(h, p=dropout_ratio)
    with nn.parameter_scope('da'):
        a = F.tanh(time_distributed(PF.affine)(h, da))
    if train:
        a = F.dropout(a, p=dropout_ratio)
    with nn.parameter_scope('r'):
        a = time_distributed(PF.affine)(a, r)
    if train:
        a = F.dropout(a, p=dropout_ratio)
    a = F.softmax(a + attention_mask, axis=1)
    m = F.batch_matmul(a, h, transpose_a=True)
    with nn.parameter_scope('output_mlp'):
        output = F.relu(PF.affine(m, output_mlp_size))
    if train:
        output = F.dropout(output, p=dropout_ratio)
    with nn.parameter_scope('output'):
        y = F.sigmoid(PF.affine(output, 1))
    accuracy = F.mean(F.equal(F.round(y), t))
    loss = F.mean(F.binary_cross_entropy(y, t)) + attention_penalty_coef * frobenius(
        F.batch_matmul(a, a, transpose_a=True) - batch_eye(batch_size, r))
    return x, t, accuracy, loss
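frobenius and batch_eye are helpers from the surrounding code. Hedged sketches consistent with the penalty term ||A Aᵀ − I||² from structured self-attention could be (hypothetical definitions):

def frobenius(x):
    # Squared Frobenius norm, averaged over the batch axis (assumed form).
    return F.mean(F.sum(x ** 2, axis=(1, 2)))


def batch_eye(batch_size, n):
    # A (batch_size, n, n) stack of identity matrices as a constant input.
    return nn.Variable.from_numpy_array(
        np.tile(np.eye(n, dtype=np.float32), (batch_size, 1, 1)))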
def _nms(heat, kernel=3):
    pad = (kernel - 1) // 2
    hmax = F.max_pooling(heat, (kernel, kernel), stride=(1, 1), pad=(pad, pad))
    # Keep only positions where the score equals the local window maximum.
    keep = F.equal(hmax, heat)
    return heat * keep
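A minimal usage sketch (assuming nnabla is installed; the heatmap values are made up) showing that _nms zeroes every score that is not a local maximum of its pooling window:

import numpy as np
import nnabla as nn

heat = nn.Variable.from_numpy_array(np.array(
    [[[[0.1, 0.9, 0.2],
       [0.3, 0.5, 0.4],
       [0.8, 0.2, 0.6]]]], dtype=np.float32))
peaks = _nms(heat)
peaks.forward()
print(peaks.d)  # non-peak scores are suppressed to 0; local maxima survive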
def global_average_pooling_1d(x, mask):
    # Average over time, dividing by the true token count rather than max_len.
    count = F.sum(mask, axis=1)
    global_average_pooled = F.sum(x, axis=1) / count
    return global_average_pooled


x = nn.Variable((batch_size, max_len))
t = nn.Variable((batch_size, 1))
mask = get_mask(x)
with nn.parameter_scope('embedding'):
    h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
h = global_average_pooling_1d(h, mask)
with nn.parameter_scope('output'):
    y = F.sigmoid(PF.affine(h, 1))
accuracy = F.mean(F.equal(F.round(y), t))
loss = F.mean(F.binary_cross_entropy(y, t))

# Create solver.
solver = S.Adam()
solver.set_parameters(nn.get_parameters())

trainer = Trainer(inputs=[x, t], loss=loss,
                  metrics={'cross entropy': loss, 'accuracy': accuracy},
                  solver=solver)
trainer.run(train_data_iter, dev_data_iter, epochs=5, verbose=1)
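A small numeric sketch of the masked average (assuming nnabla is installed; the embeddings and mask are made up): padded steps contribute nothing to the sum, and the division uses the true per-example token count.

import numpy as np
import nnabla as nn

emb = nn.Variable.from_numpy_array(
    np.array([[[2.0], [4.0], [0.0]]], dtype=np.float32))  # last step is padding
msk = nn.Variable.from_numpy_array(
    np.array([[[1.0], [1.0], [0.0]]], dtype=np.float32))
pooled = global_average_pooling_1d(emb, msk)
pooled.forward()
print(pooled.d)  # [[3.]] == (2 + 4) / 2, the padded step is excluded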