Example 1
 def _get_non_pad_mask(self, seq, pad_idx=None):
     # seq shape: (batch_size, seq_len); mask is 1 at real tokens, 0 at padding
     if pad_idx:
         non_pad_mask = nd.not_equal(seq, pad_idx)
     else:
         non_pad_mask = nd.not_equal(seq, 0)
     # (batch_size, seq_len) -> (batch_size, seq_len, 1) so the mask broadcasts over features
     non_pad_mask = nd.expand_dims(non_pad_mask, axis=2)
     return non_pad_mask
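For illustration (not part of the original snippet), a minimal sketch of the mask this method produces, assuming pad index 0:

from mxnet import nd

seq = nd.array([[4, 7, 2, 0, 0],
                [5, 1, 0, 0, 0]])          # toy batch, 0 = padding
mask = nd.expand_dims(nd.not_equal(seq, 0), axis=2)
print(mask.shape)      # (2, 5, 1)
print(mask[0, :, 0])   # [1. 1. 1. 0. 0.]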
Example 2
def dev(ch_bert, model, ch_vocab, dev_dataiter, logger, ctx):
    TP_s = 0
    FP_s = 0
    FN_s = 0
    example_ids = []
    for content, token_types, valid_len, label, example_id in tqdm(
            dev_dataiter):
        example_ids.extend(example_id)
        content = content.as_in_context(ctx)
        token_types = token_types.as_in_context(ctx)
        valid_len = valid_len.as_in_context(ctx)
        label = label.as_in_context(ctx)

        output = model(content, token_types, valid_len)
        predict = nd.argmax(nd.softmax(output, axis=-1), axis=-1)
        # label 1 is treated as the positive class when counting TP/FP/FN
        tp_s = int(
            nd.sum(nd.equal(predict, label) *
                   nd.equal(label, 1)).asscalar())
        fp_s = int(
            nd.sum(nd.not_equal(predict, label) *
                   nd.equal(label, 0)).asscalar())
        fn_s = int(
            nd.sum(nd.not_equal(predict, label) *
                   nd.equal(label, 1)).asscalar())
        TP_s += tp_s
        FP_s += fp_s
        FN_s += fn_s

    P_s = TP_s / (TP_s + FP_s)
    R_s = TP_s / (TP_s + FN_s)
    F = (2 * P_s * R_s) / (P_s + R_s)

    logger.info("F:{}".format(F))
    return F
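For reference, a worked toy example of the precision/recall/F1 arithmetic used above (the counts are made up):

TP_s, FP_s, FN_s = 80, 10, 20
P_s = TP_s / (TP_s + FP_s)          # 0.888...
R_s = TP_s / (TP_s + FN_s)          # 0.8
F = (2 * P_s * R_s) / (P_s + R_s)   # ~0.842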
Example 3
 def _get_key_mask(self, enc_idx, dec_idx, pad_idx=None):
     # mask encoder padding positions, repeated once per decoder position
     seq_len = dec_idx.shape[1]
     if pad_idx:
         pad_mask = nd.not_equal(enc_idx, pad_idx)
     else:
         pad_mask = nd.not_equal(enc_idx, 0)
     # (batch, enc_len) -> (batch, 1, enc_len) -> (batch, dec_len, enc_len)
     pad_mask = nd.expand_dims(pad_mask, axis=1)
     pad_mask = nd.broadcast_axes(pad_mask, axis=1, size=seq_len)
     return pad_mask
Example 4
def eval(en_bert, mt_model, en_vocab, ch_vocab, dev_dataiter, logger, ctx):
    references = []
    hypothesis = []
    score = 0
    chencherry = SmoothingFunction()
    for trans, _, label, trans_valid_len, label_valid_len in tqdm(
            dev_dataiter):
        trans = trans.as_in_context(ctx)
        trans_valid_len = trans_valid_len.as_in_context(ctx)
        batch_size = trans.shape[0]

        trans_token_type = nd.zeros_like(trans)
        en_bert_outputs = en_bert(trans, trans_token_type, trans_valid_len)

        ch_sentences = [BOS]
        aim = ch_vocab[ch_sentences]
        aim = nd.array([aim], ctx=ctx)
        aim = nd.broadcast_axes(aim, axis=0, size=batch_size)

        # greedy decoding: append the newest predicted token to the target prefix each step
        for n in range(0, args.max_ch_len):
            mt_outputs = mt_model(en_bert_outputs, trans, aim)
            predicts = nd.argmax(nd.softmax(mt_outputs, axis=-1), axis=-1)
            final_predict = predicts[:, -1:]
            aim = nd.concat(aim, final_predict, dim=1)

        label = label.asnumpy().tolist()
        predict_valid_len = nd.sum(nd.not_equal(
            predicts, ch_vocab(ch_vocab.padding_token)),
                                   axis=-1).asnumpy().tolist()
        predicts = aim[:, 1:].asnumpy().tolist()
        label_valid_len = label_valid_len.asnumpy().tolist()

        for refer, hypoth, l_v_len, p_v_len in zip(label, predicts,
                                                   label_valid_len,
                                                   predict_valid_len):
            l_v_len = int(l_v_len)
            p_v_len = int(p_v_len)
            refer = refer[:l_v_len]
            refer_str = [ch_vocab.idx_to_token[int(idx)] for idx in refer]
            hypoth_str = [ch_vocab.idx_to_token[int(idx)] for idx in hypoth]
            hypoth_str_valid = []
            for token in hypoth_str:
                if token == EOS:
                    hypoth_str_valid.append(token)
                    break
                hypoth_str_valid.append(token)
            references.append(refer_str)
            hypothesis.append(hypoth_str_valid)

    for refer, hypoth in zip(references, hypothesis):
        score += sentence_bleu([refer],
                               hypoth,
                               smoothing_function=chencherry.method1)
    logger.info("dev sample:")
    logger.info("refer :{}".format(" ".join(references[0]).replace(
        EOS, "[EOS]").replace(ch_vocab.padding_token, "")))
    logger.info("hypoth:{}".format(" ".join(hypothesis[0]).replace(
        EOS, "[EOS]")))
    return score / len(references)
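A minimal sketch of the sentence-level BLEU call used above, with toy token lists (nltk's sentence_bleu plus Chen-Cherry smoothing, as in the function):

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

chencherry = SmoothingFunction()
refer = ["we", "translate", "this", "sentence", "[EOS]"]
hypoth = ["we", "translate", "this", "sentence", "[EOS]"]
print(sentence_bleu([refer], hypoth,
                    smoothing_function=chencherry.method1))  # 1.0 for an exact match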
Example 5
def getMask(q_seq, k_seq):
    # q_seq shape : (batch_size, q_seq_len)
    # k_seq shape : (batch_size, k_seq_len)
    q_len = q_seq.shape[1]
    pad_mask = nd.not_equal(k_seq, 0)
    pad_mask = nd.expand_dims(pad_mask, axis=1)
    pad_mask = nd.broadcast_axes(pad_mask, axis=1, size=q_len)

    return pad_mask
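A quick usage sketch for getMask with hypothetical toy inputs (padding id 0):

from mxnet import nd

q_seq = nd.array([[3, 5, 0]])          # (batch_size=1, q_seq_len=3)
k_seq = nd.array([[7, 2, 4, 0, 0]])    # (batch_size=1, k_seq_len=5)
mask = getMask(q_seq, k_seq)
print(mask.shape)   # (1, 3, 5): the key padding mask repeated for every query position
print(mask[0, 0])   # [1. 1. 1. 0. 0.]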
Example 6
def hard_example_mining(dist_mat, labels, return_inds=False):
    """For each anchor, find the hardest positive and negative sample.
    Args:
      dist_mat: pytorch Variable, pair wise distance between samples, shape [N, N]
      labels: pytorch LongTensor, with shape [N]
      return_inds: whether to return the indices. Save time if `False`(?)
    Returns:
      dist_ap: pytorch Variable, distance(anchor, positive); shape [N]
      dist_an: pytorch Variable, distance(anchor, negative); shape [N]
      p_inds: pytorch LongTensor, with shape [N];
        indices of selected hard positive samples; 0 <= p_inds[i] <= N - 1
      n_inds: pytorch LongTensor, with shape [N];
        indices of selected hard negative samples; 0 <= n_inds[i] <= N - 1
    NOTE: Only consider the case in which all labels have same num of samples,
      thus we can cope with all anchors in parallel.
    """

    assert len(dist_mat.shape) == 2
    assert dist_mat.shape[0] == dist_mat.shape[1]
    N = dist_mat.shape[0]

    # shape [N, N]
    is_pos = nd.equal(labels.broadcast_to((N, N)),
                      labels.broadcast_to((N, N)).T).astype('float32')
    is_neg = nd.not_equal(labels.broadcast_to((N, N)),
                          labels.broadcast_to((N, N)).T).astype('float32')
    # `dist_ap` means distance(anchor, positive); shape [N]
    dist_pos = dist_mat * is_pos
    dist_ap = nd.max(dist_pos, axis=1)
    # `dist_an` means distance(anchor, negative); shape [N]
    # adding each row's max at positive positions keeps them out of the min below
    dist_neg = dist_mat * is_neg + nd.max(dist_mat, axis=1,
                                          keepdims=True) * is_pos
    dist_an = nd.min(dist_neg, axis=1)

    # if return_inds:
    #     # shape [N, N]
    #     ind = (labels.new().resize_as_(labels)
    #            .copy_(torch.arange(0, N).long())
    #            .unsqueeze(0).expand(N, N))
    #     # shape [N, 1]
    #     p_inds = torch.gather(
    #         ind[is_pos].contiguous().view(N, -1), 1, relative_p_inds.data)
    #     n_inds = torch.gather(
    #         ind[is_neg].contiguous().view(N, -1), 1, relative_n_inds.data)
    #     # shape [N]
    #     p_inds = p_inds.squeeze(1)
    #     n_inds = n_inds.squeeze(1)
    #     return dist_ap, dist_an, p_inds, n_inds

    return dist_ap, dist_an
Example 7
def hard_example_mining(dist_mat, labels):
    assert len(dist_mat.shape) == 2
    assert dist_mat.shape[0] == dist_mat.shape[1]
    N = dist_mat.shape[0]

    # shape [N, N]
    is_pos = nd.equal(labels.broadcast_to((N, N)),
                      labels.broadcast_to((N, N)).T).astype('float32')
    is_neg = nd.not_equal(labels.broadcast_to((N, N)),
                          labels.broadcast_to((N, N)).T).astype('float32')

    dist_pos = dist_mat * is_pos
    dist_ap = nd.max(dist_pos, axis=1)

    dist_neg = dist_mat * is_neg + nd.max(dist_mat, axis=1,
                                          keepdims=True) * is_pos
    dist_an = nd.min(dist_neg, axis=1)

    return dist_ap, dist_an
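Both versions above hinge on the pairwise same-label masks. A toy sketch of the same masking and hardest-example selection, written with reshape-based broadcasting (assumed equivalent to the broadcast_to form used in the functions):

from mxnet import nd

labels = nd.array([0, 0, 1, 1])
N = labels.shape[0]
is_pos = nd.equal(labels.reshape((N, 1)),
                  labels.reshape((1, N)))   # (N, N), 1.0 where labels match
is_neg = 1 - is_pos

dist_mat = nd.array([[0.0, 0.3, 0.9, 0.8],
                     [0.3, 0.0, 0.7, 0.6],
                     [0.9, 0.7, 0.0, 0.2],
                     [0.8, 0.6, 0.2, 0.0]])
dist_ap = nd.max(dist_mat * is_pos, axis=1)   # hardest (farthest) positive per anchor
dist_an = nd.min(dist_mat * is_neg +
                 nd.max(dist_mat, axis=1, keepdims=True) * is_pos,
                 axis=1)                      # hardest (closest) negative per anchor
print(dist_ap)  # [0.3 0.3 0.2 0.2]
print(dist_an)  # [0.8 0.6 0.7 0.6]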
Example 8
def evaluate(model, dataIterator, ctx, pad=None):
    """
    The Evaluation function
    :param model: model object
    :param dataIterator: data iterator in mxnet
    :param ctx: context
    :param weight: NDArray weight matrix of Weighted SCE
    :param pad: Int
    padding id
    :param report: Boolean
    F1 Score report Matrix
    :return:
    """
    loss = gloss.SoftmaxCrossEntropyLoss(sparse_label=False)
    if pad is not None:
        mask = True
    else:
        mask = False
    dataIterator.reset()
    total_loss = 0.0
    total_sample_num = 0
    y_pred, y_true = [], []
    for i, batch in enumerate(dataIterator):
        x = batch.data[0].as_in_context(ctx)
        y = batch.data[1].as_in_context(ctx)
        if mask:
            _mask = nd.not_equal(x, pad)
            pred = model(_mask)
        else:
            pred = model(x)
        bl = loss(pred, nd.one_hot(y, 238)).as_in_context(ctx)
        total_sample_num += x.shape[0]
        total_loss += nd.sum(bl).asscalar()
        pred = nd.argmax(pred, axis=1)
        y_pred.extend(pred.asnumpy().tolist())
        y_true.extend(y.asnumpy().tolist())
    acc = metrics.accuracy_score(y_pred, y_true)
#     f1 = metrics.f1_score(y_pred, y_true, average='macro')
    avg_L = total_loss / float(total_sample_num)
#     if report:
#         return avg_L, acc, f1, metrics.classification_report(y_true, y_pred)
#     else:
#         return avg_L, acc, f1
    return avg_L, acc
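Since the loss above is built with sparse_label=False, the integer labels are expanded with nd.one_hot before being passed in; a tiny sketch of what that produces:

from mxnet import nd

print(nd.one_hot(nd.array([1, 0, 2]), 3))
# [[0. 1. 0.]
#  [1. 0. 0.]
#  [0. 0. 1.]]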
Example 9
def batch_loss(transformer_model, en_sentences, x_en_emb, x_en_idx, y_zh_idx,
               loss):
    batch_size = x_en_emb.shape[0]
    ch2idx, idx2ch = load_ch_vocab()

    y_zh_idx_nd = nd.array(y_zh_idx, ctx=ghp.ctx)
    # decoder input: shift the target right by one, prepending index 2
    # (presumably the start symbol) and dropping the last position
    dec_input_zh_idx = nd.concat(
        nd.ones(shape=y_zh_idx_nd[:, :1].shape, ctx=ghp.ctx) * 2,
        y_zh_idx_nd[:, :-1],
        dim=1)

    output = transformer_model(x_en_emb, x_en_idx, dec_input_zh_idx, True)
    predict = nd.argmax(nd.softmax(output, axis=-1), axis=-1)

    # print("input_idx:", dec_input_zh_idx[0])
    # print("predict_idx:", predict[0])
    print("source:", en_sentences[0])

    label_token = []
    for n in range(len(y_zh_idx[0])):
        label_token.append(idx2ch[int(y_zh_idx[0][n])])
    print("target:", "".join(label_token))

    predict_token = []
    for n in range(len(predict[0])):
        predict_token.append(idx2ch[int(predict[0][n].asscalar())])
    print("predict:", "".join(predict_token))

    is_target = nd.not_equal(y_zh_idx_nd, 0)
    # print(is_target)
    current = nd.equal(y_zh_idx_nd, predict) * is_target
    acc = nd.sum(current) / nd.sum(is_target)

    l = loss(output, y_zh_idx_nd)
    l_mean = nd.sum(l) / batch_size

    return l_mean, acc
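A toy sketch of the masked token accuracy computed above, where padding id 0 is excluded from the count:

from mxnet import nd

y = nd.array([[5, 8, 3, 0, 0]])      # target indices, 0 = padding
pred = nd.array([[5, 8, 9, 0, 0]])   # predicted indices
is_target = nd.not_equal(y, 0)
acc = nd.sum(nd.equal(y, pred) * is_target) / nd.sum(is_target)
print(acc.asscalar())  # 2 of 3 real tokens correct -> ~0.667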
Example 10
         'learning_rate': lr,
         "wd": 0.001
     })
 else:
     trainer_name = "sgd"
     trainer = Trainer(model.collect_params(), trainer_name, {
         'learning_rate': lr,
         "wd": 0.001,
         "momentum": 0.8
     })
 for batch in train_data:
     x = batch.data[0].as_in_context(ctx)
     y = batch.data[1].as_in_context(ctx)
     with autograd.record(train_mode=True):
         if mask:
             _mask = nd.not_equal(x, index[pad])
             pred = model(_mask)
         else:
             pred = model(x)
         bl = loss(pred, nd.one_hot(y, len(lable_dig))).as_in_context(ctx)
     # compute gradients outside the autograd.record() scope
     bl.backward()
     trainer.step(batch_size)
     epoch_L += nd.sum(bl).asscalar()
 t_l, t_acc = evaluate(model, valid_data, ctx)
 model.save_parameters("clf_mxnet.params")
 msg = '[Epoch {}] , valid acc {:.6f}, valid avg loss {:.6f} with {}'.format(
     epoch, t_acc, t_l, trainer_name)
 print(msg)
 print(model)
Example 11
def train_and_valid(src_bert, mt_model, src_vocab, tgt_vocab, train_dataiter,
                    dev_dataiter, trainer, finetune_trainer, epochs, loss_func,
                    ctx, lr, batch_size, params_save_path_root, eval_step,
                    log_step, check_step, label_smooth, logger,
                    num_train_examples, warmup_ratio):
    batches = len(train_dataiter)

    num_train_steps = int(num_train_examples / batch_size * epochs)
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    global_step = 0
    dev_bleu_score = 0

    for epoch in range(epochs):
        for src, tgt, label, src_valid_len, tgt_valid_len in train_dataiter:
            # learning rate strategy
            if global_step < num_warmup_steps:
                new_lr = lr * global_step / num_warmup_steps
            else:
                non_warmup_steps = global_step - num_warmup_steps
                offset = non_warmup_steps / \
                    (num_train_steps - num_warmup_steps)
                new_lr = lr - offset * lr
            trainer.set_learning_rate(new_lr)

            src = src.as_in_context(ctx)
            tgt = tgt.as_in_context(ctx)
            label = label.as_in_context(ctx)
            src_valid_len = src_valid_len.as_in_context(ctx)
            src_token_type = nd.zeros_like(src, ctx=ctx)

            tgt_mask = nd.not_equal(tgt, tgt_vocab(tgt_vocab.padding_token))

            num_class = len(tgt_vocab.idx_to_token)
            one_hot = nd.one_hot(label, num_class)
            if label_smooth:
                # soften targets: 1 - eps on the gold class, eps spread over the rest
                eps = 0.1
                one_hot_label = one_hot * \
                    (1 - eps) + (1 - one_hot) * eps / num_class
            else:
                one_hot_label = one_hot

            with autograd.record():
                src_bert_outputs = src_bert(src, src_token_type, src_valid_len)
                mt_outputs = mt_model(src_bert_outputs, src, tgt)
                loss_mean = loss_func(mt_outputs, one_hot_label, tgt_mask)

            loss_mean.backward()
            loss_scalar = loss_mean.asscalar()

            trainer.step(1)
            finetune_trainer.step(1)

            if global_step and global_step % log_step == 0:
                predicts = nd.argmax(nd.softmax(mt_outputs, axis=-1), axis=-1)
                correct = nd.equal(label, predicts)
                accuracy = (nd.sum(correct * tgt_mask) /
                            nd.sum(tgt_mask)).asscalar()
                logger.info(
                    "epoch:{}, batch:{}/{}, bleu:{}, acc:{}, loss:{}, (lr:{}s)"
                    .format(epoch, global_step % batches, batches,
                            dev_bleu_score, accuracy, loss_scalar,
                            trainer.learning_rate))

            if global_step and global_step % check_step == 0:
                predicts = nd.argmax(nd.softmax(mt_outputs, axis=-1), axis=-1)
                refer_sample = src.asnumpy().tolist()
                label_sample = label.asnumpy().tolist()
                pred_sample = predicts.asnumpy().tolist()
                logger.info("train sample:")
                logger.info("refer  :{}".format(" ".join([
                    src_vocab.idx_to_token[int(idx)] for idx in refer_sample[0]
                ])).replace(src_vocab.padding_token, ""))
                logger.info("target :{}".format(" ".join([
                    tgt_vocab.idx_to_token[int(idx)] for idx in label_sample[0]
                ])).replace(EOS, "[EOS]").replace(tgt_vocab.padding_token, ""))
                logger.info("predict:{}".format(" ".join([
                    tgt_vocab.idx_to_token[int(idx)] for idx in pred_sample[0]
                ])).replace(EOS, "[EOS]"))

            if global_step and global_step % eval_step == 0:
                dev_bleu_score = eval(src_bert,
                                      mt_model,
                                      src_vocab,
                                      tgt_vocab,
                                      dev_dataiter,
                                      logger,
                                      ctx=ctx)
                if not os.path.exists(params_save_path_root):
                    os.makedirs(params_save_path_root)
                model_params_file = params_save_path_root + \
                    "src_bert_step_{}.params".format(global_step)
                src_bert.save_parameters(model_params_file)
                logger.info("{} Save Completed.".format(model_params_file))

                model_params_file = params_save_path_root + \
                    "mt_step_{}.params".format(global_step)
                mt_model.save_parameters(model_params_file)
                logger.info("{} Save Completed.".format(model_params_file))
            writer.add_scalar("loss", loss_scalar, global_step)
            global_step += 1
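The learning-rate logic at the top of the training loop is linear warmup followed by linear decay; the same arithmetic as a standalone sketch (the function name is illustrative, not from the source):

def linear_warmup_then_decay(lr, global_step, num_warmup_steps, num_train_steps):
    if global_step < num_warmup_steps:
        return lr * global_step / num_warmup_steps
    offset = (global_step - num_warmup_steps) / (num_train_steps - num_warmup_steps)
    return lr - offset * lr

print(linear_warmup_then_decay(1e-4, 50, 100, 1000))   # 5e-05, halfway through warmup
print(linear_warmup_then_decay(1e-4, 550, 100, 1000))  # 5e-05, halfway through decay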
Example 12
                                 optimizer_params=optimizer_params)
        return _trainer

    trainer = get_trainer()

    accum_loss = 0.0  # accumulate loss initialize
    for step in tqdm(range(args.max_steps),
                     leave=False,
                     total=args.max_steps,
                     desc='{},{}'.format(args.network,
                                         os.path.basename(args.output_dir))):
        step += start_step
        x, y = next(train_dataloader)
        if args.train_au:
            y = y[:, au_idx].reshape((-1, 1))
        # 999 presumably marks a missing label; such samples get zero loss weight
        sample_weights = nd.not_equal(y, 999)
        if args.enable_balance_sampler:
            sample_weights = sample_weights * balance_sampler(y)
        x = gluon.utils.split_and_load(x, ctx, even_split=False)
        y = gluon.utils.split_and_load(y, ctx, even_split=False)
        sample_weights = gluon.utils.split_and_load(sample_weights,
                                                    ctx,
                                                    even_split=False)

        with autograd.record(train_mode=True):
            logits = [net(data) for data in x]
            losses = [
                sigmoid_binary_cross_entropy(logit, label, sample_weight) for
                logit, label, sample_weight in zip(logits, y, sample_weights)
            ]
        for l in losses: