def _get_non_pad_mask(self, seq, pad_idx=None):
    if pad_idx:
        non_pad_mask = nd.not_equal(seq, pad_idx)
    else:
        non_pad_mask = nd.not_equal(seq, 0)
    non_pad_mask = nd.expand_dims(non_pad_mask, axis=2)
    return non_pad_mask
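
# A minimal usage sketch (toy inputs, not from the original source) of the
# same non-pad-mask logic outside the class, assuming `from mxnet import nd`
# and padding id 0.
toy_seq = nd.array([[4, 9, 7, 0, 0],
                    [5, 2, 0, 0, 0]])                        # (2, 5)
toy_mask = nd.expand_dims(nd.not_equal(toy_seq, 0), axis=2)  # (2, 5, 1)
# toy_mask[0, :, 0] == [1, 1, 1, 0, 0]; multiplying hidden states of shape
# (2, 5, hidden) by toy_mask zeroes every padded position.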
def dev(ch_bert, model, ch_vocab, dev_dataiter, logger, ctx):
    TP_s = 0
    FP_s = 0
    FN_s = 0
    example_ids = []
    for content, token_types, valid_len, label, example_id in tqdm(
            dev_dataiter):
        example_ids.extend(example_id)
        content = content.as_in_context(ctx)
        token_types = token_types.as_in_context(ctx)
        valid_len = valid_len.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = model(content, token_types, valid_len)
        predict = nd.argmax(nd.softmax(output, axis=-1), axis=-1)
        # note: nd.equal counts every correct prediction here, i.e. true
        # positives and true negatives together
        tp_s = int(nd.sum(nd.equal(predict, label)).asscalar())
        fp_s = int(
            nd.sum(nd.not_equal(predict, label) *
                   nd.equal(label, 0)).asscalar())
        fn_s = int(
            nd.sum(nd.not_equal(predict, label) *
                   nd.equal(label, 1)).asscalar())
        TP_s += tp_s
        FP_s += fp_s
        FN_s += fn_s
    P_s = TP_s / (TP_s + FP_s)
    R_s = TP_s / (TP_s + FN_s)
    F = (2 * P_s * R_s) / (P_s + R_s)
    logger.info("F:{}".format(F))
    return F
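
# A minimal sketch (not part of the original code) of the same
# precision/recall/F1 arithmetic with guarded denominators, so a degenerate
# batch cannot raise a ZeroDivisionError.
def safe_f1(tp, fp, fn, eps=1e-8):
    precision = tp / max(tp + fp, eps)
    recall = tp / max(tp + fn, eps)
    return 2 * precision * recall / max(precision + recall, eps)

# e.g. safe_f1(tp=90, fp=10, fn=20) ≈ 0.857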
def _get_key_mask(self, enc_idx, dec_idx, pad_idx=None):
    seq_len = dec_idx.shape[1]
    if pad_idx:
        pad_mask = nd.not_equal(enc_idx, pad_idx)
    else:
        pad_mask = nd.not_equal(enc_idx, 0)
    pad_mask = nd.expand_dims(pad_mask, axis=1)
    pad_mask = nd.broadcast_axes(pad_mask, axis=1, size=seq_len)
    return pad_mask
def eval(en_bert, mt_model, en_vocab, ch_vocab, dev_dataiter, logger, ctx):
    references = []
    hypothesis = []
    score = 0
    chencherry = SmoothingFunction()
    for trans, _, label, trans_valid_len, label_valid_len in tqdm(
            dev_dataiter):
        trans = trans.as_in_context(ctx)
        trans_valid_len = trans_valid_len.as_in_context(ctx)
        batch_size = trans.shape[0]
        trans_token_type = nd.zeros_like(trans)
        en_bert_outputs = en_bert(trans, trans_token_type, trans_valid_len)

        # greedy decoding: start every hypothesis with BOS and repeatedly
        # append the most probable next token
        ch_sentences = [BOS]
        aim = ch_vocab[ch_sentences]
        aim = nd.array([aim], ctx=ctx)
        aim = nd.broadcast_axes(aim, axis=0, size=batch_size)
        for n in range(0, args.max_ch_len):
            mt_outputs = mt_model(en_bert_outputs, trans, aim)
            predicts = nd.argmax(nd.softmax(mt_outputs, axis=-1), axis=-1)
            final_predict = predicts[:, -1:]
            aim = nd.concat(aim, final_predict, dim=1)

        label = label.asnumpy().tolist()
        predict_valid_len = nd.sum(nd.not_equal(
            predicts, ch_vocab(ch_vocab.padding_token)),
            axis=-1).asnumpy().tolist()
        predicts = aim[:, 1:].asnumpy().tolist()
        label_valid_len = label_valid_len.asnumpy().tolist()
        for refer, hypoth, l_v_len, p_v_len in zip(label, predicts,
                                                   label_valid_len,
                                                   predict_valid_len):
            l_v_len = int(l_v_len)
            p_v_len = int(p_v_len)
            refer = refer[:l_v_len]
            refer_str = [ch_vocab.idx_to_token[int(idx)] for idx in refer]
            hypoth_str = [ch_vocab.idx_to_token[int(idx)] for idx in hypoth]
            # keep hypothesis tokens up to and including the first EOS
            hypoth_str_valid = []
            for token in hypoth_str:
                if token == EOS:
                    hypoth_str_valid.append(token)
                    break
                hypoth_str_valid.append(token)
            references.append(refer_str)
            hypothesis.append(hypoth_str_valid)
    for refer, hypoth in zip(references, hypothesis):
        score += sentence_bleu([refer],
                               hypoth,
                               smoothing_function=chencherry.method1)
    logger.info("dev sample:")
    logger.info("refer :{}".format(" ".join(references[0]).replace(
        EOS, "[EOS]").replace(ch_vocab.padding_token, "")))
    logger.info("hypoth:{}".format(" ".join(hypothesis[0]).replace(
        EOS, "[EOS]")))
    return score / len(references)
def getMask(q_seq, k_seq):
    # q_seq shape : (batch_size, q_seq_len)
    # k_seq shape : (batch_size, k_seq_len)
    q_len = q_seq.shape[1]
    pad_mask = nd.not_equal(k_seq, 0)
    pad_mask = nd.expand_dims(pad_mask, axis=1)
    pad_mask = nd.broadcast_axes(pad_mask, axis=1, size=q_len)
    return pad_mask
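
# A minimal usage sketch (toy inputs, assumed for illustration) of getMask,
# assuming `from mxnet import nd` and padding id 0: the result has shape
# (batch_size, q_seq_len, k_seq_len).
q = nd.array([[11, 12, 13, 0]])     # (1, 4)
k = nd.array([[21, 22, 0, 0, 0]])   # (1, 5)
mask = getMask(q, k)                # (1, 4, 5)
# Every query position gets the same key row [1, 1, 0, 0, 0], so padded key
# positions can be excluded from the attention scores.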
def hard_example_mining(dist_mat, labels, return_inds=False):
    """For each anchor, find the hardest positive and negative sample.

    Args:
      dist_mat: NDArray, pairwise distance between samples, shape [N, N]
      labels: NDArray, with shape [N]
      return_inds: whether to return the indices. Save time if `False`(?)

    Returns:
      dist_ap: NDArray, distance(anchor, positive); shape [N]
      dist_an: NDArray, distance(anchor, negative); shape [N]
      p_inds: LongTensor, with shape [N];
        indices of selected hard positive samples; 0 <= p_inds[i] <= N - 1
      n_inds: LongTensor, with shape [N];
        indices of selected hard negative samples; 0 <= n_inds[i] <= N - 1

    NOTE: Only consider the case in which all labels have the same number of
      samples, so all anchors can be handled in parallel.
    """
    assert len(dist_mat.shape) == 2
    assert dist_mat.shape[0] == dist_mat.shape[1]
    N = dist_mat.shape[0]

    # shape [N, N]: is_pos[i, j] == 1 when samples i and j share a label
    is_pos = nd.equal(labels.broadcast_to((N, N)),
                      labels.broadcast_to((N, N)).T).astype('float32')
    is_neg = nd.not_equal(labels.broadcast_to((N, N)),
                          labels.broadcast_to((N, N)).T).astype('float32')

    # `dist_ap` means distance(anchor, positive): the farthest sample with
    # the same label, shape [N]
    dist_pos = dist_mat * is_pos
    dist_ap = nd.max(dist_pos, axis=1)

    # `dist_an` means distance(anchor, negative): the closest sample with a
    # different label; adding the row maximum to positive entries keeps them
    # out of the minimum, shape [N]
    dist_neg = dist_mat * is_neg + \
        nd.max(dist_mat, axis=1, keepdims=True) * is_pos
    dist_an = nd.min(dist_neg, axis=1)

    # index gathering from the original PyTorch implementation, not ported:
    # if return_inds:
    #     # shape [N, N]
    #     ind = (labels.new().resize_as_(labels)
    #            .copy_(torch.arange(0, N).long())
    #            .unsqueeze(0).expand(N, N))
    #     # shape [N, 1]
    #     p_inds = torch.gather(
    #         ind[is_pos].contiguous().view(N, -1), 1, relative_p_inds.data)
    #     n_inds = torch.gather(
    #         ind[is_neg].contiguous().view(N, -1), 1, relative_n_inds.data)
    #     # shape [N]
    #     p_inds = p_inds.squeeze(1)
    #     n_inds = n_inds.squeeze(1)
    #     return dist_ap, dist_an, p_inds, n_inds

    return dist_ap, dist_an
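
# A minimal usage sketch of hard_example_mining with a hand-built toy
# distance matrix (assumed for illustration), assuming `from mxnet import nd`.
toy_labels = nd.array([0, 0, 1, 1])
toy_dist = nd.array([[0.0, 0.3, 0.9, 0.7],
                     [0.3, 0.0, 0.8, 0.6],
                     [0.9, 0.8, 0.0, 0.2],
                     [0.7, 0.6, 0.2, 0.0]])
dist_ap, dist_an = hard_example_mining(toy_dist, toy_labels)
# dist_ap == [0.3, 0.3, 0.2, 0.2]  (farthest same-label sample per anchor)
# dist_an == [0.7, 0.6, 0.8, 0.6]  (closest other-label sample per anchor)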
def hard_example_mining(dist_mat, labels):
    assert len(dist_mat.shape) == 2
    assert dist_mat.shape[0] == dist_mat.shape[1]
    N = dist_mat.shape[0]

    # shape [N, N]
    is_pos = nd.equal(labels.broadcast_to((N, N)),
                      labels.broadcast_to((N, N)).T).astype('float32')
    is_neg = nd.not_equal(labels.broadcast_to((N, N)),
                          labels.broadcast_to((N, N)).T).astype('float32')

    dist_pos = dist_mat * is_pos
    dist_ap = nd.max(dist_pos, axis=1)
    dist_neg = dist_mat * is_neg + \
        nd.max(dist_mat, axis=1, keepdims=True) * is_pos
    dist_an = nd.min(dist_neg, axis=1)
    return dist_ap, dist_an
def evaluate(model, dataIterator, ctx, pad=None):
    """
    The Evaluation function
    :param model: model object
    :param dataIterator: data iterator in mxnet
    :param ctx: context
    :param weight: NDArray weight matrix of Weighted SCE
    :param pad: Int padding id
    :param report: Boolean F1 Score report Matrix
    :return:
    """
    loss = gloss.SoftmaxCrossEntropyLoss(sparse_label=False)
    mask = pad is not None
    dataIterator.reset()
    total_loss = 0.0
    total_sample_num = 0
    y_pred, y_true = [], []
    for i, batch in enumerate(dataIterator):
        x = batch.data[0].as_in_context(ctx)
        y = batch.data[1].as_in_context(ctx)
        if mask:
            _mask = nd.not_equal(x, pad)
            pred = model(_mask)
        else:
            pred = model(x)
        # 238 is the hard-coded number of classes for the one-hot targets
        bl = loss(pred, nd.one_hot(y, 238)).as_in_context(ctx)
        total_sample_num += x.shape[0]
        total_loss += nd.sum(bl).asscalar()
        pred = nd.argmax(pred, axis=1)
        y_pred.extend(pred.asnumpy().tolist())
        y_true.extend(y.asnumpy().tolist())
    acc = metrics.accuracy_score(y_pred, y_true)
    # f1 = metrics.f1_score(y_pred, y_true, average='macro')
    avg_L = total_loss / float(total_sample_num)
    # if report:
    #     return avg_L, acc, f1, metrics.classification_report(y_true, y_pred)
    # else:
    #     return avg_L, acc, f1
    return avg_L, acc
def batch_loss(transformer_model, en_sentences, x_en_emb, x_en_idx, y_zh_idx,
               loss):
    batch_size = x_en_emb.shape[0]
    ch2idx, idx2ch = load_ch_vocab()
    y_zh_idx_nd = nd.array(y_zh_idx, ctx=ghp.ctx)
    # decoder input: prepend the BOS index (2) and drop the last target token
    dec_input_zh_idx = nd.concat(
        nd.ones(shape=y_zh_idx_nd[:, :1].shape, ctx=ghp.ctx) * 2,
        y_zh_idx_nd[:, :-1],
        dim=1)
    output = transformer_model(x_en_emb, x_en_idx, dec_input_zh_idx, True)
    predict = nd.argmax(nd.softmax(output, axis=-1), axis=-1)

    # print("input_idx:", dec_input_zh_idx[0])
    # print("predict_idx:", predict[0])
    print("source:", en_sentences[0])
    label_token = []
    for n in range(len(y_zh_idx[0])):
        label_token.append(idx2ch[int(y_zh_idx[0][n])])
    print("target:", "".join(label_token))
    predict_token = []
    for n in range(len(predict[0])):
        predict_token.append(idx2ch[int(predict[0][n].asscalar())])
    print("predict:", "".join(predict_token))

    # token accuracy over non-padding positions (padding index 0)
    is_target = nd.not_equal(y_zh_idx_nd, 0)
    # print(is_target)
    current = nd.equal(y_zh_idx_nd, predict) * is_target
    acc = nd.sum(current) / nd.sum(is_target)
    l = loss(output, y_zh_idx_nd)
    l_mean = nd.sum(l) / batch_size
    return l_mean, acc
        'learning_rate': lr,
        "wd": 0.001
    })
else:
    trainer_name = "sgd"
    trainer = Trainer(model.collect_params(), trainer_name, {
        'learning_rate': lr,
        "wd": 0.001,
        "momentum": 0.8
    })
for batch in train_data:
    x = batch.data[0].as_in_context(ctx)
    y = batch.data[1].as_in_context(ctx)
    with autograd.record(train_mode=True):
        if mask:
            _mask = nd.not_equal(x, index[pad])
            pred = model(_mask)
        else:
            pred = model(x)
        bl = loss(pred, nd.one_hot(y, len(lable_dig))).as_in_context(ctx)
    # import pdb
    # pdb.set_trace()
    bl.backward()
    trainer.step(batch_size)
    epoch_L += nd.sum(bl).asscalar()
t_l, t_acc = evaluate(model, valid_data, ctx)
model.save_parameters("clf_mxnet.params")
msg = '[Epoch {}] , valid acc {:.6f}, valid avg loss {:.6f} with {}'.format(
    epoch, t_acc, t_l, trainer_name)
print(msg)
print(model)
def train_and_valid(src_bert, mt_model, src_vocab, tgt_vocab, train_dataiter,
                    dev_dataiter, trainer, finetune_trainer, epochs, loss_func,
                    ctx, lr, batch_size, params_save_path_root, eval_step,
                    log_step, check_step, label_smooth, logger,
                    num_train_examples, warmup_ratio):
    batches = len(train_dataiter)
    num_train_steps = int(num_train_examples / batch_size * epochs)
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    global_step = 0
    dev_bleu_score = 0
    for epoch in range(epochs):
        for src, tgt, label, src_valid_len, tgt_valid_len in train_dataiter:
            # learning rate strategy: linear warmup, then linear decay
            if global_step < num_warmup_steps:
                new_lr = lr * global_step / num_warmup_steps
            else:
                non_warmup_steps = global_step - num_warmup_steps
                offset = non_warmup_steps / \
                    (num_train_steps - num_warmup_steps)
                new_lr = lr - offset * lr
            trainer.set_learning_rate(new_lr)

            src = src.as_in_context(ctx)
            tgt = tgt.as_in_context(ctx)
            label = label.as_in_context(ctx)
            src_valid_len = src_valid_len.as_in_context(ctx)
            src_token_type = nd.zeros_like(src, ctx=ctx)
            # 1 for real target tokens, 0 for padding
            tgt_mask = nd.not_equal(tgt, tgt_vocab(tgt_vocab.padding_token))

            if label_smooth:
                eps = 0.1
                num_class = len(tgt_vocab.idx_to_token)
                one_hot = nd.one_hot(label, num_class)
                one_hot_label = one_hot * \
                    (1 - eps) + (1 - one_hot) * eps / num_class
            else:
                # plain one-hot targets so `one_hot_label` is always defined
                one_hot_label = nd.one_hot(label, len(tgt_vocab.idx_to_token))

            with autograd.record():
                src_bert_outputs = src_bert(src, src_token_type,
                                            src_valid_len)
                mt_outputs = mt_model(src_bert_outputs, src, tgt)
                loss_mean = loss_func(mt_outputs, one_hot_label, tgt_mask)
            loss_mean.backward()
            loss_scalar = loss_mean.asscalar()
            trainer.step(1)
            finetune_trainer.step(1)

            if global_step and global_step % log_step == 0:
                predicts = nd.argmax(nd.softmax(mt_outputs, axis=-1), axis=-1)
                correct = nd.equal(label, predicts)
                # token accuracy over non-padding positions only
                accuracy = (nd.sum(correct * tgt_mask) /
                            nd.sum(tgt_mask)).asscalar()
                logger.info(
                    "epoch:{}, batch:{}/{}, bleu:{}, acc:{}, loss:{}, (lr:{})"
                    .format(epoch, global_step % batches, batches,
                            dev_bleu_score, accuracy, loss_scalar,
                            trainer.learning_rate))

            if global_step and global_step % check_step == 0:
                predicts = nd.argmax(nd.softmax(mt_outputs, axis=-1), axis=-1)
                refer_sample = src.asnumpy().tolist()
                label_sample = label.asnumpy().tolist()
                pred_sample = predicts.asnumpy().tolist()
                logger.info("train sample:")
                logger.info("refer :{}".format(" ".join([
                    src_vocab.idx_to_token[int(idx)]
                    for idx in refer_sample[0]
                ])).replace(src_vocab.padding_token, ""))
                logger.info("target :{}".format(" ".join([
                    tgt_vocab.idx_to_token[int(idx)]
                    for idx in label_sample[0]
                ])).replace(EOS, "[EOS]").replace(tgt_vocab.padding_token, ""))
                logger.info("predict:{}".format(" ".join([
                    tgt_vocab.idx_to_token[int(idx)]
                    for idx in pred_sample[0]
                ])).replace(EOS, "[EOS]"))

            if global_step and global_step % eval_step == 0:
                dev_bleu_score = eval(src_bert, mt_model, src_vocab, tgt_vocab,
                                      dev_dataiter, logger, ctx=ctx)
                if not os.path.exists(params_save_path_root):
                    os.makedirs(params_save_path_root)
                model_params_file = params_save_path_root + \
                    "src_bert_step_{}.params".format(global_step)
                src_bert.save_parameters(model_params_file)
                logger.info("{} Save Completed.".format(model_params_file))
                model_params_file = params_save_path_root + \
                    "mt_step_{}.params".format(global_step)
                mt_model.save_parameters(model_params_file)
                logger.info("{} Save Completed.".format(model_params_file))

            writer.add_scalar("loss", loss_scalar, global_step)
            global_step += 1
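
# A minimal worked sketch (toy numbers, not from the repo) of the label
# smoothing formula used above, assuming `from mxnet import nd`: with
# eps = 0.1 and 4 classes, the true class keeps 0.9 and every other class
# receives eps / num_class = 0.025.
eps = 0.1
num_class = 4
label = nd.array([2])
one_hot = nd.one_hot(label, num_class)          # [[0. 0. 1. 0.]]
smoothed = one_hot * (1 - eps) + (1 - one_hot) * eps / num_class
# smoothed == [[0.025 0.025 0.9 0.025]]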
        optimizer_params=optimizer_params)
    return _trainer

trainer = get_trainer()
accum_loss = 0.0  # running loss accumulator
for step in tqdm(range(args.max_steps),
                 leave=False,
                 total=args.max_steps,
                 desc='{},{}'.format(args.network,
                                     os.path.basename(args.output_dir))):
    step += start_step
    x, y = next(train_dataloader)
    if args.train_au:
        y = y[:, au_idx].reshape((-1, 1))
    # a weight of 0 drops entries labelled 999 (missing annotation)
    sample_weights = nd.not_equal(y, 999)
    if args.enable_balance_sampler:
        sample_weights = sample_weights * balance_sampler(y)
    x = gluon.utils.split_and_load(x, ctx, even_split=False)
    y = gluon.utils.split_and_load(y, ctx, even_split=False)
    sample_weights = gluon.utils.split_and_load(sample_weights,
                                                ctx,
                                                even_split=False)
    with autograd.record(train_mode=True):
        logits = [net(data) for data in x]
        losses = [
            sigmoid_binary_cross_entropy(logit, label, sample_weight)
            for logit, label, sample_weight in zip(logits, y, sample_weights)
        ]
        for l in losses:
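
# A standalone minimal sketch of the masking idea above, separate from the
# truncated training fragment: it uses Gluon's built-in
# SigmoidBinaryCrossEntropyLoss as an assumed stand-in for the repo's own
# `sigmoid_binary_cross_entropy` helper. Entries labelled with the ignore
# value 999 get sample weight 0 and contribute nothing to the loss.
from mxnet import nd, gluon

loss_fn = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
toy_logits = nd.array([[0.2], [1.5], [-0.7]])
toy_labels = nd.array([[1.0], [999.0], [0.0]])  # second entry unannotated
weights = nd.not_equal(toy_labels, 999)         # [[1.], [0.], [1.]]
per_sample = loss_fn(toy_logits, toy_labels, weights)
# per_sample[1] == 0, so the unannotated entry adds no gradient signal.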