def test2():
    args = Args()
    ner_data_processor = TransformerNerDataProcessor()
    conll_2003 = Path(
        __file__).resolve().parent.parent.parent / "test_data/conll-2003"
    ner_data_processor.set_data_dir(conll_2003)
    labels, label2idx = ner_data_processor.get_labels(default='roberta')
    # train_examples = ner_data_processor.get_train_examples()
    train_examples = ner_data_processor.get_test_examples()
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    # tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    features = transformer_convert_data_to_features(args,
                                                    train_examples[:5],
                                                    label2idx,
                                                    tokenizer,
                                                    max_seq_len=10)

    model = AlbertNerModel.from_pretrained("albert-base-v2",
                                           num_labels=len(label2idx))

    for idx, each_batch in enumerate(
            ner_data_loader(features, batch_size=5, task='test', auto=True)):
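        # run a single forward pass on one batch to sanity-check tensor shapes (note the break below)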
        original_mask = each_batch[1].numpy()
        print(original_mask, original_mask.shape)
        inputs = batch_to_model_inputs(each_batch)
        with torch.no_grad():
            logits, flatted_logits, loss = model(**inputs)
        logits = logits.numpy()
        print(logits)
        print(logits.shape)
        break


def test():
    from pprint import pprint
    roberta_ner_data_processor = TransformerNerDataProcessor()
    conll_2003 = Path(
        __file__).resolve().parent.parent.parent / "test_data/conll-2003"
    roberta_ner_data_processor.set_data_dir(conll_2003)
    labels, label2idx = roberta_ner_data_processor.get_labels(
        default='roberta')
    print(labels, label2idx)

    # train_examples = roberta_ner_data_processor.get_train_examples()
    train_examples = roberta_ner_data_processor.get_test_examples()
    pprint(train_examples[:5], indent=1)
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    # tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    features = transformer_convert_data_to_features(train_examples[:5],
                                                    label2idx,
                                                    tokenizer,
                                                    max_seq_len=10)

    model = RobertaNerModel.from_pretrained("roberta-base",
                                            num_labels=len(label2idx))
    # model = XLNetNerModel.from_pretrained("xlnet-base-cased", num_labels=len(label2idx))

    y_trues, y_preds = [], []
    y_pred, y_true = [], []
    prev_gd = 0
    for idx, each_batch in enumerate(
            ner_data_loader(features, batch_size=5, task='test', auto=True)):
        # the features that make up this batch: features[idx*batch_size:(idx+1)*batch_size]
        print([(fea.input_tokens, fea.guards)
               for fea in features[idx * 5:(idx + 1) * 5]])
        print(each_batch)

        original_tkid = each_batch[0].numpy()
        original_mask = each_batch[1].numpy()
        original_labels = each_batch[3].numpy()
        guards = each_batch[4].numpy()
        print(guards)

        inputs = batch_to_model_inputs(each_batch)

        with torch.no_grad():
            logits, flatted_logits, loss = model(**inputs)
            # convert logits to predicted label ids: argmax over the label dimension of the log-softmax scores
            raw_logits = torch.argmax(torch.nn.functional.log_softmax(logits,
                                                                      dim=2),
                                      dim=2)
            raw_logits = raw_logits.detach().cpu().numpy()

        logits = logits.numpy()
        loss = loss.numpy()

        print(logits.shape)
        # print(loss)

        # tk=token, mk=mask, lb=label, lgt=logits
        for mks, lbs, lgts, gds in zip(original_mask, original_labels,
                                       raw_logits, guards):
            connect_sent_flag = False
            for mk, lb, lgt, gd in zip(mks, lbs, lgts, gds):
                if mk == 0:  # once we hit the first padded position we can stop; everything after it is padding
                    break
                if gd == 0 or prev_gd == gd:
                    continue
                if gd == -2:  # -2 (NEXT_GUARD in _eval): the sentence continues in the next feature row
                    connect_sent_flag = True
                    break
                if prev_gd != gd:
                    y_true.append(lb)
                    y_pred.append(lgt)
                    prev_gd = gd
            if connect_sent_flag:
                continue
            y_trues.append(y_true)
            y_preds.append(y_pred)
            y_pred, y_true = [], []
            prev_gd = 0
        print(y_trues)
        print(y_preds)
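

# Minimal, self-contained sketch (not part of the original module; the function name
# is ours) showing how the guard-based loop above collapses sub-word predictions back
# to word level. Guard semantics assumed from test() and _eval(): 0 = special/padding
# position, a repeated id = continuation sub-word piece of the same word, and -2
# (NEXT_GUARD) = the sentence continues in the next feature row (not exercised here).
def _guard_alignment_demo():
    mask = [1, 1, 1, 1, 1, 0]
    labels = [3, 1, 1, 2, 3, 0]
    preds = [3, 1, 2, 2, 3, 0]
    guards = [0, 1, 1, 2, 3, 0]  # word 1 was split into two sub-word pieces
    y_true, y_pred, prev_gd = [], [], 0
    for mk, lb, lgt, gd in zip(mask, labels, preds, guards):
        if mk == 0:  # padding starts here
            break
        if gd == 0 or prev_gd == gd:  # special token or continuation piece
            continue
        y_true.append(lb)
        y_pred.append(lgt)
        prev_gd = gd
    # only the first sub-word piece of each word is kept:
    # y_true == [1, 2, 3] and y_pred == [1, 2, 3]
    return y_true, y_pred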


def _eval(args, model, features):
    """Common evaluation of test data with a pre-trained model; shared by eval and predict."""
    data_loader = ner_data_loader(features,
                                  batch_size=args.eval_batch_size,
                                  task='test',
                                  auto=True)
    eval_size = len(data_loader)
    args.logger.info(
        "***** Running evaluation on {} batches of test data *****".format(
            eval_size))
    args.logger.info("  Instantaneous batch size per GPU = {}".format(
        args.eval_batch_size))
    args.logger.info("******************************")

    # prepare processing results for each batch
    y_trues, y_preds = [], []
    y_pred, y_true = [], []
    prev_gd = 0

    # prediction
    model.eval()
    eval_loss = .0
    for batch in tqdm(data_loader,
                      desc='evaluation',
                      disable=not args.progress_bar):
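        # keep CPU copies of the raw batch tensors for the alignment step below:
        # index 0 = token ids, 1 = attention mask, 3 = label ids, 4 = sub-word guard ids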
        original_tkid = batch[0].numpy()
        original_mask = batch[1].numpy()
        original_labels = batch[3].numpy()
        guards = batch[4].numpy()

        batch = tuple(b.to(args.device) for b in batch)
        eval_inputs = batch_to_model_inputs(batch, args.model_type)

        with torch.no_grad():
            raw_logits, _, loss = model(**eval_inputs)
            # convert logits to predicted label ids: argmax over the label dimension
            # of the log-softmax scores; a CRF head already returns decoded label ids,
            # so this step is skipped for it (see the shape assert below)
            if not args.use_crf:
                raw_logits = torch.argmax(F.log_softmax(raw_logits, dim=2),
                                          dim=2)
            raw_logits = raw_logits.detach().cpu().numpy()
            # update evaluate loss
            eval_loss += loss.item()

        assert guards.shape == original_tkid.shape == original_mask.shape == original_labels.shape == raw_logits.shape, \
            """
                expected identical shapes for all inputs and outputs but got
                guards: {}
                input_tokens: {}
                mask: {}
                label: {}
                logits: {}
            """.format(guards.shape, original_tkid.shape, original_mask.shape, original_labels.shape, raw_logits.shape)

        # tk=token, mk=mask, lb=label, lgt=logits
        for mks, lbs, lgts, gds in zip(original_mask, original_labels,
                                       raw_logits, guards):
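            # keep only the first sub-word piece of every original word (i.e. whenever
            # the guard id changes); guard id 0 marks special/padding positions and
            # NEXT_GUARD marks a sentence that spills over into the next feature row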
            connect_sent_flag = False
            for mk, lb, lgt, gd in zip(mks, lbs, lgts, gds):
                if mk == 0:  # once we hit the first padded position we can stop; everything after it is padding (not for xlnet, which pads on the left)
                    if args.model_type == "xlnet":
                        continue
                    else:
                        break
                if gd == 0 or prev_gd == gd:
                    continue
                if gd == NEXT_GUARD:
                    connect_sent_flag = True
                    break
                if prev_gd != gd:
                    y_true.append(args.idx2label[lb])
                    y_pred.append(args.idx2label[lgt])
                    prev_gd = gd
            if connect_sent_flag:
                continue
            y_trues.append(y_true)
            y_preds.append(y_pred)
            y_pred, y_true = [], []
            prev_gd = 0

    return y_trues, y_preds, round(eval_loss / eval_size, 4)
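

# Illustrative sketch only (not part of the original module; the function name is ours):
# one way to turn the (y_trues, y_preds) tag sequences returned by _eval() into an
# entity-level score. Assumes the third-party seqeval package is installed; the repo's
# own evaluate() may compute its metrics differently.
def _score_sketch(y_trues, y_preds):
    from seqeval.metrics import classification_report, f1_score
    print(classification_report(y_trues, y_preds))
    return f1_score(y_trues, y_preds)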


def train(args, model, train_features, dev_features):
    """Train the NER model on the train set; select the best model based on dev-set performance."""
    # create data loader
    data_loader = ner_data_loader(train_features,
                                  batch_size=args.train_batch_size,
                                  task='train',
                                  auto=True)
    # total training step counts
    t_total = len(data_loader
                  ) // args.gradient_accumulation_steps * args.num_train_epochs

    # parameters for optimization
    no_decay = ['bias', 'LayerNorm.weight']
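    # do not apply weight decay to bias and LayerNorm parameters (standard practice when fine-tuning transformers)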
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)

    # using fp16 for training rely on Nvidia apex package
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # training linear warm-up setup
    scheduler = None
    if args.do_warmup:
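        # warm the learning rate up linearly over the first warmup_ratio fraction of total optimization steps, then decay it linearly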
        warmup_steps = np.dtype('int64').type(args.warmup_ratio * t_total)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)

    args.logger.info("***** Running training *****")
    args.logger.info("  Num data points = {}".format(len(data_loader)))
    args.logger.info("  Num Epochs = {}".format(args.num_train_epochs))
    args.logger.info("  Instantaneous batch size per GPU = {}".format(
        args.train_batch_size))
    args.logger.info("  Gradient Accumulation steps = {}".format(
        args.gradient_accumulation_steps))
    args.logger.info("  Total optimization steps = {}".format(t_total))
    args.logger.info(
        "  Training steps (number of steps between two evaluations on dev) = {}"
        .format(args.train_steps * args.gradient_accumulation_steps))
    args.logger.info("******************************")

    # create directory to save model
    new_model_dir = Path(args.new_model_dir)
    new_model_dir.mkdir(parents=True, exist_ok=True)
    # save label2idx json in new model directory
    json_dump(args.label2idx, new_model_dir / "label2idx.json")

    # save base model name to a base_model_name.txt
    with open(new_model_dir / "base_model_name.txt", "w") as f:
        f.write(
            'model_type: {}\nbase_model: {}\nconfig: {}\ntokenizer: {}'.format(
                args.model_type, args.pretrained_model, args.config_name,
                args.tokenizer_name))

    global_step = 0
    tr_loss = .0
    best_score, epoch_best_score = .0, .0
    early_stop_flag = 0

    model.zero_grad()
    epoch_iter = trange(int(args.num_train_epochs),
                        desc="Epoch",
                        disable=not args.progress_bar)
    for epoch in epoch_iter:
        batch_iter = tqdm(iterable=data_loader,
                          desc='Batch',
                          disable=not args.progress_bar)
        for step, batch in enumerate(batch_iter):
            model.train()
            batch = tuple(b.to(args.device) for b in batch)
            train_inputs = batch_to_model_inputs(batch, args.model_type)
            _, _, loss = model(**train_inputs)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            tr_loss += loss.item()
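            # step the optimizer (and scheduler) only once every gradient_accumulation_steps mini-batches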
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                if args.do_warmup:
                    scheduler.step()
                model.zero_grad()
                global_step += 1

            # step-based evaluation: evaluate on dev every args.train_steps global steps
            if args.train_steps > 0 and (
                    global_step + 1) % args.train_steps == 0 and epoch > 0:
                # note: the epoch > 0 condition above skips all step-based evaluations during the first epoch
                best_score, eval_loss = evaluate(args, model, new_model_dir,
                                                 dev_features, epoch,
                                                 global_step, best_score)
                args.logger.info("""
                Global step: {}; 
                Epoch: {}; 
                average_train_loss: {:.4f}; 
                eval_loss: {:.4f}; 
                current best score: {:.4f}""".format(
                    global_step, epoch + 1, round(tr_loss / global_step, 4),
                    eval_loss, best_score))

        # default model selection method uses strict F1-score with beta=1; evaluate the model on dev after each epoch
        if args.train_steps <= 0 or epoch == 0:
            best_score, eval_loss = evaluate(args, model, new_model_dir,
                                             dev_features, epoch, global_step,
                                             best_score)
            args.logger.info("""
                Global step: {}; 
                Epoch: {}; 
                average_train_loss: {:.4f}; 
                eval_loss: {:.4f}; 
                current best score: {:.4f}""".format(
                global_step, epoch + 1, round(tr_loss / global_step, 4),
                eval_loss, best_score))

        # early stop check
        if epoch_best_score < best_score:
            epoch_best_score = best_score
            early_stop_flag = 0
        else:
            early_stop_flag += 1

        if 0 < args.early_stop <= early_stop_flag:
            args.logger.warning(
                'Early stop activated; performance is no longer improving.')
            break