def test_adam(self):
    """BertAdam (no warmup, constant LR, no clipping) should drive a
    free tensor to its MSE target within 100 steps."""
    weight = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
    target = torch.tensor([0.4, 0.2, -0.5])
    mse = torch.nn.MSELoss()
    # No warmup, constant schedule, no gradient clipping.
    optimizer = BertAdam(params=[weight], lr=2e-1,
                         weight_decay=0.0,
                         max_grad_norm=-1)
    for _ in range(100):
        mse(weight, target).backward()
        optimizer.step()
        # Plain tensors have no zero_grad(); clear the gradient by hand.
        weight.grad.detach_()
        weight.grad.zero_()
    self.assertListAlmostEqual(weight.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
def main():
    """Fine-tune a BERT multiple-choice model on SWAG and optionally evaluate.

    Behaviour is driven entirely by command-line flags: ``--do_train`` runs
    fine-tuning and saves ``pytorch_model.bin`` to ``--output_dir``;
    ``--do_eval`` evaluates on the dev set and writes ``eval_results.txt``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default="../bert_pytorch/tasks/MultipleChoice/swag_data/",
        type=str,
        required=False,
        help=
        "The input squad_data dir. Should contain the .csv files (or other squad_data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default='converted/base-uncased',
        type=str,
        required=False,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default='tasks/MultipleChoice/swag_output/',
        type=str,
        required=False,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=80,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--vocab_size",
                        default=30522,
                        type=int,
                        help="The size of vocabulary.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=4,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--num_train_steps",
        default=None,
        type=int,
        help="Total number of optimizer steps (warmup schedule horizon). "
        "Derived from the training set size when omitted.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=4,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    ###### config setting ######

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # From here on, train_batch_size is the per-forward-pass batch size.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    ###### fastNLP.DataSet loading ######

    train_data, dev_data = load_dataset(args)

    ###### model initializing ######

    config = json.load(open(os.path.join(args.bert_model, BERT_CONFIG), "r"))
    model = BertMC(args.vocab_size, num_choices=4, **config)
    model.load(os.path.join(args.bert_model, MODEL_NAME))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    ###### optimizer initializing ######

    # Parameters matching `no_decay` (biases and LayerNorm weights) are
    # excluded from weight decay, as in the reference BERT implementation.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.a_2', 'LayerNorm.b_2']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    # Bug fix: the original read `args.num_train_steps`, which was never
    # defined by the parser and raised AttributeError. When the flag is
    # omitted, derive the schedule horizon from the training set size.
    if args.num_train_steps is not None:
        t_total = args.num_train_steps
    else:
        # NOTE(review): assumes len(train_data) is the number of training
        # examples — confirm for fastNLP.DataSet.
        t_total = int(len(train_data) / args.train_batch_size /
                      args.gradient_accumulation_steps *
                      args.num_train_epochs)
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    # Bug fix: `criterion` was previously created only under --do_train, so
    # running --do_eval on its own raised NameError in the eval loop.
    criterion = nn.CrossEntropyLoss()

    global_step = 0
    if args.do_train:
        train_dataloader = DataLoader(train_data,
                                      sampler=RandomSampler(train_data),
                                      batch_size=args.train_batch_size)
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):

            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                logits = model(x=input_ids,
                               segment_info=segment_ids,
                               mask=input_mask)['pred']

                loss = criterion(logits, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # average over DataParallel replicas
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                loss.backward()

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # Modify learning rate with the special warmup BERT uses.
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model (unwrap DataParallel to save the model itself).
    model_to_save = model.module if hasattr(model, 'module') else model
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)

    # Bug fix: only reload the fine-tuned checkpoint when it exists; the
    # original unconditionally loaded it and crashed when --do_eval was run
    # without --do_train against a fresh output_dir.
    if os.path.exists(output_model_file):
        model.load(output_model_file)
        model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):

        eval_dataloader = DataLoader(dev_data,
                                     sampler=SequentialSampler(dev_data),
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(x=input_ids,
                               segment_info=segment_ids,
                               mask=input_mask)['pred']
                tmp_eval_loss = criterion(logits, label_ids)
                if n_gpu > 1:
                    tmp_eval_loss = tmp_eval_loss.mean()

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        # Training loss of the *last* epoch only (tr_loss resets per epoch).
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    """Fine-tune a BERT question-answering model on SQuAD and/or predict.

    ``--do_train`` fine-tunes and saves ``pytorch_model.bin`` to
    ``--output_dir``; ``--do_predict`` writes ``predictions.json`` and
    ``nbest_predictions.json`` for the file given by ``--predict_file``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model",
        default='pretrained/bert-base-uncased',
        type=str,
        required=False,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default='tasks/QuestionAnswering/squad_output',
        type=str,
        required=False,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--train_file",
        default='tasks/QuestionAnswering/squad_data/train-v1.1.json',
        type=str,
        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default='tasks/QuestionAnswering/squad_data/dev-v1.1.json',
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument("--vocab_size",
                        default=30522,
                        type=int,
                        help="The size of vocabulary.")
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument(
        "--do_train", default=1, type=int,
        help="Whether to run training.")  # , action='store_true'
    parser.add_argument(
        "--do_predict",
        default=1,
        type=int,
        help="Whether to run eval on the dev set.")  # , action='store_true'
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=2.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--num_train_steps",
        default=None,
        type=int,
        help="Total number of optimizer steps (warmup schedule horizon). "
        "Derived from the training set size when omitted.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        action='store_true',
        help=
        "If true, all of the warnings related to squad_data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=83,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        default=1,
        type=int,
        # action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # Consistency fix: account for gradient accumulation (the original kept
    # the full batch size even though the accumulation flag exists; with the
    # default of 1 this is a no-op).
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        # Bug fix: the original message had an empty "()" placeholder.
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_data, dev_data = load_dataset(args)

    # Prepare model
    config = json.load(open(os.path.join(args.bert_model, BERT_CONFIG), "r"))
    model = BertQA(args.vocab_size, **config)
    model.load(os.path.join(args.bert_model, MODEL_NAME))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.a_2', 'LayerNorm.b_2']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    # Bug fix: the original read `args.num_train_steps`, which was never
    # defined by the parser and raised AttributeError. When the flag is
    # omitted, derive the schedule horizon from the training set size.
    if args.num_train_steps is not None:
        t_total = args.num_train_steps
    else:
        # NOTE(review): assumes len(train_data) is the number of training
        # examples — confirm for the dataset type returned by load_dataset.
        t_total = int(len(train_data) / args.train_batch_size /
                      args.gradient_accumulation_steps *
                      args.num_train_epochs)
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if args.do_train:
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(
                    t.to(device)
                    for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                logits = model(x=input_ids,
                               segment_info=segment_ids,
                               mask=input_mask)
                logits_start = logits['pred_start']
                logits_end = logits['pred_end']

                # Answers outside the (possibly truncated) span are clamped to
                # `ignored_index`, one past the last valid position.
                ignored_index = logits_start.size(1)
                start_positions.clamp_(0, ignored_index)
                end_positions.clamp_(0, ignored_index)

                # Bug fix: a plain CrossEntropyLoss raises (target out of
                # range) for positions clamped to `ignored_index`; the
                # reference implementation excludes them via ignore_index.
                criterion = nn.CrossEntropyLoss(ignore_index=ignored_index)
                loss = (criterion(logits_start, start_positions) +
                        criterion(logits_end, end_positions)) / 2

                if n_gpu > 1:
                    loss = loss.mean()  # average over DataParallel replicas
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                # Bug fix: the step condition was `(step + 1) % 1 == 0`,
                # which ignored --gradient_accumulation_steps entirely.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model (unwrap DataParallel to save the model itself).
    model_to_save = model.module if hasattr(model, 'module') else model
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model.load(output_model_file)
    model.to(device)

    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(input_file=args.predict_file,
                                            is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        # Run prediction for full squad_data.
        # NOTE(review): the dataloader iterates `dev_data` from load_dataset
        # while `eval_features` is rebuilt from --predict_file; the
        # example_indices below assume the two are aligned — confirm.
        eval_sampler = SequentialSampler(dev_data)
        eval_dataloader = DataLoader(dev_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                logits = model(x=input_ids,
                               segment_info=segment_ids,
                               mask=input_mask)
                batch_start_logits = logits['pred_start']
                batch_end_logits = logits['pred_end']

            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, args.verbose_logging)
# Example #4
# 0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The train file path")
    parser.add_argument("--eval_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The dev file path")
    parser.add_argument("--predict_file",
                        default=None,
                        type=str,
                        required=False,
                        help="The predict file path")
    parser.add_argument("--predict_result_file",
                        default='datas/result.csv',
                        type=str,
                        required=False,
                        help="The predict result file path")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help=
        "The config json file corresponding to the pre-trained BERT model. \n"
        "This specifies the model architecture.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=250,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--load_checkpoint",
                        default=False,
                        action='store_true',
                        help="Whether to run load checkpoint.")
    parser.add_argument("--num_labels",
                        default=1,
                        type=int,
                        help="mapping classify nums")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--epoches",
                        default=6,
                        type=int,
                        help="Total epoch numbers for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=6.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )

    args = parser.parse_args()
    vocab_path = os.path.join(args.bert_model, VOCAB_NAME)
    # bert_config = BertConfig.from_json_file(vocab_path)
    data_processor = DataProcessor()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # if args.do_train:
    #     if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
    #         raise ValueError("Output directory ({}) already exists and is not empty.".format(
    #             args.output_dir))
    #     else:
    #         os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path,
                                           do_lower_case=args.do_lower_case)
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          num_labels=3)
    for k, v in model.state_dict().items():
        print(f'k = {k}, v.grad = {v.grad}')
    model.to(device)

    # model = torch.nn.DataParallel(model)

    def evaluating(model, eval_dataloader):
        model.eval()
        eval_loss = 0
        logits, labels = [], []
        for step, batch in enumerate(eval_dataloader):
            input_ids, input_mask, segment_ids, label_ids = [
                b.to(device) for b in batch
            ]
            with torch.no_grad():
                loss, logit = model(input_ids, segment_ids, input_mask,
                                    label_ids)
                loss = loss.mean()
            eval_loss = loss * args.gradient_accumulation_steps if step == 0 else eval_loss + loss * args.gradient_accumulation_steps
            logit = torch.argmax(logit, dim=-1)
            logits.extend(logit.tolist())
            labels.extend(label_ids.tolist())
        return (eval_loss.item() / step, logits, labels)

    def predicting(model, dataloader):
        model.eval()
        logits, example_ids = [], []
        for step, batch in enumerate(dataloader):
            if step % 100 == 0:
                print(f'当前预测进度: {step}/{len(dataloader)}')
            input_ids, input_mask, segment_ids, label_ids = [
                b.to(device) for b in batch
            ]
            with torch.no_grad():
                logit = model(input_ids, segment_ids, input_mask)
            logit = torch.argmax(logit, dim=-1)
            logits.extend(logit.tolist())
            example_ids.extend(label_ids.tolist())
        return logits, example_ids

    def eval_meric(model, data_loader):
        eval_loss, all_logits, all_labels = evaluating(model, data_loader)
        accuracy(all_labels, all_logits)
        logger.info(f'Average eval loss = {eval_loss}')
        return eval_loss

    def write_predict_file(model, data_loader, file_path):
        """Write predictions to *file_path* as CSV (format: '五彩滨云-final.csv').

        One row per example: id, predicted label, and the metadata stored in
        ``data_processor.eval_dict`` (time, author, like count, passage).
        """
        logits, ids = predicting(model, data_loader)
        assert len(ids) == len(logits)
        logger.info(
            f'zero nums {logits.count(0)}, one nums {logits.count(1)}, two nums {logits.count(2)}'
        )
        # eval_dict[id] layout (inferred from the indices used below):
        # (passage, label, author, like_count, time) — TODO confirm.
        labels = [data_processor.eval_dict[ex_id][1] for ex_id in ids]
        assert len(labels) == len(logits)
        passages = [data_processor.eval_dict[ex_id][0] for ex_id in ids]
        autors = [data_processor.eval_dict[ex_id][2] for ex_id in ids]
        like_counts = [data_processor.eval_dict[ex_id][3] for ex_id in ids]
        times = [data_processor.eval_dict[ex_id][4] for ex_id in ids]

        assert len(labels) == len(passages)
        # NOTE(review): the original computed a prediction==label match array
        # here but never used it; the 'match' column below stays blank as in
        # the original output file, so the dead computation was removed.
        data_df = pd.DataFrame({
            'id': ids,
            'pred': logits,
            'time': times,
            'match': '',
            'autors': autors,
            'like_counts': like_counts,
            'passage': passages
        })
        data_df.to_csv(file_path, index=None)

    # Build the evaluation dataloader once; it is also reused for the
    # periodic evaluation inside the (disabled) training block below.
    eval_examples = data_processor.get_examples(args.eval_file,
                                                data_type='eval')

    eval_features = convert_examples_to_features(args, eval_examples,
                                                 args.max_seq_length,
                                                 tokenizer)
    eval_loader = ParaDataloader(eval_features)
    # shuffle=False keeps predictions aligned with example order.
    eval_loader = DataLoader(eval_loader,
                             shuffle=False,
                             batch_size=args.eval_batch_size)

    if 0:  # Training path is disabled; flip to a CLI flag (e.g. args.do_train) to enable.
        # Read training data.
        train_examples = data_processor.get_examples(args.train_file,
                                                     data_type='train')

        # Convert examples to model features.
        train_features = convert_examples_to_features(args, train_examples,
                                                      args.max_seq_length,
                                                      tokenizer)

        # Total optimizer steps = batches per epoch (after gradient
        # accumulation) times number of epochs; used for LR warmup schedule.
        num_train_steps = int(
            len(train_features) // args.train_batch_size //
            args.gradient_accumulation_steps * args.num_train_epochs)

        train_loader = ParaDataloader(train_features)
        train_loader = DataLoader(train_loader,
                                  shuffle=True,
                                  batch_size=args.train_batch_size)

        model.zero_grad()
        # Exempt bias/LayerNorm parameters from weight decay.  Use a
        # substring test (as done elsewhere in this file): parameter names
        # are dotted paths like 'bert...LayerNorm.weight', so the original
        # exact-match test (`n not in no_decay`) matched nothing and decayed
        # every parameter.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.01
        }, {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)
        tr_loss = None
        for epoch in range(args.epoches):
            model.train()
            # NOTE(review): reset each epoch; move above the epoch loop to
            # track the global best eval loss across epochs.
            min_eval_loss = 10000
            for step, batch in enumerate(train_loader):
                input_ids, input_mask, segment_ids, label_ids = [
                    b.to(device) for b in batch
                ]

                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                loss = loss.mean()  # average across DataParallel replicas
                print(f'loss = {loss}')
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss = loss * args.gradient_accumulation_steps if step == 0 else tr_loss + loss * args.gradient_accumulation_steps
                # NOTE(review): optimizer.step() runs every batch even when
                # gradient_accumulation_steps > 1 — confirm this is intended.
                optimizer.step()
                optimizer.zero_grad()
                if step % 1000 == 1:
                    eval_loss = eval_meric(model, eval_loader)
                    if eval_loss < min_eval_loss:
                        # Fix: record the new best loss so we only
                        # checkpoint on an actual improvement.
                        min_eval_loss = eval_loss
                        save_checkpoint(model, epoch, args.output_dir)

    if args.do_predict:
        if args.load_checkpoint:
            # Restore a previously fine-tuned checkpoint before predicting.
            # NOTE(review): checkpoint path is hard-coded — consider a CLI arg.
            state_dict = torch.load('output/pytorch_model-0004.bin')
            model.load_state_dict(state_dict)
        logger.info(f'Start to predict......')
        # With --do_eval, predict over the labelled eval file; otherwise
        # over the unlabelled prediction file.
        if args.do_eval:
            predict_examples = data_processor.get_eval_examples(args.eval_file)
        else:
            predict_examples = data_processor.get_predict_examples(
                args.predict_file)

        predict_features = convert_examples_to_features(
            args, predict_examples, args.max_seq_length, tokenizer)
        predict_loader = ParaDataloader(predict_features)
        # shuffle=False keeps row order stable in the output file.
        predict_loader = DataLoader(predict_loader,
                                    shuffle=False,
                                    batch_size=args.eval_batch_size)
        write_predict_file(model, predict_loader, args.predict_result_file)
def main():
    """CLI entry point for training/evaluating a BERT token-flaw discriminator.

    Parses command-line arguments, configures the device (single GPU,
    distributed, or CPU) and random seeds, builds the model and a BertAdam
    optimizer with weight-decay parameter groups, then optionally runs the
    training loop (``--do_train``) and the evaluation loop (``--do_eval``),
    writing metrics and per-example discriminator outputs to disk.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--word_embedding_file",
                        default='./emb/wiki-news-300d-1M.vec',
                        type=str,
                        help="The input directory of word embeddings.")
    parser.add_argument("--index_path",
                        default='./emb/p_index.bin',
                        type=str,
                        help="The input directory of word embedding index.")
    parser.add_argument("--word_embedding_info",
                        default='./emb/vocab_info.txt',
                        type=str,
                        help="The input directory of word embedding info.")
    parser.add_argument("--data_file",
                        default='',
                        type=str,
                        help="The input directory of input data file.")

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--max_ngram_length",
                        default=16,
                        type=int,
                        help="The maximum total ngram sequence")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--embedding_size",
                        default=300,
                        type=int,
                        help="Total batch size for embeddings.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--num_eval_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of eval epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--single',
                        action='store_true',
                        help="Whether only evaluate a single epoch")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()
    # Comment the if else block for no CUDA
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #device = torch.device("cpu") # uncomment this for no GPU
    logger.info(
        "device: {} , distributed training: {}, 16-bits training: {}".format(
            device, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The CLI batch size is the effective batch; divide by the accumulation
    # steps to get the per-step micro-batch size.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:  # Comment this to No GPU
        torch.cuda.manual_seed_all(args.seed)  # Comment this for No GPU

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    w2i, i2w, vocab_size = {}, {}, 1
    if args.do_train:

        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

        train_features, w2i, i2w, vocab_size = convert_examples_to_features_disc_train(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Num token vocab = %d", vocab_size)
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_tokens = torch.tensor([f.token_ids for f in train_features],
                                  dtype=torch.long)
        all_label_id = torch.tensor([f.label_id for f in train_features],
                                    dtype=torch.long)

    # load embeddings sa
    if args.do_train:
        logger.info("Loading word embeddings ... ")
        emb_dict, emb_vec, vocab_list, emb_vocab_size = load_vectors(
            args.word_embedding_file)
        if not os.path.exists(args.index_path):

            write_vocab_info(args.word_embedding_info, emb_vocab_size,
                             vocab_list)
            p = load_embeddings_and_save_index(range(emb_vocab_size), emb_vec,
                                               args.index_path)
        else:
            #emb_vocab_size, vocab_list = load_vocab_info(args.word_embedding_info)
            p = load_embedding_index(args.index_path,
                                     emb_vocab_size,
                                     num_dim=args.embedding_size)
        #emb_dict, emb_vec, vocab_list, emb_vocab_size, p = None, None, None, None, None

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
            args.local_rank))
    model = BertForDiscriminator.from_pretrained(args.bert_model,
                                                 cache_dir=cache_dir,
                                                 num_labels=num_labels)
    model.to(device)

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:  # Comment this for NO GPU
        model = torch.nn.DataParallel(model)  # Comment this for NO GPU

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # bias and LayerNorm parameters are exempt from weight decay
    # (substring match against dotted parameter names).
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    # NOTE(review): in eval-only runs num_train_optimization_steps is still
    # None here — confirm BertAdam accepts t_total=None before relying on it.
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 1
    tr_loss = 0
    if args.do_train:

        train_data = TensorDataset(all_tokens, all_label_id)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for ind in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            flaw_eval_f1 = []
            flaw_eval_recall = []
            flaw_eval_precision = []
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                tokens, _ = batch  #, label_id, ngram_ids, ngram_labels, ngram_masks

                # module1: learn a discriminator
                tokens = tokens.to('cpu').numpy()
                #print("PRINTING TOKENS!!!!!!!!! ", len(tokens[0]))
                train_features = convert_examples_to_features_flaw(
                    tokens, args.max_seq_length, args.max_ngram_length,
                    tokenizer, i2w, emb_dict, p, vocab_list)

                flaw_mask = torch.tensor([f.flaw_mask for f in train_features],
                                         dtype=torch.long).to(
                                             device)  # [1, 1, 1, 1, 0,0,0,0]
                flaw_ids = torch.tensor([f.flaw_ids for f in train_features],
                                        dtype=torch.long).to(
                                            device)  # [12,25,37,54,0,0,0,0]
                flaw_labels = torch.tensor(
                    [f.flaw_labels for f in train_features],
                    dtype=torch.long).to(device)  # [0, 1, 1, 1, 0,0,0,0]

                loss, logits = model(flaw_ids, flaw_mask, flaw_labels)
                logits = logits.detach().cpu().numpy()

                if n_gpu > 1:  # Comment this for NO GPU
                    loss = loss.mean()  # Comment this for NO GPU

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()

                nb_tr_examples += flaw_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:

                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                # eval during training
                flaw_labels = flaw_labels.to('cpu').numpy()

                flaw_tmp_eval_f1, flaw_tmp_eval_recall, flaw_tmp_eval_precision = f1_3d(
                    logits, flaw_labels)
                flaw_eval_f1.append(flaw_tmp_eval_f1)
                flaw_eval_recall.append(flaw_tmp_eval_recall)
                flaw_eval_precision.append(flaw_tmp_eval_precision)

                nb_eval_examples += flaw_ids.size(0)
                nb_eval_steps += 1

            flaw_f1 = sum(flaw_eval_f1) / len(flaw_eval_f1)
            flaw_recall = sum(flaw_eval_recall) / len(flaw_eval_recall)
            flaw_precision = sum(flaw_eval_precision) / len(
                flaw_eval_precision)
            loss = tr_loss / nb_tr_steps if args.do_train else None
            result = {
                'flaw_f1': flaw_f1,
                "flaw_recall": flaw_recall,
                "flaw_precision": flaw_precision,
                'loss': loss,
            }

            output_eval_file = os.path.join(args.output_dir,
                                            "train_results.txt")
            with open(output_eval_file, "a") as writer:
                #logger.info("***** Training results *****")
                writer.write("epoch" + str(ind) + '\n')
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write('\n')

            # Checkpoint the (unwrapped) model and its config every epoch.
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(args.output_dir,
                                             "epoch" + str(ind) + WEIGHTS_NAME)
            torch.save(model_to_save.state_dict(), output_model_file)
            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
            with open(output_config_file, 'w') as f:
                f.write(model_to_save.config.to_json_string())

        os.rename(
            output_model_file,
            os.path.join(args.output_dir, "disc_trained_" + WEIGHTS_NAME))
        current_path = os.path.join(args.output_dir,
                                    "disc_trained_" + WEIGHTS_NAME)
        new_path = os.path.join('./models', "disc_trained_" + WEIGHTS_NAME)
        # FIX: was os.path.join('./models' + CONFIG_NAME) — the missing comma
        # concatenated directory and file name into one path component.
        new_path_config = os.path.join('./models', CONFIG_NAME)
        shutil.move(current_path, new_path)
        shutil.move(output_config_file, new_path_config)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank()
                         == 0):  # for trouble-shooting

        eval_examples = processor.get_disc_dev_examples(args.data_file)
        eval_features, w2i, i2w, vocab_size = convert_examples_to_features_disc_eval(
            eval_examples, label_list, args.max_seq_length, tokenizer, w2i,
            i2w, vocab_size)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Num token vocab = %d", vocab_size)
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_token_ids = torch.tensor([f.token_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_flaw_labels = torch.tensor([f.flaw_labels for f in eval_features],
                                       dtype=torch.long)
        all_flaw_ids = torch.tensor([f.flaw_ids for f in eval_features],
                                    dtype=torch.long)
        all_label_id = torch.tensor([f.label_id for f in eval_features],
                                    dtype=torch.long)
        all_chunks = torch.tensor([f.chunks for f in eval_features],
                                  dtype=torch.long)
        #print("flaw ids in eval_features: ", all_flaw_ids)

        eval_data = TensorDataset(all_token_ids, all_input_ids, all_input_mask,
                                  all_flaw_ids, all_flaw_labels, all_label_id,
                                  all_chunks)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Load a trained model and config that you have fine-tuned
        if args.single:
            eval_range = trange(int(args.num_eval_epochs),
                                int(args.num_eval_epochs + 1),
                                desc="Epoch")
        else:
            eval_range = trange(int(args.num_eval_epochs), desc="Epoch")

        attack_type = 'rand'
        for epoch in eval_range:

            # Truncate/create the per-epoch output file and write its header.
            output_file = os.path.join(
                args.data_dir, "epoch" + str(epoch) + "disc_eval_outputs_" +
                attack_type + ".tsv")
            with open(output_file, "w") as csv_file:
                writer = csv.writer(csv_file, delimiter='\t')
                writer.writerow(["sentence", "label", "ids"])

            #output_model_file = os.path.join(args.output_dir, "epoch"+str(epoch)+WEIGHTS_NAME)
            output_model_file = os.path.join(args.output_dir,
                                             "disc_trained_" + WEIGHTS_NAME)
            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
            #print("output_model_file: ", output_model_file)
            config = BertConfig(output_config_file)
            model = BertForDiscriminator(config, num_labels=num_labels)
            model.load_state_dict(torch.load(output_model_file))

            model.to(device)
            model.eval()
            predictions, truths = [], []
            eval_loss, nb_eval_steps, nb_eval_examples = 0, 0, 0
            eval_accuracy = 0

            for token_ids, input_ids, input_mask, flaw_ids, flaw_labels, label_id, chunks in tqdm(
                    eval_dataloader, desc="Evaluating"):

                token_ids = token_ids.to(device)
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                flaw_labels = flaw_labels.to(device)
                flaw_ids = flaw_ids.to(device)

                #print("flaw ids in eval_dataloader: ", flaw_ids)

                with torch.no_grad():
                    tmp_eval_loss, s = model(input_ids, input_mask,
                                             flaw_labels)

                    #                     print("tmp_eval_loss: ",tmp_eval_loss)
                    #                     print("s: ",s)

                    logits = model(input_ids, input_mask)

                    # Debug traces; remove once tensor shapes are confirmed.
                    print("len of logits: ", len(logits))
                    print("shape of logits: ", logits.size())
                    print("type of logits: ", type(logits))
                    print("type of logits: ", logits)

                    flaw_logits = torch.argmax(logits, dim=2)

                    print("Type of flaw_logits: ", type(flaw_logits))
                    print("shape of flaw_logits: ", flaw_logits.size())
                    print("Length of flaw_logits: ", len(flaw_logits))
                    print("flaw_logits: ", flaw_logits)

                logits = logits.detach().cpu().numpy()
                flaw_logits = flaw_logits.detach().cpu().numpy()
                flaw_ids = flaw_ids.to('cpu').numpy()
                label_id = label_id.to('cpu').numpy()
                chunks = chunks.to('cpu').numpy()
                token_ids = token_ids.to('cpu').numpy()

                flaw_logits = logit_converter(
                    flaw_logits, chunks)  # each word only has one '1'

                print("Type of flaw_logits logit_converter: ",
                      type(flaw_logits))
                #print("shape of flaw_logits logit_converter : ",flaw_logits.size())
                print("Length of flaw_logits logit_converter : ",
                      len(flaw_logits))
                print("flaw_logits logit_converter : ", flaw_logits)

                true_logits = []

                #print("length of flaw_ids: ",len(flaw_ids))

                # Build the gold 0/1 flaw vector per example from flaw_ids
                # (a padded list of flawed-token positions; 0 terminates).
                for i in range(len(flaw_ids)):
                    tmp = [0] * len(flaw_logits[i])

                    #print("tmp: ",tmp) # ne line
                    #print("printing i:",i)
                    #print("len of tmp: ",len(tmp))
                    #print("length of flaw_ids of i : ",len(flaw_ids[i]))
                    #print("flaw_ids[i]: ",flaw_ids[i])

                    for j in range(len(flaw_ids[0])):
                        #print("flaw_ids[i][j] : ",flaw_ids[i][j])
                        #print("tmp value: ", tmp)
                        #print("tmp len: ", len(tmp))
                        if flaw_ids[i][j] == 0: break
                        if flaw_ids[i][j] >= len(tmp): continue
                        tmp[flaw_ids[i][j]] = 1

                    true_logits.append(tmp)
                    #print('true_logits: ', true_logits)

                tmp_eval_accuracy = accuracy_2d(flaw_logits, true_logits)
                eval_accuracy += tmp_eval_accuracy

                # NOTE(review): names look swapped here (gold labels go into
                # `predictions`, model outputs into `truths`); f1_2d may
                # depend on this ordering — confirm before changing.
                predictions += true_logits  # Original
                truths += flaw_logits  # Original
                #predictions += flaw_logits # for trouble-shooting
                #truths += true_logits # for trouble-shooting
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

                with open(output_file, "a") as csv_file:
                    for i in range(len(label_id)):
                        #print("i in write output file:",i)
                        token = ' '.join(
                            [i2w[x] for x in token_ids[i] if x != 0])
                        flaw_logit = flaw_logits[i]
                        #print("flaw_logit in write output file: ",flaw_logit)
                        label = str(label_id[i])
                        logit = ','.join([
                            str(i) for i, x in enumerate(flaw_logit) if x == 1
                        ])  # for trouble-shooting
                        logit = '-1' if logit == '' else logit  # for trouble-shooting
                        writer = csv.writer(csv_file, delimiter='\t')
                        writer.writerow([token, label, logit])

                # Renaming and moving the file for Embedding Estimator

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_steps
            eval_f1_score, eval_recall_score, eval_precision_score = f1_2d(
                truths, predictions)
            loss = tr_loss / nb_tr_steps if args.do_train else None
            result = {
                'eval_loss': eval_loss,
                'eval_f1': eval_f1_score,
                'eval_recall': eval_recall_score,
                'eval_precision': eval_precision_score,
                'eval_acc': eval_accuracy
            }

            output_eval_file = os.path.join(
                args.output_dir,
                "disc_eval_results_" + attack_type + "_attacks.txt")
            with open(output_eval_file, "a") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

            #attack_type='drop'
            new_path = os.path.join(
                args.data_dir, "disc_eval_outputs_" + attack_type + ".tsv")
            current_path = os.path.join(
                args.data_dir, "epoch" + str(epoch) + "disc_eval_outputs_" +
                attack_type + ".tsv")
            os.rename(current_path, new_path)
# Example #6
# 0
def main():
    """CLI entry point: fine-tune and/or evaluate a BERT-based ngram
    "generator" that predicts replacement-token embeddings for flawed
    word positions.

    --do_train: builds ngram features from word embeddings, fine-tunes
    BertForNgramClassification, and saves one checkpoint per epoch
    ("epoch<N>" + WEIGHTS_NAME) plus the model config in --output_dir.
    --do_eval: reloads each per-epoch checkpoint, predicts embeddings for
    flawed positions, maps them back to vocabulary words via the embedding
    index, and writes corrected sentences to "epoch<N>gnrt_outputs.tsv"
    in --data_dir.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--word_embedding_file",
                        default='emb/crawl-300d-2M.vec',
                        type=str,
                        help="The input directory of word embeddings.")
    parser.add_argument("--index_path",
                        default='emb/p_index.bin',
                        type=str,
                        help="The input directory of word embedding index.")
    parser.add_argument("--word_embedding_info",
                        default='emb/vocab_info.txt',
                        type=str,
                        help="The input directory of word embedding info.")
    parser.add_argument("--data_file",
                        default='',
                        type=str,
                        help="The input directory of input data file.")

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--max_ngram_length",
                        default=16,
                        type=int,
                        help="The maximum total ngram sequence")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--embedding_size",
                        default=300,
                        type=int,
                        help="Total batch size for embeddings.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    # NOTE(review): the help text below looks copy-pasted from
    # --gradient_accumulation_steps; judging from usage further down, this
    # flag selects which epoch checkpoint(s) are evaluated — confirm.
    parser.add_argument(
        '--num_eval_epochs',
        type=int,
        default=0,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    # NOTE(review): help text copied from --fp16; usage below shows --single
    # restricts evaluation to the one epoch given by --num_eval_epochs.
    parser.add_argument(
        '--single',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Device selection: single-process (all visible GPUs via DataParallel)
    # vs. one-GPU-per-process distributed mode keyed off --local_rank.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The effective optimizer batch stays constant: smaller per-step batches,
    # gradients accumulated over gradient_accumulation_steps steps.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed every RNG in play for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # Word-embedding assets: full vectors are only needed for training;
    # eval reconstructs the vocab from the saved info file.
    logger.info("loading embeddings ... ")
    if args.do_train:
        emb_dict, emb_vec, vocab_list, emb_vocab_size = load_vectors(
            args.word_embedding_file)
        write_vocab_info(args.word_embedding_info, emb_vocab_size, vocab_list)
    if args.do_eval:
        emb_vocab_size, vocab_list = load_vocab_info(args.word_embedding_info)
        #emb_dict, emb_vec, vocab_list, emb_vocab_size = load_vectors(args.word_embedding_file)
        #write_vocab_info(args.word_embedding_info, emb_vocab_size, vocab_list)
    # Nearest-neighbour index over embeddings, built once then reloaded.
    # NOTE(review): the build branch needs emb_vec, which only exists when
    # --do_train was set — eval-only runs rely on the index file existing.
    logger.info("loading p index ...")
    if not os.path.exists(args.index_path):
        p = load_embeddings_and_save_index(range(emb_vocab_size), emb_vec,
                                           args.index_path)
    else:
        p = load_embedding_index(args.index_path,
                                 emb_vocab_size,
                                 num_dim=args.embedding_size)

    train_examples = None
    num_train_optimization_steps = None
    # Token <-> id maps shared between train and eval feature builders.
    # vocab_size starts at 1 — presumably id 0 is reserved; verify in the
    # convert_examples_to_features_* helpers.
    w2i, i2w, vocab_size = {}, {}, 1
    if args.do_train:

        train_examples = processor.get_train_examples(args.data_dir)
        # NOTE(review): this is a float (num_train_epochs is a float);
        # BertAdam's t_total presumably tolerates that — confirm.
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

        train_features, w2i, i2w, vocab_size = convert_examples_to_features_gnrt_train(\
            train_examples, label_list, args.max_seq_length, args.max_ngram_length, tokenizer, emb_dict)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Num token vocab = %d", vocab_size)
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_ngram_ids = torch.tensor([f.ngram_ids for f in train_features],
                                     dtype=torch.long)
        all_ngram_labels = torch.tensor(
            [f.ngram_labels for f in train_features], dtype=torch.long)
        all_ngram_masks = torch.tensor([f.ngram_masks for f in train_features],
                                       dtype=torch.long)
        all_ngram_embeddings = torch.tensor(
            [f.ngram_embeddings for f in train_features], dtype=torch.float)

        # Prepare model
        cache_dir = args.cache_dir if args.cache_dir else os.path.join(
            PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
                args.local_rank))
        model = BertForNgramClassification.from_pretrained(
            args.bert_model,
            cache_dir=cache_dir,
            num_labels=num_labels,
            embedding_size=args.embedding_size,
            max_seq_length=args.max_seq_length,
            max_ngram_length=args.max_ngram_length)
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Prepare optimizer: standard BERT recipe — no weight decay on
        # biases and LayerNorm parameters, 0.01 on everything else.
        param_optimizer = list(model.named_parameters())

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        global_step = 0
        nb_tr_steps = 0
        tr_loss = 0

        #if args.do_train:

        train_data = TensorDataset(all_ngram_ids, all_ngram_labels,
                                   all_ngram_masks, all_ngram_embeddings)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for ind in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0

            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                ngram_ids, ngram_labels, ngram_masks, ngram_embeddings = batch
                loss = model(ngram_ids, ngram_masks, ngram_embeddings)
                if n_gpu > 1:
                    # DataParallel returns one loss per GPU; average them.
                    loss = loss.mean()

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()

                nb_tr_steps += 1
                # Only update weights once the accumulated micro-batches
                # add up to a full effective batch.
                if (step + 1) % args.gradient_accumulation_steps == 0:

                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            loss = tr_loss / nb_tr_steps if args.do_train else None
            result = {
                'loss': loss,
            }

            # Append this epoch's mean loss to a running results log.
            output_eval_file = os.path.join(args.output_dir,
                                            "train_results.txt")
            with open(output_eval_file, "a") as writer:
                #logger.info("***** Training results *****")
                writer.write("epoch" + str(ind) + '\n')
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write('\n')

            # Checkpoint after every epoch (unwrap DataParallel/DDP first).
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(args.output_dir,
                                             "epoch" + str(ind) + WEIGHTS_NAME)
            torch.save(model_to_save.state_dict(), output_model_file)
            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
            with open(output_config_file, 'w') as f:
                f.write(model_to_save.config.to_json_string())

    # Load a trained model and config that you have fine-tuned
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):

        eval_examples = processor.get_gnrt_dev_examples(args.data_file)
        eval_features, w2i, i2w, vocab_size = convert_examples_to_features_gnrt_eval(
            eval_examples, label_list, args.max_seq_length,
            args.max_ngram_length, tokenizer, w2i, i2w, vocab_size)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Num token vocab = %d", vocab_size)
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_token_ids = torch.tensor([f.token_ids for f in eval_features],
                                     dtype=torch.long)
        # all_flaw_labels: indexes of wrong words predicted by disc
        all_flaw_labels = torch.tensor([f.flaw_labels for f in eval_features],
                                       dtype=torch.long)
        all_ngram_ids = torch.tensor([f.ngram_ids for f in eval_features],
                                     dtype=torch.long)
        all_ngram_mask = torch.tensor([f.ngram_mask for f in eval_features],
                                      dtype=torch.long)
        all_ngram_labels = torch.tensor(
            [f.ngram_labels for f in eval_features], dtype=torch.long)
        all_label_id = torch.tensor([f.label_id for f in eval_features],
                                    dtype=torch.long)

        eval_data = TensorDataset(all_token_ids, all_ngram_ids, all_ngram_mask,
                                  all_ngram_labels, all_label_id,
                                  all_flaw_labels)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # --single evaluates exactly one epoch checkpoint; otherwise all
        # checkpoints from 0 .. num_eval_epochs-1 are evaluated.
        if args.single:
            eval_range = trange(int(args.num_eval_epochs),
                                int(args.num_eval_epochs + 1),
                                desc="Epoch")
        else:
            eval_range = trange(int(args.num_eval_epochs), desc="Epoch")

        for epoch in eval_range:

            # Truncate the per-epoch output file and write its header;
            # rows are appended batch-by-batch below.
            output_file = os.path.join(
                args.data_dir, "epoch" + str(epoch) + "gnrt_outputs.tsv")
            with open(output_file, "w") as csv_file:
                writer = csv.writer(csv_file, delimiter='\t')
                writer.writerow(["sentence", "label"])

            # Rebuild the model from config and load this epoch's weights.
            output_model_file = os.path.join(
                args.output_dir, "epoch" + str(epoch) + WEIGHTS_NAME)
            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
            config = BertConfig(output_config_file)
            model = BertForNgramClassification(
                config,
                num_labels=num_labels,
                embedding_size=args.embedding_size,
                max_seq_length=args.max_seq_length,
                max_ngram_length=args.max_ngram_length)
            model.load_state_dict(torch.load(output_model_file))
            model.to(device)
            model.eval()

            for token_ids, ngram_ids, ngram_mask, ngram_labels, label_id, flaw_labels in tqdm(
                    eval_dataloader, desc="Evaluating"):

                ngram_ids = ngram_ids.to(device)
                ngram_mask = ngram_mask.to(device)

                with torch.no_grad():
                    logits = model(ngram_ids, ngram_mask)

                logits = logits.detach().cpu().numpy()
                flaw_labels = flaw_labels.to('cpu').numpy()
                label_id = label_id.to('cpu').numpy()
                token_ids = token_ids.to('cpu').numpy()
                masks = ngram_mask.to('cpu').numpy()

                # Re-open in append mode per batch and emit one corrected
                # sentence per example: predicted embeddings are mapped back
                # to words via the index, then spliced over flawed positions.
                with open(output_file, "a") as csv_file:

                    for i in range(len(label_id)):

                        correct_tokens = look_up_words(logits[i], masks[i],
                                                       vocab_list, p)
                        token_new = replace_token(token_ids[i], flaw_labels[i],
                                                  correct_tokens, i2w)
                        token_new = ' '.join(token_new)
                        label = str(label_id[i])
                        writer = csv.writer(csv_file, delimiter='\t')
                        writer.writerow([token_new, label])
Example #7
0
def train(train_batch_size,
          roberta_model,
          hidden_size=768,
          learning_rate=3e-5,
          warmup_proportion=0.1,
          seed=23):
    """Fine-tune `basemodel` (a RoBERTa-based QA head) on pre-pickled features.

    Loads features from preprocess/trainFeatures.pkl, trains for 4 fixed
    epochs with BertAdam (warmup + linear decay), and snapshots the model
    several times per epoch under model/<epoch>_<step>/.

    Args:
        train_batch_size: per-step batch size.
        roberta_model: model name/path forwarded to `basemodel`.
        hidden_size: hidden width passed to `basemodel`.
        learning_rate: peak Adam learning rate.
        warmup_proportion: fraction of total steps spent warming up the LR.
        seed: seed applied to random / numpy / torch RNGs.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    print(device)
    n_gpu = torch.cuda.device_count()
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
    print('loading train features.')
    with open('preprocess/trainFeatures.pkl', 'rb') as f:
        trainFeatures = pickle.load(f)
    print('train features have been loaded.')
    nums = len(trainFeatures)
    # The literal below is Chinese for "training-set size".
    print('训练集大小:', nums)
    # 4 epochs hard-coded here and in the training loop below.
    num_train_optimization_steps = 4 * int(
        len(trainFeatures) / train_batch_size)
    model = basemodel(roberta_model, hidden_size)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    param_optimizer = list(model.named_parameters())
    # Drop the unused pooler so its params receive no (zero) grads.
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    # Standard BERT recipe: no weight decay for biases and LayerNorm.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in trainFeatures],
                                 dtype=torch.long)
    all_input_masks = torch.tensor([f.input_mask for f in trainFeatures],
                                   dtype=torch.long)
    all_start_positions = torch.tensor(
        [f.start_position for f in trainFeatures], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in trainFeatures],
                                     dtype=torch.long)
    all_answer_choices = torch.tensor([f.ans_choice for f in trainFeatures],
                                      dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_masks,
                               all_start_positions, all_end_positions,
                               all_answer_choices)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=train_batch_size)
    model.train()
    print('training')
    for epoch in range(4):
        for step, batch in enumerate(
                tqdm(train_dataloader, desc='Iteration', disable=False)):
            # NOTE(review): batch is only moved to `device` when exactly one
            # GPU is present; with n_gpu > 1 DataParallel scatters CPU
            # tensors itself, but the CPU-only path (n_gpu == 0) also skips
            # the move — harmless since device is then 'cpu', but confirm.
            if n_gpu == 1:
                batch = tuple(t.to(device) for t in batch)
            input_ids, input_masks, start_positions, end_positions, answer_choices = batch
            loss = model(input_ids, input_masks, start_positions,
                         end_positions, answer_choices)
            if n_gpu > 1:
                # DataParallel returns one loss per GPU; average them.
                loss = loss.mean()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            #print(loss)
            #model_to_save = model.module if hasattr(model, 'module') else model
            # Snapshot roughly three times per epoch (and at step 0).
            # NOTE(review): int(nums / train_batch_size) // 3 is 0 when the
            # dataset has fewer than 3 batches -> ZeroDivisionError; and
            # os.mkdir raises if 'model' is missing or the snapshot dir
            # already exists (e.g. on a re-run) — confirm intended.
            if (step % (int(nums / train_batch_size) // 3) == 0):
                os.mkdir('model' + '/' + str(epoch) + '_' + str(step))
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(
                    'model' + '/' + str(epoch) + '_' + str(step),
                    'pytorch_model.bin')
                output_config_file = os.path.join(
                    'model' + '/' + str(epoch) + '_' + str(step),
                    'config.json')
                output_m_file = os.path.join(
                    'model' + '/' + str(epoch) + '_' + str(step), 'model.pt')
                # Save the full QA model, plus the RoBERTa backbone weights
                # and config separately for reuse with from_pretrained.
                torch.save(model_to_save.state_dict(), output_m_file)
                torch.save(model_to_save.roberta.state_dict(),
                           output_model_file)
                model_to_save.roberta.config.to_json_file(output_config_file)
Example #8
0
def train(train_iter, test_iter, config):
    """Train BertForQA for config.train_epoch epochs, keeping the best model.

    Builds the model according to config.pretrainning_model
    ('nezha' / 'albert' / RoBERTa fallback), optionally restores from
    config.restore_file, groups parameters into (pretrained vs. middle) x
    (decay vs. no-decay) optimizer groups with separate learning rates,
    trains with BertAdam (warmup_linear schedule), and after each epoch
    evaluates via set_test(), saving the full model whenever dev accuracy
    improves.

    NOTE(review): relies on module-level globals (`device`, `gpu_id`,
    `logger`, `load_checkpoint`, `set_test`, `list2ts2device`, config
    classes) defined elsewhere in this file — confirm they are in scope.
    """
    # Prepare model; reload weights from restore_file if specified
    if config.pretrainning_model == 'nezha':
        Bert_config = BertConfig.from_json_file(config.bert_config_file)
        model = BertForQA(config=Bert_config, params=config)
        nezha_utils.torch_init_model(model, config.bert_file)
    elif config.pretrainning_model == 'albert':
        Bert_config = AlbertConfig.from_pretrained(config.model_path)
        model = BertForQA.from_pretrained(config.model_path,
                                          config=Bert_config)
    else:
        Bert_config = RobertaConfig.from_pretrained(config.bert_config_file,
                                                    output_hidden_states=True)
        model = BertForQA.from_pretrained(
            config=Bert_config,
            params=config,
            pretrained_model_name_or_path=config.model_path)

    if config.restore_file is not None:
        logging.info("Restoring parameters from {}".format(
            config.restore_file))
        # Load checkpoint (model + optimizer state)
        model, optimizer = load_checkpoint(config.restore_file)
    model.to(device)
    """多卡训练"""
    # ("多卡训练" = multi-GPU training; the DataParallel wrap is disabled.)
    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    # optimizer
    # Prepare optimizer for fine-tuning: collect model parameters
    param_optimizer = list(model.named_parameters())
    # pretrain model param ("nezha" names its backbone 'bert')
    param_pre = [(n, p) for n, p in param_optimizer
                 if 'bert' in n or 'electra' in n]
    # middle model param: everything outside the backbone/crf, plus the
    # dynamic layer-weighting parameters
    param_middle = [
        (n, p) for n, p in param_optimizer
        if not any([s in n for s in ('bert', 'crf', 'electra',
                                     'albert')]) or 'dym_weight' in n
    ]
    # crf param
    # Weights excluded from weight decay
    no_decay = ['bias', 'LayerNorm', 'dym_weight', 'layer_norm']
    # Group the parameters; each group carries its own 'lr', so no global
    # lr is passed to BertAdam below.
    optimizer_grouped_parameters = [
        # pretrained backbone params, with decay
        {
            'params':
            [p for n, p in param_pre if not any(nd in n for nd in no_decay)],
            'weight_decay':
            config.decay_rate,
            'lr':
            config.embed_learning_rate
        },
        # pretrained backbone params, no decay
        {
            'params':
            [p for n, p in param_pre if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
            'lr': config.embed_learning_rate
        },
        # middle-model params, with decay
        {
            'params': [
                p for n, p in param_middle
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            config.decay_rate,
            'lr':
            config.learning_rate
        },
        # middle-model params, no decay
        {
            'params':
            [p for n, p in param_middle if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0,
            'lr':
            config.learning_rate
        },
    ]
    num_train_optimization_steps = train_iter.num_records // config.gradient_accumulation_steps * config.train_epoch
    optimizer = BertAdam(optimizer_grouped_parameters,
                         warmup=config.warmup_proportion,
                         schedule="warmup_linear",
                         t_total=num_train_optimization_steps)
    logger.info("***** Running training *****")
    # logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", config.batch_size)
    logger.info("  Num epochs = %d", config.train_epoch)
    logger.info("  Learning rate = %f", config.learning_rate)

    cum_step = 0
    best_acc = 0.0
    timestamp = str(int(time.time()))
    # Per-run output directory keyed by GPU id (or 'cpu_0') and timestamp.
    # NOTE(review): comparison with the string 'cpu' suggests the global
    # `device` is a str rather than torch.device — confirm.
    if device != 'cpu':
        out_dir = os.path.abspath(
            os.path.join(config.save_model, "runs_" + str(gpu_id), timestamp))
    if device == 'cpu':
        out_dir = os.path.abspath(
            os.path.join(config.save_model, "runs_" + str('cpu_0'), timestamp))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    print("Writing to {}\n".format(out_dir))
    num_parameters = sum(torch.numel(param) for param in model.parameters())
    print('total number of model parameters', num_parameters)

    for i in range(config.train_epoch):
        model.train()
        for input_ids_list, input_mask_list, segment_ids_list, start_list, end_list, uid_list, \
            answer_list, text_list, querylen_list, mapping_list, cls_list in tqdm(
            train_iter):
            # Convert python lists to tensors on the target device
            loss = model(list2ts2device(input_ids_list),
                         list2ts2device(input_mask_list),
                         list2ts2device(segment_ids_list),
                         list2ts2device(start_list), list2ts2device(end_list),
                         list2ts2device(cls_list))
            # if n_gpu > 1:
            #     loss = loss.mean()  # mean() to average on multi-gpu.
            # Gradient accumulation: scale loss per micro-batch
            if config.gradient_accumulation_steps > 1:
                loss = loss / config.gradient_accumulation_steps
            if cum_step % 10 == 0:
                format_str = 'step {}, loss {:.4f} lr {:.5f}'
                print(format_str.format(cum_step, loss, config.learning_rate))
            if config.flooding:
                # "Flooding": keep the training loss from dropping below a
                # floor so it converges toward config.flooding
                loss = (loss - config.flooding
                        ).abs() + config.flooding
            loss.backward()  # backprop to obtain gradients
            if (cum_step + 1) % config.gradient_accumulation_steps == 0:
                # performs updates using calculated gradients
                optimizer.step()
                model.zero_grad()
            cum_step += 1
        acc = set_test(model, test_iter, epoch=i)
        # BertAdam applies its own lr schedule per step
        print('dev set : step_{},ACC_{}'.format(cum_step, acc))
        if acc > best_acc:
            # Save a trained model
            best_acc = acc
            model_to_save = model.module if hasattr(
                model, 'module') else model  # Only save the model it-self
            output_model_file = os.path.join(
                os.path.join(out_dir,
                             'model_{:.4f}_{}.bin'.format(acc, str(cum_step))))
            torch.save(model_to_save, output_model_file)
Example #9
0
def train(args):
    """Pre-train ABSABert on unlabeled data for args.num_train_epochs epochs.

    Builds a fresh PregeneratedDataset for each epoch, optimizes with
    BertAdam (linear warmup over the whole schedule), logs the running mean
    loss every 100 steps, and saves the full state_dict to
    ``{args.output_dir}/epoch{e}/model.pt`` after every epoch.

    Args:
        args: namespace with bert_model, data_dir, train_batch_size,
            num_train_epochs, learning_rate, warmup_proportion, output_dir.
    """
    processor = data_utils.ABSAProcessor()
    label_list = processor.get_labels('absa')
    model = ABSABert.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model],
        num_labels=len(label_list))
    tokenizer = ABSATokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    model.cuda()

    # Epoch-0 dataset built up front so the total step count (t_total) for
    # the LR schedule can be derived before the optimizer is created.
    epoch_dataset = PregeneratedDataset(epoch=0,
                                        training_path=args.data_dir,
                                        tokenizer=tokenizer,
                                        num_data_epochs=1)
    unlabel_train_sampler = RandomSampler(epoch_dataset)
    unlabel_train_dataloader = DataLoader(epoch_dataset,
                                          sampler=unlabel_train_sampler,
                                          batch_size=args.train_batch_size)
    unlabel_iter = iter(unlabel_train_dataloader)
    train_steps = len(unlabel_train_dataloader)

    # Standard BERT recipe: skip frozen params and the unused pooler; no
    # weight decay on biases and LayerNorm parameters.
    param_optimizer = [(k, v) for k, v in model.named_parameters()
                       if v.requires_grad == True]
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = train_steps * args.num_train_epochs
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    model.train()
    total_loss = 0
    for e_ in range(args.num_train_epochs):
        if e_ > 0:
            # Later epochs draw a fresh pregenerated dataset (new masking).
            epoch_dataset = PregeneratedDataset(epoch=e_,
                                                training_path=args.data_dir,
                                                tokenizer=tokenizer,
                                                num_data_epochs=1)
            unlabel_train_sampler = RandomSampler(epoch_dataset)
            unlabel_train_dataloader = DataLoader(
                epoch_dataset,
                sampler=unlabel_train_sampler,
                batch_size=args.train_batch_size)
            unlabel_iter = iter(unlabel_train_dataloader)
            train_steps = len(unlabel_train_dataloader)
            logger.info('unlabel data number:{} steps:{}'.format(
                len(epoch_dataset), train_steps))

        for step in range(train_steps):
            # BUG FIX: `unlabel_iter.next()` is Python-2-only iterator
            # syntax; Python 3 iterators expose __next__, so use the
            # builtin next() instead.
            batch = next(unlabel_iter)
            batch = tuple(t.cuda() for t in batch)
            input_ids, tag_ids, head_tokens_index, rel_label_ids, input_mask, lm_label_ids, tag_label_ids, _ = batch
            loss = model(input_ids,
                         input_tags=tag_ids,
                         head_tokens_index=head_tokens_index,
                         dep_relation_label=rel_label_ids,
                         masked_tag_labels=tag_label_ids,
                         attention_mask=input_mask)

            loss.backward()
            total_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()

            global_step += 1
            if global_step % 100 == 0:
                # Running mean of the loss over all steps so far.
                logger.info('in step {} loss is: {}'.format(
                    global_step, total_loss / global_step))
            # >>>> perform validation at the end of each epoch .
        # Persist a full checkpoint for this epoch.
        os.makedirs(args.output_dir + '/epoch' + str(e_), exist_ok=True)
        torch.save(
            model.state_dict(),
            os.path.join(args.output_dir + '/epoch' + str(e_), "model.pt"))
Example #10
0
class MIL:
    """Multiple-instance-learning action classifier.

    Wires together the raw Flickr dataset, an ``ActionClassifier`` model,
    the ``Mixture_loss`` criterion, and the train / evaluate loops.  Which
    loaders are built depends on ``args.mode`` ('train', 'small_data',
    'dev', or 'test').
    """

    def __init__(self, args):
        self.args = args
        self.raw_data = Raw_dataset(args.flickerfile, args.label_list)
        self.transformer = get_transformer(network=args.network)
        self.evaluator = evaluator(self.raw_data.label_list)
        # Shuffle batches except in test mode, where order must be stable.
        shuffle = args.mode != 'test'

        self.model = ActionClassifier(self.raw_data.get_num_of_labels(),
                                      head=args.head,
                                      pos_dim=4,
                                      hidden_size=512,
                                      is_tsv_feat=False,
                                      in_channels=args.num_boxes)
        self.epoch = args.epochs
        self.save_epoch = args.save_epoch
        self.lr = args.learning_rate

        if args.mode == 'train' or args.mode == 'small_data':
            self.data_set = MIL_dataset(self.raw_data,
                                        self.transformer,
                                        img_path=args.img_root_dir,
                                        tsv_path=args.tsv_path,
                                        mode=args.mode,
                                        use_tsv=False,
                                        num_boxes=args.num_boxes)
            self.data_loader = DataLoader(self.data_set,
                                          batch_size=args.batch_size,
                                          shuffle=shuffle,
                                          num_workers=args.num_workers)

        if args.mode == 'dev':
            self.eval_set = MIL_dataset(self.raw_data,
                                        self.transformer,
                                        img_path=args.img_root_dir,
                                        tsv_path=args.tsv_path,
                                        mode=args.mode,
                                        use_tsv=False,
                                        num_boxes=args.num_boxes)
            self.eval_loader = DataLoader(self.eval_set,
                                          batch_size=args.batch_size,
                                          shuffle=shuffle,
                                          num_workers=args.num_workers)

        # Resume from a checkpoint (epoch counter included) when requested.
        self.st_epoch = 0
        if args.load is not None:
            fname = os.path.join(args.model_dir, args.load)
            self.st_epoch = self.load_model(fname)

        self.model.cuda()
        if args.multiGPU:
            self.model = nn.DataParallel(self.model)

        self.criterion = Mixture_loss(args.head)

        if args.mode == 'train':
            batch_per_epoch = len(self.data_loader)
            print('batch per epoch ={}'.format(batch_per_epoch))
            t_total = int(batch_per_epoch * args.epochs)
            print('total iterations== {} ; warmup start = {}'.format(
                t_total, t_total * args.wstep))
            # BertAdam handles warmup/decay internally via warmup + t_total.
            self.optimizer = BertAdam(
                list(self.model.parameters()),
                lr=args.learning_rate,
                warmup=args.wstep,
                t_total=t_total)  #changing warmup from 0.1 to 0.3

    def train(self):
        """Run the full training loop, saving checkpoints every
        ``save_epoch`` epochs and after the final epoch."""
        print('training started')
        self.model.train()
        for epoch in range(self.epoch):
            tr_loss = 0
            nb_tr_steps = 0
            em_loss_t = 0
            cls_loss_t = 0
            for imgs, subimgs, boxes, interaction_pattern, label_hot_vec in tqdm(
                    self.data_loader):
                self.optimizer.zero_grad()
                imgs = imgs.cuda()
                subimgs = subimgs.cuda()
                boxes = boxes.cuda()
                interaction_pattern = interaction_pattern.cuda()
                label_hot_vec = label_hot_vec.cuda()
                g_x, p_yi_x = self.model(imgs, subimgs, boxes,
                                         interaction_pattern, label_hot_vec)

                loss, em_loss, class_loss = self.criterion(
                    g_x, p_yi_x, label_hot_vec)

                if self.args.multiGPU:
                    # DataParallel returns one loss per device; average them.
                    loss = loss.mean()

                tr_loss += loss.item()
                em_loss_t += em_loss
                cls_loss_t += class_loss

                nb_tr_steps += 1
                loss.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(), 1.)
                self.optimizer.step()

            print(
                "Train loss@epoch {}: total:{} emloss:{}, cls_loss:{}".format(
                    self.st_epoch + epoch + 1, tr_loss / nb_tr_steps,
                    em_loss_t / nb_tr_steps, cls_loss_t / nb_tr_steps))
            if epoch == self.epoch - 1 or (epoch + 1) % self.save_epoch == 0:
                filename = 'pascal_voc' + str(self.st_epoch + epoch +
                                              1) + '.model'
                # BUG FIX: original referenced the undefined module-level
                # name `args`; use the args stored on the instance.
                filename = os.path.join(self.args.model_dir, filename)
                self.save_model(filename, self.st_epoch + epoch + 1)

    def evaluate(self, thresold=0.5):
        """Evaluate by sampling per-class Bernoulli predictions weighted by
        the gate g_x; labels whose summed score exceeds `thresold` are
        predicted positive.

        NOTE(review): torch.bernoulli makes this evaluation stochastic —
        results vary between runs unless the RNG is seeded.
        """
        print('evaluation started')
        self.model.eval()
        res = []
        gold = []
        for imgs, subimgs, boxes, interaction_pattern, label_hot_vec in tqdm(
                self.eval_loader):
            imgs = imgs.cuda()
            subimgs = subimgs.cuda()
            boxes = boxes.cuda()
            interaction_pattern = interaction_pattern.cuda()
            g_x, p_yi_x = self.model(imgs, subimgs, boxes, interaction_pattern)
            g_x = g_x.unsqueeze(dim=-1)
            class_prob = torch.bernoulli(p_yi_x) * g_x
            class_prob = torch.sum(class_prob, dim=1) - thresold
            class_prob = class_prob.cpu()
            # Gold / predicted label indices for each example in the batch.
            target = [
                torch.nonzero(t).squeeze(-1).numpy() for t in label_hot_vec
            ]
            pred = [(t > 0).nonzero().squeeze(-1).numpy() for t in class_prob]

            gold.extend(target)
            res.extend(pred)

        self.evaluator.evaluate(res, gold, self.args.dump)

    def evaluate2(self, thresold=0.5):
        """Deterministic variant of evaluate(): a label is positive only
        when every mixture head assigns it probability >= `thresold`."""
        print('evaluation started')
        self.model.eval()
        res = []
        gold = []
        for imgs, subimgs, boxes, interaction_pattern, label_hot_vec in tqdm(
                self.eval_loader):
            imgs = imgs.cuda()
            subimgs = subimgs.cuda()
            boxes = boxes.cuda()
            interaction_pattern = interaction_pattern.cuda()
            g_x, p_yi_x = self.model(imgs, subimgs, boxes, interaction_pattern)
            g_x = g_x.unsqueeze(dim=-1)
            target = [
                torch.nonzero(t).squeeze(-1).numpy() for t in label_hot_vec
            ]
            # All heads must agree (product over head dimension).
            pred = (p_yi_x >= thresold).float() * 1
            pred = torch.prod(pred, dim=1)

            pred1 = [(t > 0).nonzero().squeeze(-1).numpy() for t in pred]
            gold.extend(target)
            res.extend(pred1)

        self.evaluator.evaluate(res, gold, self.args.dump)

    def save_model(self, name, epoch):
        """Persist model weights plus the epoch counter and learning rate.

        Optimizer state is intentionally not saved (stored as None).
        """
        lr = self.lr
        check_point = {}
        check_point['model'] = self.model.state_dict()
        check_point['epoch'] = epoch
        check_point['lr'] = lr
        check_point['optimizer'] = None
        torch.save(check_point, name)
        print('model saved at {}'.format(name))

    def load_model(self, path):
        """Load model weights from `path`; return the stored epoch so the
        caller can resume the epoch counter."""
        print("Load model from %s" % path)
        check_point = torch.load(path)
        model_dict = check_point['model']

        self.model.load_state_dict(model_dict, strict=False)
        # BUG FIX: original did `self.optimizer = optimizer` with an
        # undefined name, which would raise NameError if this branch ever
        # fired.  save_model() always stores None here, so restoring
        # optimizer state is not supported; warn instead of crashing.
        if check_point.get('optimizer') is not None:
            print('checkpoint contains optimizer state, but restoring it '
                  'is not supported; ignoring')

        return check_point['epoch']
Example #11
0
def train(args):
    """Fine-tune BertForTokenClassification on ABSA token-labelling data.

    Trains for ``args.num_train_epochs``.  When ``args.do_valid`` is set,
    runs a full validation pass after every epoch, saves the checkpoint
    with the lowest validation loss to ``output_dir/model.pt``, and dumps
    the per-epoch losses to ``valid.json``; otherwise the final model is
    saved unconditionally.
    """
    processor = data_utils.ABSAProcessor()
    label_list = processor.get_labels(args.task_type)
    model = BertForTokenClassification.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model],
        num_labels=len(label_list))

    tokenizer = ABSATokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    train_examples = processor.get_train_examples(args.data_dir,
                                                  args.task_type)
    # ceil() so a final partial batch still counts as a step.
    num_train_steps = int(
        math.ceil(len(train_examples) /
                  args.train_batch_size)) * args.num_train_epochs

    train_features = data_utils.convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask,
                               all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.do_valid:
        valid_examples = processor.get_dev_examples(args.data_dir,
                                                    args.task_type)
        valid_features = data_utils.convert_examples_to_features(
            valid_examples, label_list, args.max_seq_length, tokenizer)
        valid_all_input_ids = torch.tensor(
            [f.input_ids for f in valid_features], dtype=torch.long)
        valid_all_segment_ids = torch.tensor(
            [f.segment_ids for f in valid_features], dtype=torch.long)
        valid_all_input_mask = torch.tensor(
            [f.input_mask for f in valid_features], dtype=torch.long)
        valid_all_label_ids = torch.tensor(
            [f.label_id for f in valid_features], dtype=torch.long)
        valid_data = TensorDataset(valid_all_input_ids, valid_all_segment_ids,
                                   valid_all_input_mask, valid_all_label_ids)

        logger.info("***** Running validations *****")
        logger.info("  Num orig examples = %d", len(valid_examples))
        logger.info("  Num split examples = %d", len(valid_features))
        logger.info("  Batch size = %d", args.train_batch_size)

        valid_sampler = SequentialSampler(valid_data)
        valid_dataloader = DataLoader(valid_data,
                                      sampler=valid_sampler,
                                      batch_size=args.train_batch_size)

        best_valid_loss = float('inf')
        valid_losses = []

    model.cuda()

    # Prepare optimizer: no weight decay for biases / LayerNorm params,
    # and skip the (unused) pooler weights entirely.
    param_optimizer = [(k, v) for k, v in model.named_parameters()
                       if v.requires_grad]
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    model.train()

    for e_ in range(args.num_train_epochs):
        # BUG FIX: original used Python-2-style `train_iter.next()`, which
        # raises AttributeError on a Python 3 iterator; iterate the
        # DataLoader directly instead.
        for batch in train_dataloader:
            batch = tuple(t.cuda() for t in batch)
            input_ids, segment_ids, input_mask, label_ids = batch
            loss = model(input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         labels=label_ids)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

        if args.do_valid:
            model.eval()
            with torch.no_grad():
                losses = []
                valid_size = 0
                for step, batch in enumerate(valid_dataloader):
                    batch = tuple(
                        t.cuda()
                        for t in batch)  # multi-gpu does scattering it-self
                    input_ids, segment_ids, input_mask, label_ids = batch
                    loss = model(input_ids,
                                 token_type_ids=segment_ids,
                                 attention_mask=input_mask,
                                 labels=label_ids)
                    # Weight by batch size so the average is per-example.
                    losses.append(loss.data.item() * input_ids.size(0))
                    valid_size += input_ids.size(0)
                valid_loss = sum(losses) / valid_size
                logger.info("validation loss: %f", valid_loss)
                valid_losses.append(valid_loss)

            if valid_loss < best_valid_loss:
                torch.save(model, os.path.join(args.output_dir, "model.pt"))
                best_valid_loss = valid_loss
            model.train()

    if args.do_valid:
        with open(os.path.join(args.output_dir, "valid.json"), "w") as fw:
            json.dump({"valid_losses": valid_losses}, fw)
    else:
        torch.save(model, os.path.join(args.output_dir, "model.pt"))
Example #12
0
def train(args):
    """Fine-tune a BertForABSA model on aspect sentiment classification.

    Trains for ``args.num_train_epochs``.  When ``args.do_valid`` is set,
    validates (and tests against the dev set) at the end of every epoch,
    tracking the best validation loss and dumping all losses to
    ``valid.json``; otherwise the final model is saved unconditionally.
    """
    processor = data_utils.AscProcessor()
    label_list = processor.get_labels()
    tokenizer = ABSATokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    train_examples = processor.get_train_examples(args.data_dir)
    # NOTE(review): other train() variants in this file use math.ceil here;
    # plain int() drops the final partial batch from the step count.
    num_train_steps = int(
        len(train_examples) / args.train_batch_size) * args.num_train_epochs

    train_features = data_utils.convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer, "asc")
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask,
                               all_label_ids)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    #>>>>> validation
    if args.do_valid:
        valid_examples = processor.get_dev_examples(args.data_dir)
        valid_features = data_utils.convert_examples_to_features(
            valid_examples, label_list, args.max_seq_length, tokenizer, "asc")
        valid_all_input_ids = torch.tensor(
            [f.input_ids for f in valid_features], dtype=torch.long)
        valid_all_segment_ids = torch.tensor(
            [f.segment_ids for f in valid_features], dtype=torch.long)
        valid_all_input_mask = torch.tensor(
            [f.input_mask for f in valid_features], dtype=torch.long)
        valid_all_label_ids = torch.tensor(
            [f.label_id for f in valid_features], dtype=torch.long)
        valid_data = TensorDataset(valid_all_input_ids, valid_all_segment_ids,
                                   valid_all_input_mask, valid_all_label_ids)

        logger.info("***** Running validations *****")
        logger.info("  Num orig examples = %d", len(valid_examples))
        logger.info("  Num split examples = %d", len(valid_features))
        logger.info("  Batch size = %d", args.train_batch_size)

        valid_sampler = SequentialSampler(valid_data)
        valid_dataloader = DataLoader(valid_data,
                                      sampler=valid_sampler,
                                      batch_size=args.train_batch_size)

        best_valid_loss = float('inf')
        valid_losses = []
    #<<<<< end of validation declaration

    model = BertForABSA.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model],
        num_labels=len(label_list))
    model.cuda()
    # Prepare optimizer: no weight decay for biases / LayerNorm params,
    # and skip the (unused) pooler weights entirely.
    param_optimizer = [(k, v) for k, v in model.named_parameters()
                       if v.requires_grad]
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    model.train()

    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.cuda() for t in batch)
            input_ids, segment_ids, input_mask, label_ids = batch
            optimizer.zero_grad()
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            loss.backward()

            # BUG FIX: the original manually rescaled param_group['lr'] with
            # warmup_linear() every step.  BertAdam already applies warmup
            # and linear decay internally when t_total is set, so the manual
            # override double-applied the warmup schedule; it is only needed
            # with optimizers that do no internal scheduling (e.g. fp16
            # FusedAdam).
            optimizer.step()
            global_step += 1
        print("training loss: ", loss.item(), epoch + 1)
        #>>>> perform validation at the end of each epoch.
        new_dirs = os.path.join(args.output_dir, str(epoch + 1))
        # BUG FIX: os.mkdir raised FileExistsError on re-runs.
        os.makedirs(new_dirs, exist_ok=True)
        if args.do_valid:
            model.eval()
            with torch.no_grad():
                losses = []
                valid_size = 0
                for step, batch in enumerate(valid_dataloader):
                    batch = tuple(
                        t.cuda()
                        for t in batch)  # multi-gpu does scattering it-self
                    input_ids, segment_ids, input_mask, label_ids = batch
                    loss = model(input_ids, segment_ids, input_mask, label_ids)
                    # Weight by batch size so the average is per-example.
                    losses.append(loss.data.item() * input_ids.size(0))
                    valid_size += input_ids.size(0)
                valid_loss = sum(losses) / valid_size
                logger.info("validation loss: %f, epoch: %d", valid_loss,
                            epoch + 1)
                valid_losses.append(valid_loss)
                torch.save(model, os.path.join(new_dirs, "model.pt"))
                test(args, new_dirs, dev_as_test=True)
                if epoch == args.num_train_epochs - 1:
                    torch.save(model, os.path.join(args.output_dir,
                                                   "model.pt"))
                    test(args, args.output_dir, dev_as_test=False)
                # The per-epoch checkpoint was only needed for test(); drop it.
                os.remove(os.path.join(new_dirs, "model.pt"))
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
            model.train()
    if args.do_valid:
        with open(os.path.join(args.output_dir, "valid.json"), "w") as fw:
            json.dump({"valid_losses": valid_losses}, fw)
    else:
        torch.save(model, os.path.join(args.output_dir, "model.pt"))
Example #13
0
def train(args):
    """Fine-tune ABSABert jointly on labelled ABSA data and an unlabelled
    in-domain corpus (adversarial-style domain classifier).

    Each supervised step is followed by ``dirt_n`` domain-classification
    steps drawn from the pregenerated domain dataset.  With ``flag`` set
    (always, currently) the BERT encoder, task classifier, and domain
    classifier each get their own BertAdam optimizer.  When
    ``args.do_valid`` is set, validates after every epoch and keeps the
    best checkpoint.
    """
    processor = data_utils.ABSAProcessor()
    label_list = processor.get_labels(args.task_type)

    tokenizer = ABSATokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    train_examples = processor.get_train_examples(args.data_dir,
                                                  args.task_type)
    # ceil() so a final partial batch still counts as a step.
    num_train_steps = int(
        math.ceil(len(train_examples) /
                  args.train_batch_size)) * args.num_train_epochs

    train_features = data_utils.convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)

    # Unlabelled in-domain data for the adversarial domain classifier.
    domain_dataset = PregeneratedDataset(epoch=0,
                                         training_path=args.domain_dataset,
                                         tokenizer=tokenizer,
                                         num_data_epochs=1)

    domain_train_sampler = RandomSampler(domain_dataset)
    domain_train_dataloader = DataLoader(domain_dataset,
                                         sampler=domain_train_sampler,
                                         batch_size=16)

    train_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask,
                               all_label_ids)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # >>>>> validation
    if args.do_valid:
        valid_examples = processor.get_dev_examples(args.data_dir,
                                                    args.task_type)
        valid_features = data_utils.convert_examples_to_features(
            valid_examples, label_list, args.max_seq_length, tokenizer)
        valid_all_input_ids = torch.tensor(
            [f.input_ids for f in valid_features], dtype=torch.long)
        valid_all_segment_ids = torch.tensor(
            [f.segment_ids for f in valid_features], dtype=torch.long)
        valid_all_input_mask = torch.tensor(
            [f.input_mask for f in valid_features], dtype=torch.long)
        valid_all_label_ids = torch.tensor(
            [f.label_id for f in valid_features], dtype=torch.long)
        valid_data = TensorDataset(valid_all_input_ids, valid_all_segment_ids,
                                   valid_all_input_mask, valid_all_label_ids)

        logger.info("***** Running validations *****")
        logger.info("  Num orig examples = %d", len(valid_examples))
        logger.info("  Num split examples = %d", len(valid_features))
        logger.info("  Batch size = %d", args.train_batch_size)

        valid_sampler = SequentialSampler(valid_data)
        valid_dataloader = DataLoader(valid_data,
                                      sampler=valid_sampler,
                                      batch_size=args.train_batch_size)

        best_valid_loss = float('inf')
        valid_losses = []

    # <<<<< end of validation declaration
    model = ABSABert.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model],
        num_labels=len(label_list))

    if args.features_model != 'none':
        # Warm-start from a fine-tuned encoder but drop its task head,
        # whose shape depends on the original label set.
        state_dict = torch.load(args.features_model)
        del state_dict['classifier.weight']
        del state_dict['classifier.bias']
        model.load_state_dict(state_dict, strict=False)
        logger.info('load fine-tuned model from : {}'.format(
            args.features_model))

    model.cuda()

    flag = True
    if flag:
        # Separate optimizers: shared BERT encoder (with decay groups),
        # supervised classifier head, and the domain classifier.
        shared_param_optimizer = [(k, v)
                                  for k, v in model.bert.named_parameters()
                                  if v.requires_grad]
        shared_param_optimizer = [
            n for n in shared_param_optimizer if 'pooler' not in n[0]
        ]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        shared_optimizer_grouped_parameters = [{
            'params': [
                p for n, p in shared_param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in shared_param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]

        t_total = num_train_steps
        supervised_param_optimizer = model.classifier.parameters()

        domain_classifier_param_optimizer = model.domain_cls.parameters()

        shared_optimizer = BertAdam(shared_optimizer_grouped_parameters,
                                    lr=args.learning_rate,
                                    warmup=args.warmup_proportion,
                                    t_total=t_total)

        supervised_optimizer = BertAdam(supervised_param_optimizer,
                                        lr=args.learning_rate,
                                        warmup=args.warmup_proportion,
                                        t_total=t_total)

        # t_total=-1: constant LR for the domain head (no decay schedule).
        domain_optimizer = BertAdam(domain_classifier_param_optimizer,
                                    lr=3e-5,
                                    warmup=args.warmup_proportion,
                                    t_total=-1)
    else:
        param_optimizer = [(k, v) for k, v in model.named_parameters()
                           if v.requires_grad]
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        t_total = num_train_steps
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    model.train()

    train_steps = len(train_dataloader)
    total_domain_loss = 0
    for e_ in range(args.num_train_epochs):
        train_iter = iter(train_dataloader)
        domain_iter = iter(domain_train_dataloader)
        for step in range(train_steps):
            # BUG FIX: `.next()` is Python-2 iterator syntax; use the
            # built-in next() on Python 3.
            batch = next(train_iter)
            batch = tuple(t.cuda() for t in batch)
            input_ids, segment_ids, input_mask, label_ids = batch
            loss, _ = model(input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            labels=label_ids)

            loss.backward()
            if flag:
                shared_optimizer.step()
                shared_optimizer.zero_grad()
                supervised_optimizer.step()
                supervised_optimizer.zero_grad()
            else:
                optimizer.step()
                optimizer.zero_grad()

            dirt_n = 1  # 1 or 2
            for _ in range(dirt_n):
                # BUG FIX: narrowed the bare `except:` (which swallowed
                # even KeyboardInterrupt) to StopIteration — the domain
                # loader is smaller than the supervised one, so restart it
                # when exhausted.
                try:
                    batch = next(domain_iter)
                except StopIteration:
                    domain_iter = iter(domain_train_dataloader)
                    batch = next(domain_iter)
                batch = tuple(t.cuda() for t in batch)
                input_ids, input_mask, domain_labels = batch[0], batch[
                    4], batch[-1]
                d_loss = model(input_ids,
                               attention_mask=input_mask,
                               domain_label=domain_labels)
                d_loss.backward()
                total_domain_loss += d_loss.item()

                domain_optimizer.step()
                domain_optimizer.zero_grad()
                shared_optimizer.zero_grad(
                )  # make sure to clear the gradients of encoder.

            if step % 50 == 0:
                logger.info('in step {} domain loss: {}'.format(
                    dirt_n * (e_ * train_steps + step + 1), total_domain_loss /
                    (dirt_n * (e_ * train_steps + step + 1))))

            global_step += 1
            # >>>> perform validation at the end of each epoch .

        if args.do_valid:
            model.eval()
            with torch.no_grad():
                losses = []
                valid_size = 0
                for step, batch in enumerate(valid_dataloader):
                    batch = tuple(
                        t.cuda()
                        for t in batch)  # multi-gpu does scattering it-self
                    input_ids, segment_ids, input_mask, label_ids = batch
                    loss, _ = model(input_ids,
                                    token_type_ids=segment_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids)
                    loss = torch.mean(loss)
                    # Weight by batch size so the average is per-example.
                    losses.append(loss.data.item() * input_ids.size(0))
                    valid_size += input_ids.size(0)
                valid_loss = sum(losses) / valid_size
                logger.info("validation loss: %f", valid_loss)
                valid_losses.append(valid_loss)

            if valid_loss < best_valid_loss:
                torch.save(model.state_dict(),
                           os.path.join(args.output_dir, "model.pt"))
                best_valid_loss = valid_loss
            model.train()

    if args.do_valid:
        with open(os.path.join(args.output_dir, "valid.json"), "w") as fw:
            json.dump({"valid_losses": valid_losses}, fw)
    else:
        torch.save(model.state_dict(), os.path.join(args.output_dir,
                                                    "model.pt"))
Example #14
0
def main():
    """Entry point: train / evaluate / predict a two-headed BERT sentence-pair classifier.

    Parses command-line options, builds the tokenizer, dataloaders and model,
    then runs whichever of --do_train / --do_eval / --do_predict is requested.
    Checkpoints, config and vocabulary are written to --output_dir.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The train file path")
    parser.add_argument("--eval_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The dev file path")
    parser.add_argument("--eval_train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The train  eval file path")
    parser.add_argument("--predict_file",
                        default=None,
                        type=str,
                        required=False,
                        help="The predict file path")
    # NOTE(review): top_n is declared as float but read like a count/threshold
    # by the data processor — confirm the intended type against DataProcessor.
    parser.add_argument("--top_n",
                        default=5,
                        type=float,
                        required=True,
                        help="higher than threshold is classify 1,")
    parser.add_argument("--bert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained BERT model. \n"
                        "This specifies the model architecture.")
    parser.add_argument("--bert_model",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained BERT model. \n"
                        "This specifies the model architecture.")
    parser.add_argument("--result_file",
                        default=None,
                        type=str,
                        required=False,
                        help="The result file that the BERT model was trained on.")
    parser.add_argument("--vocab_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")
    # Other parameters
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Whether to lower case the input text.")
    parser.add_argument("--max_seq_length",
                        default=180,
                        type=int,
                        help="maximum total input sequence length after WordPiece tokenization.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--num_labels", default=1, type=int, help="mapping classify nums")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=6.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--reduce_dim",
                        default=64,
                        type=int,
                        required=False,
                        help="from hidden size to this dimensions, reduce dim")
    parser.add_argument("--gpu0_size",
                        default=1,
                        type=int,
                        help="maximum total input sequence length after WordPiece tokenization.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumualte before")
    parser.add_argument('--optimize_on_cpu',
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and averages on CPU")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float,
                        default=128,
                        help='Loss scale, positive power of 2 can improve fp16 convergence.')

    args = parser.parse_args()

    data_processor = DataProcessor(args.num_labels)
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False    # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # Per-step batch size: the effective batch is restored by accumulation.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    print(f'args.train_batch_size = {args.train_batch_size}')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not any([args.do_train, args.do_predict, args.do_eval]):
        raise ValueError("At least one of `do_train` or `do_eval`  or `do_predict` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    bert_config.reduce_dim = args.reduce_dim

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(
            args.output_dir))

    if args.do_train:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    def prepare_data(args, task_name='train'):
        """Build a DataLoader for `task_name`; non-train tasks also return the
        example-id -> example map used to render evaluation reports."""
        if task_name == 'train':
            file_path = args.train_file
        elif task_name == 'eval':
            file_path = args.eval_file
        elif task_name == 'train_eval':
            file_path = args.eval_train_file

        # Only populated on the single-file branch; previously left unbound on
        # the directory branch, which raised NameError at the return below.
        example_map_ids = None
        if os.path.isdir(file_path):
            examples = data_processor.read_file_dir(file_path, top_n=args.top_n)
        else:
            examples, example_map_ids = data_processor.read_novel_examples(file_path,
                                                                           top_n=args.top_n,
                                                                           task_name=task_name)
        features = convert_examples_to_features(examples, args.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_example_ids = torch.tensor([f.example_id for f in features], dtype=torch.long)

        if task_name in ['train', 'eval', 'train_eval']:
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
            datas = TensorDataset(all_example_ids, all_input_ids, all_input_mask, all_segment_ids,
                                  all_label_ids)
        else:
            datas = TensorDataset(all_example_ids, all_input_ids, all_input_mask, all_segment_ids)

        if task_name == 'train':
            if args.local_rank == -1:
                data_sampler = RandomSampler(datas)
            else:
                data_sampler = DistributedSampler(datas)
            dataloader = DataLoader(datas,
                                    sampler=data_sampler,
                                    batch_size=args.train_batch_size,
                                    drop_last=True)
        else:
            dataloader = DataLoader(datas, batch_size=args.eval_batch_size, drop_last=True)
        return (dataloader, example_map_ids) if task_name != 'train' else dataloader

    def accuracy(example_ids, logits, labels, probs=None, positive=False):
        """Print and return a classification report for one prediction head.

        When `positive` is True, restrict the report to examples whose label
        is > 0 (the second head only scores positive pairs).
        """
        if positive:
            logits = logits[labels > 0]
            example_ids = example_ids[labels > 0]
            probs = probs[labels > 0]
            labels = labels[labels > 0]

        if isinstance(logits, torch.Tensor):
            logits = logits.tolist()
        if isinstance(example_ids, torch.Tensor):
            example_ids = example_ids.tolist()
        if isinstance(labels, torch.Tensor):
            labels = labels.tolist()

        assert len(logits) == len(example_ids) == len(labels)

        classify_name = ['part_same', 'full_same'] if positive else ['dif', 'same']
        text_a, text_b, novel_names, persons = [], [], [], []
        for i in example_ids:
            example = example_map_ids[i]
            text_a.append("||".join(example.text_a))
            text_b.append("||".join(example.text_b))
            novel_names.append(example.name)
            persons.append(example.person)
        write_data = pd.DataFrame({
            "text_a": text_a,
            "text_b": text_b,
            "labels": labels,
            "logits": logits,
            "novel_names": novel_names,
            "persons": persons
        })
        write_data['yes_or_no'] = write_data['labels'] == write_data['logits']
        if probs is not None:
            if isinstance(probs, torch.Tensor):
                probs = probs.tolist()
            write_data['logits'] = probs
        assert len(labels) == len(logits)
        try:
            result = classification_report(labels, logits, target_names=classify_name)
        except Exception:
            result = 'label is not equal to 3'
        print(f'\n{result}')
        return result

    def eval_model(model, eval_dataloader, device):
        """Run one pass over `eval_dataloader`, report per-head accuracy and
        return the mean evaluation loss."""
        model.eval()
        eval_loss = 0
        all_first_logits, all_second_logits = [], []
        all_example_ids = []
        all_labels = []
        all_first_probs, all_sencond_probs = [], []
        for step, batch in enumerate(tqdm(eval_dataloader, desc="evaluating")):
            example_ids, input_ids, input_mask, segment_ids, label_ids = batch
            if not args.do_train and not args.do_eval:
                label_ids = None  # pure-predict mode has no gold labels
            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, labels=label_ids)
                # argmax per head: logits[0] / logits[1] are the two heads
                first_prob, first_logits = torch.max(logits[0], dim=1)
                second_prob, second_logits = torch.max(logits[1], dim=1)
                all_labels.append(label_ids)

                all_first_probs.append(first_prob)
                all_sencond_probs.append(second_prob)

                all_first_logits.append(first_logits)
                all_second_logits.append(second_logits)

                all_example_ids.append(example_ids)

                eval_loss += tmp_eval_loss.mean().item()

        all_first_logits = torch.cat(all_first_logits, dim=0)
        all_second_logits = torch.cat(all_second_logits, dim=0)

        all_first_probs = torch.cat(all_first_probs, dim=0)
        all_sencond_probs = torch.cat(all_sencond_probs, dim=0)

        all_labels = torch.cat(all_labels, dim=0)

        # Labels come as two columns, one per head.
        all_first_labels, all_second_labels = [
            label.view(-1) for label in torch.chunk(all_labels, dim=1, chunks=2)
        ]

        all_example_ids = torch.cat(all_example_ids, dim=0)

        accuracy(all_example_ids,
                 all_first_logits,
                 labels=all_first_labels,
                 probs=all_first_probs,
                 positive=False)
        # FIX: first argument must be the example ids (was all_second_logits,
        # a copy-paste slip that made the report index examples by logit value).
        accuracy(all_example_ids,
                 all_second_logits,
                 labels=all_second_labels,
                 probs=all_sencond_probs,
                 positive=True)
        eval_loss /= (step + 1)
        return eval_loss

    train_dataloader = None
    num_train_steps = None
    if args.do_train:
        train_dataloader = prepare_data(args, task_name='train')
        num_train_steps = int(
            len(train_dataloader) / args.gradient_accumulation_steps * args.num_train_epochs)
    model = ThreeCategoriesClassifier2(bert_config, num_labels=data_processor.num_labels)
    # Copy matching weights from the pretrained checkpoint into a fresh model.
    new_state_dict = model.state_dict()
    init_state_dict = torch.load(os.path.join(args.bert_model, 'pytorch_model.bin'))
    for k, v in init_state_dict.items():
        if k in new_state_dict:
            print(f'k in = {k} v in shape = {v.shape}')
            new_state_dict[k] = v
    model.load_state_dict(new_state_dict)

    if args.fp16:
        model.half()
    if args.do_predict or args.do_eval:
        model_path = os.path.join(args.output_dir, WEIGHTS_NAME)
        new_state_dict = torch.load(model_path)
        # Strip a DataParallel 'module.' prefix if the checkpoint has one.
        new_state_dict = dict([
            (k[7:], v) if k.startswith('module') else (k, v) for k, v in new_state_dict.items()
        ])
        model.load_state_dict(new_state_dict)
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        if args.gpu0_size > 0:
            model = BalancedDataParallel(args.gpu0_size, model, dim=0).to(device)
        else:
            model = torch.nn.DataParallel(model)

    if args.fp16:
        # Keep an fp32 master copy of the parameters on CPU for the optimizer.
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_())
                           for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    # FIX: this must be a substring match. The old `n not in no_decay` compared
    # full parameter names (e.g. 'bert.encoder.layer.0.output.dense.bias')
    # for exact membership in the list, so NOTHING ever matched and every
    # parameter — including biases — received weight decay.
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    eval_dataloader, example_map_ids = prepare_data(args, task_name='eval')
    if args.do_train:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            torch.cuda.empty_cache()
            model_save_path = os.path.join(args.output_dir, f"{WEIGHTS_NAME}.{epoch}")
            tr_loss = 0
            train_batch_count = 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="training")):
                _, input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, segment_ids, input_mask, labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()
                if args.fp16 and args.loss_scale != 1.0:
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    model.zero_grad()
                train_batch_count += 1
            tr_loss /= train_batch_count
            eval_loss = eval_model(model, eval_dataloader, device)
            logger.info(
                f'训练loss: {tr_loss}, 开发集loss:{eval_loss} 训练轮数:{epoch + 1}/{int(args.num_train_epochs)}'
            )
            model_to_save = model.module if hasattr(model, 'module') else model
            # FIX: save the unwrapped model so checkpoints never carry the
            # DataParallel 'module.' prefix (the load path above assumes this).
            torch.save(model_to_save.state_dict(), model_save_path)
            if epoch == 0:
                model_to_save.config.to_json_file(output_config_file)
                tokenizer.save_vocabulary(args.output_dir)
    elif args.do_eval:
        eval_model(model, eval_dataloader, device)

    if args.do_predict:
        eval_model(model, eval_dataloader, device)
def train(train_iter, test_iter, config):
    """Fine-tune a token-classification model and checkpoint it each epoch.

    The backbone (NEZHA / ALBERT / RoBERTa) is selected via
    config.pretrainning_model. Supports multi-GPU DataParallel, gradient
    accumulation, and BertAdam with per-group learning rates / weight decay.
    A (step, loss) curve is pickled at the end for plotting.
    """
    # Prepare model: load the configured pretrained weights.
    if config.pretrainning_model == 'nezha':  # NEZHA model
        Bert_config = BertConfig.from_json_file(config.bert_config_file)
        model = BertForTokenClassification(config=Bert_config, params=config)
        nezha_utils.torch_init_model(model, config.bert_file)
    elif config.pretrainning_model == 'albert':
        Bert_config = AlbertConfig.from_pretrained(config.model_path)
        model = BertForTokenClassification.from_pretrained(config.model_path,
                                                           config=Bert_config)
    else:
        Bert_config = RobertaConfig.from_pretrained(config.bert_config_file,
                                                    output_hidden_states=True)
        model = BertForTokenClassification.from_pretrained(
            config=Bert_config,
            params=config,
            pretrained_model_name_or_path=config.model_path)

    Bert_config.output_hidden_states = True  # expose every layer's output

    model.to(device)
    # Multi-GPU training.
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer for fine-tuning: split parameters into the pretrained
    # encoder vs. the task head so each group gets its own learning rate.
    param_optimizer = list(model.named_parameters())
    # Pretrained-model parameters (NEZHA names its encoder 'bert').
    param_pre = [(n, p) for n, p in param_optimizer
                 if 'bert' in n or 'electra' in n]
    # Task-head ("middle") parameters.
    param_middle = [
        (n, p) for n, p in param_optimizer
        if not any([s in n for s in ('bert', 'crf', 'electra',
                                     'albert')]) or 'dym_weight' in n
    ]
    # Weights excluded from weight decay.
    no_decay = ['bias', 'LayerNorm', 'dym_weight', 'layer_norm']
    # Group the weights.
    optimizer_grouped_parameters = [
        # Pretrained params, with decay.
        {
            'params':
            [p for n, p in param_pre if not any(nd in n for nd in no_decay)],
            'weight_decay':
            config.decay_rate,
            'lr':
            config.embed_learning_rate
        },
        # Pretrained params, no decay.
        {
            'params':
            [p for n, p in param_pre if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
            'lr': config.embed_learning_rate
        },
        # Task-head params, with decay.
        {
            'params': [
                p for n, p in param_middle
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            config.decay_rate,
            'lr':
            config.learning_rate
        },
        # Task-head params, no decay.
        {
            'params':
            [p for n, p in param_middle if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0,
            'lr':
            config.learning_rate
        },
    ]
    num_train_optimization_steps = train_iter.num_records // config.gradient_accumulation_steps * config.train_epoch
    optimizer = BertAdam(optimizer_grouped_parameters,
                         warmup=config.warmup_proportion,
                         schedule="warmup_cosine",
                         t_total=num_train_optimization_steps)
    logger.info("***** Running training *****")
    logger.info("  Batch size = %d", config.batch_size)
    logger.info("  Num epochs = %d", config.train_epoch)
    logger.info("  Learning rate = %f", config.learning_rate)

    cum_step = 0
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(
        os.path.join(config.save_model, "runs_" + str(gpu_id), timestamp))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    print("Writing to {}\n".format(out_dir))

    draw_step_list = []
    draw_loss_list = []
    for i in range(config.train_epoch):
        model.train()
        for input_ids_list, input_mask_list, segment_ids_list, label_ids_list, tokens_list in tqdm(
                train_iter):
            # Convert lists to tensors on the training device.
            loss = model(input_ids=list2ts2device(input_ids_list),
                         token_type_ids=list2ts2device(segment_ids_list),
                         attention_mask=list2ts2device(input_mask_list),
                         labels=list2ts2device(label_ids_list))
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            # Gradient accumulation.
            if config.gradient_accumulation_steps > 1:
                loss = loss / config.gradient_accumulation_steps

            if cum_step % 10 == 0:
                # FIX: store a Python float, not the live tensor. Appending the
                # tensor kept every loss on the GPU for the whole run and made
                # the pickle below device-bound and unreadable without torch.
                loss_value = loss.item()
                draw_step_list.append(cum_step)
                draw_loss_list.append(loss_value)
                if cum_step % 100 == 0:
                    format_str = 'step {}, loss {:.4f} lr {:.5f}'
                    print(
                        format_str.format(cum_step, loss_value,
                                          config.learning_rate))

            loss.backward()  # backward pass to produce the gradients
            if (cum_step + 1) % config.gradient_accumulation_steps == 0:
                # performs updates using calculated gradients
                optimizer.step()
                model.zero_grad()
            cum_step += 1
        p, r, f1 = set_test(model, test_iter)

        print('dev set : step_{},precision_{}, recall_{}, F1_{}'.format(
            cum_step, p, r, f1))

        # Checkpoint after every epoch.
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(
            os.path.join(
                out_dir, 'model_{:.4f}_{:.4f}_{:.4f}_{}.bin'.format(
                    p, r, f1, str(cum_step))))
        # NOTE(review): saves the whole module object, not a state_dict --
        # kept for compatibility with existing checkpoint consumers.
        torch.save(model_to_save, output_model_file)

    with open(Config().processed_data + 'step_loss_data.pickle', 'wb') as mf:
        draw_dict = {'step': draw_step_list, 'loss': draw_loss_list}
        pickle.dump(draw_dict, mf)
# ---- Example #16 (extraction artifact; kept as a comment so the file parses) ----
def main(args):
    args.data_dir = os.path.join(args.data_dir, args.task_name)
    args.output_dir = os.path.join(args.output_dir, args.task_name)
    logger.info("args = %s", args)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
        "emo": EmoProcessor,
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
        "emo": "classification"
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)

        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

        # device = torch.device('cpu')

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        logger.info("Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        try:
            os.makedirs(args.output_dir)
        except:
            pass
            logger.info("catch a error")

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # tokenizer = BertTokenizer.from_pretrained(args.vocab_file, do_lower_case=args.do_lower_case)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
            args.local_rank))

    # use bert to aug train_examples
    ori_train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    test_examples = processor.get_test_examples(args.data_dir)

    num_train_optimization_steps = int(
        len(ori_train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs

    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    if args.use_saved == 1:
        bert_saved_dir = args.ckpt
        model = BertForNSPAug.from_pretrained(bert_saved_dir,
                                              cache_dir=args.ckpt_cache_dir,
                                              num_labels=num_labels,
                                              args=args)
    else:
        model = BertForNSPAug.from_pretrained(args.bert_model,
                                              cache_dir=cache_dir,
                                              num_labels=num_labels,
                                              args=args)
    model.cuda()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        global_step = 0
        best_val_acc = 0.0
        first_time = time.time()

        logger.info(
            "*********************************** Running training ***********************************"
        )
        logger.info("  Num original examples = %d", len(ori_train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        aug_ratio = 0.0
        # aug_ratio = 0.2
        aug_seed = np.random.randint(0, 1000)
        for epoch in range(int(args.num_train_epochs)):
            logger.info("epoch=%d,  aug_ratio = %f,  aug_seed=%d", epoch,
                        aug_ratio, aug_seed)
            train_examples = Aug_each_ckpt(ori_train_examples,
                                           label_list,
                                           model,
                                           tokenizer,
                                           args=args,
                                           num_show=args.num_show,
                                           output_mode=output_mode,
                                           seed=aug_seed,
                                           aug_ratio=aug_ratio,
                                           use_bert=False)
            if aug_ratio + args.aug_ratio_each < 1.0:
                aug_ratio += args.aug_ratio_each
            aug_seed += 1

            train_features = convert_examples_to_features(
                train_examples,
                label_list,
                args.max_seq_length,
                tokenizer,
                num_show=args.num_show,
                output_mode=output_mode,
                args=args)
            logger.info(
                "*********************************** Done convert features ***********************************"
            )
            all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in train_features], dtype=torch.long)
            if output_mode == "classification":
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.float)

            token_real_label = torch.tensor(
                [f.token_real_label for f in train_features], dtype=torch.long)
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_label_ids,
                                       token_real_label)
            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_dataloader = DataLoader(train_data,
                                          sampler=train_sampler,
                                          batch_size=args.train_batch_size)

            logger.info(
                "*********************************** begin training ***********************************"
            )
            tr_loss, tr_seq_loss, tr_aug_loss, train_seq_accuracy, train_aug_accuracy = 0, 0, 0, 0, 0
            nb_tr_examples, nb_tr_steps, nb_tr_tokens = 0, 0, 0
            preds = []
            all_labels = []
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.cuda() for t in batch)
                input_ids, input_mask, segment_ids, label_ids, token_real_label = batch
                seq_logits, aug_logits, aug_loss = model(
                    input_ids,
                    segment_ids,
                    input_mask,
                    labels=None,
                    token_real_label=token_real_label)
                if output_mode == "classification":
                    # if task_name == "emo":
                    #     loss_fct =
                    # else:
                    loss_fct = CrossEntropyLoss()
                    seq_loss = loss_fct(seq_logits.view(-1, num_labels),
                                        label_ids.view(-1))
                    # print("[classification]label_ids: {}, size: {}".format(label_ids.view(-1), label_ids.view(-1).size()))
                    # print("[classification]seq_logits size: {}".format(seq_logits.view(-1, num_labels).size()))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    seq_loss = loss_fct(seq_logits.view(-1),
                                        label_ids.view(-1))

                token_real_label = token_real_label.detach().cpu().numpy()

                w = args.aug_loss_weight
                loss = (1 - w) * seq_loss + w * aug_loss

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                total_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), 10000.0)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                batch_loss = seq_loss.mean().item()
                tr_seq_loss += seq_loss.mean().item()
                seq_logits = seq_logits.detach().cpu().numpy()
                label_ids = label_ids.detach().cpu().numpy()
                if len(preds) == 0:
                    preds.append(seq_logits)
                    all_labels.append(label_ids)
                else:
                    preds[0] = np.append(preds[0], seq_logits, axis=0)
                    all_labels[0] = np.append(all_labels[0], label_ids, axis=0)

                aug_logits = aug_logits.detach().cpu().numpy()
                tmp_train_aug_accuracy, tmp_tokens = accuracy(aug_logits,
                                                              token_real_label,
                                                              type="aug")
                train_aug_accuracy += tmp_train_aug_accuracy
                nb_tr_tokens += tmp_tokens
                tr_aug_loss += aug_loss.mean().item()

                if global_step % 20 == 0:
                    loss = tr_loss / nb_tr_steps
                    seq_loss = tr_seq_loss / nb_tr_steps
                    aug_loss = tr_aug_loss / nb_tr_steps
                    tmp_pred = preds[0]
                    tmp_labels = all_labels[0]
                    if output_mode == "classification":
                        tmp_pred = np.argmax(tmp_pred, axis=1)
                    elif output_mode == "regression":
                        tmp_pred = np.squeeze(tmp_pred)
                    res = accuracy(tmp_pred, tmp_labels, task_name=task_name)

                    if nb_tr_tokens != 0:
                        aug_avg = train_aug_accuracy / nb_tr_tokens
                    else:
                        aug_avg = 0.0
                    log_string = ""
                    log_string += "epoch={:<5d}".format(epoch)
                    log_string += " step={:<9d}".format(global_step)
                    log_string += " total_loss={:<9.7f}".format(loss)
                    log_string += " seq_loss={:<9.7f}".format(seq_loss)
                    log_string += " aug_loss={:<9.7f}".format(aug_loss)
                    log_string += " batch_loss={:<9.7f}".format(batch_loss)
                    log_string += " lr={:<9.7f}".format(optimizer.get_lr()[0])
                    log_string += " |g|={:<9.7f}".format(total_norm)
                    #log_string += " tr_seq_acc={:<9.7f}".format(seq_avg)
                    log_string += " tr_aug_acc={:<9.7f}".format(aug_avg)
                    log_string += " mins={:<9.2f}".format(
                        float(time.time() - first_time) / 60)
                    for key in sorted(res.keys()):
                        log_string += "  " + key + "= " + str(res[key])
                    logger.info(log_string)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            train_loss = tr_loss / nb_tr_steps

            logger.info(
                "*********************************** training epoch done ***********************************"
            )

            if args.do_eval and (args.local_rank == -1
                                 or torch.distributed.get_rank()
                                 == 0) and epoch % 1 == 0:
                tot_time = float(time.time() - first_time) / 60
                eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts=\
                 do_evaluate(args, processor, label_list, tokenizer, model, epoch, output_mode, num_labels, task_name, eval_examples, type="dev")

                eval_res["tot_time"] = tot_time
                if "acc" in eval_res:
                    tmp_acc = eval_res["acc"]
                elif "mcc" in eval_res:
                    tmp_acc = eval_res["mcc"]
                else:
                    tmp_acc = eval_res["corr"]

                result = {
                    'eval_total_loss': eval_loss,
                    'eval_seq_loss': eval_seq_loss,
                    'eval_aug_loss': eval_aug_loss,
                    'eval_aug_accuracy': eval_aug_accuracy,
                    'global_step': global_step,
                    'train_loss': train_loss,
                    'train_batch_size': args.train_batch_size,
                    'args': args
                }

                if tmp_acc >= best_val_acc:
                    best_val_acc = tmp_acc
                    dev_test = "dev"
                    result.update({'best_epoch': epoch})

                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model it-self
                    output_model_dir = os.path.join(args.output_dir,
                                                    "dev_" + str(tmp_acc))
                    if not os.path.exists(output_model_dir):
                        os.makedirs(output_model_dir)
                    output_model_file = os.path.join(output_model_dir,
                                                     WEIGHTS_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    output_config_file = os.path.join(output_model_dir,
                                                      CONFIG_NAME)
                    with open(output_config_file, 'w') as f:
                        f.write(model_to_save.config.to_json_string())

                result.update(eval_res)
                result.update(res_parts)

                # output_eval_file = os.path.join(args.output_dir,
                # 								dev_test + "_results_" + str(tmp_acc) + ".txt")
                # with open(output_eval_file, "w") as writer:
                logger.info(
                    "****************************** eval results ***********************************"
                )
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    # writer.write("%s = %s\n" % (key, str(result[key])))
            else:
                result = {
                    'eval_total_loss': eval_loss,
                    'eval_seq_loss': eval_seq_loss,
                    'eval_aug_loss': eval_aug_loss,
                    'eval_aug_accuracy': eval_aug_accuracy,
                    'global_step': global_step,
                    'train_loss': train_loss,
                    'train_batch_size': args.train_batch_size,
                    'args': args
                }

                result.update(eval_res)
                result.update(res_parts)
                logger.info(
                    "****************************** eval results ***********************************"
                )
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))

            # write test results
            if args.do_test:
                # res_file = os.path.join(args.output_dir,
                # 							"test_" + str(tmp_acc)+".tsv")

                # idx, preds = do_test(args, label_list, task_name, processor, tokenizer, output_mode, model)

                # dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
                # dataframe.to_csv(res_file, index=False, sep='\t')
                # logger.info("  Num test length = %d", idx)
                logger.info(
                    "*********************************** Running test ***********************************"
                )
                logger.info("  Num examples = %d", len(test_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                test_loss, test_seq_loss, test_aug_loss, test_res, test_aug_accuracy, res_parts=\
                 do_evaluate(args, processor, label_list, tokenizer, model, epoch, output_mode, num_labels, task_name, test_examples, type="test")
                result = {
                    'test_total_loss': test_loss,
                    'test_seq_loss': test_seq_loss,
                    'test_aug_loss': test_aug_loss,
                    'test_aug_accuracy': test_aug_accuracy,
                    'global_step': global_step,
                    'args': args
                }
                result.update(test_res)
                result.update(res_parts)

                logger.info(
                    "****************************** test results ***********************************"
                )
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))

                logger.info(
                    "*********************************** test done ***********************************"
                )
# Example #17
# 0
class Trainer:
    def is_main_process(self):
        return self.team_rank == 0

    def parse_arguments(self):
        parser = argparse.ArgumentParser()

        # Required parameters
        parser.add_argument("--input_file",
                            default=None,
                            type=str,
                            required=True,
                            help="The input data file. Should be zip file "
                            "containing .hdf5 files for the task.")

        parser.add_argument("--config_file",
                            default=None,
                            type=str,
                            required=True,
                            help="The BERT model config")

        parser.add_argument("--bert_model",
                            default="bert-large-uncased",
                            type=str,
                            help="Bert pre-trained model selected in the "
                            "list: bert-base-uncased, bert-large-uncased, "
                            "bert-base-cased, bert-base-multilingual, "
                            "bert-base-chinese.")

        parser.add_argument("--output_dir",
                            default=None,
                            type=str,
                            required=True,
                            help="The output directory where the model "
                            "checkpoints will be written.")

        # Other parameters
        parser.add_argument("--max_seq_length",
                            default=512,
                            type=int,
                            help="The maximum total input sequence length "
                            "after WordPiece tokenization. \n"
                            "Sequences longer than this will be truncated, "
                            "and sequences shorter \n"
                            "than this will be padded.")
        parser.add_argument("--max_predictions_per_seq",
                            default=80,
                            type=int,
                            help="The maximum total of masked tokens in input "
                            "sequence")
        parser.add_argument("--train_batch_size",
                            default=32,
                            type=int,
                            help="Total batch size for training.")
        parser.add_argument("--learning_rate",
                            default=5e-5,
                            type=float,
                            help="The initial learning rate for Adam.")
        parser.add_argument("--max_steps",
                            default=1000,
                            type=float,
                            help="Total number of training steps to perform.")
        parser.add_argument("--warmup_proportion",
                            default=0.01,
                            type=float,
                            help="Proportion of training to perform linear "
                            "learning rate warmup for. "
                            "E.g., 0.1 = 10%% of training.")
        parser.add_argument("--local_rank",
                            type=int,
                            default=-1,
                            help="local_rank for distributed training on gpus")
        parser.add_argument('--seed',
                            type=int,
                            default=42,
                            help="random seed for initialization")
        parser.add_argument('--log_freq',
                            type=float,
                            default=50.0,
                            help='frequency of logging loss.')
        parser.add_argument('--checkpoint_activations',
                            default=False,
                            action='store_true',
                            help="Whether to use gradient checkpointing")
        parser.add_argument("--resume_from_checkpoint",
                            default=False,
                            action='store_true',
                            help="Whether to resume training from checkpoint.")
        parser.add_argument('--resume_step',
                            type=int,
                            default=-1,
                            help="Step to resume training from.")
        parser.add_argument('--num_steps_per_checkpoint',
                            type=int,
                            default=100,
                            help="Number of update steps until a model "
                            "checkpoint is saved to disk.")
        parser.add_argument('--phase2',
                            default=False,
                            action='store_true',
                            help="Whether to train with seq len 512")
        parser.add_argument('--phase1_end_step',
                            type=int,
                            default=7038,
                            help="Number of training steps in Phase1 - "
                            "seq len 128")
        parser.add_argument('--online_distillation',
                            type=str,
                            default="none",
                            choices=["none", "original", "overlap", "logit"],
                            help="Settings for online distillation")
        parser.add_argument('--burnin_steps', type=int, default=0)
        parser.add_argument('--distillation_weight', type=float, default=1)
        parser.add_argument('--distillation_loss',
                            type=str,
                            default="kl_divergence",
                            choices=["cross_entropy", "kl_divergence"])
        parser.add_argument('--distillation_steps', type=int, default=50)
        parser.add_argument('--optimizer',
                            type=str,
                            default="lamb",
                            choices=["lamb", "adam"])
        self.args = parser.parse_args()

    def setup_training(self):
        """Bind this process to its GPU and build the distributed topology.

        Side effects: sets self.device, self.rank, self.size, self.team,
        self.team_masters, self.team_master, self.local_group (and, with
        distillation, self.local_groups / self.comm_model_group_ranks /
        self.is_team_master / possibly self.equalize_data_group),
        self.team_rank, self.team_size; mutates self.args (seed,
        train_batch_size, data_seed) and creates the output directory.
        """
        assert (torch.cuda.is_available())

        # One process per GPU, selected by --local_rank.
        torch.cuda.set_device(self.args.local_rank)
        self.device = torch.device("cuda", self.args.local_rank)
        # Initializes the distributed backend which will take care of
        # sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

        self.rank = torch.distributed.get_rank()
        self.size = torch.distributed.get_world_size()
        if self.args.online_distillation == "none":
            # No distillation: one team containing every rank.
            self.team = 0
            self.team_masters = [0]
            self.team_master = 0
            self.local_group = torch.distributed.new_group(
                ranks=list(range(0, self.size)))
            self.team_rank = torch.distributed.get_rank()
            self.team_size = torch.distributed.get_world_size()
        else:
            assert self.size % 2 == 0, \
                'with distillation, world size must be a multiple of 2'
            # Split the world into two equal teams: team 0 holds the lower
            # half of the ranks, team 1 the upper half.
            self.team = self.rank // (self.size // 2)
            self.team_masters = [0, (self.size // 2)]
            self.team_master = self.team_masters[self.team]
            self.is_team_master = (self.rank % (self.size // 2) == 0)
            # new_group() is a collective call: every rank must create both
            # groups, in the same order, even the group it does not join.
            local_group0 = torch.distributed.new_group(
                ranks=list(range(0, self.size // 2)))
            local_group1 = torch.distributed.new_group(
                ranks=list(range(self.size // 2, self.size)))
            self.local_groups = [local_group0, local_group1]
            self.local_group = self.local_groups[self.team]

            self.team_rank = self.rank % (self.size // 2)
            self.team_size = self.size // 2

            # Rank lists used by comm_model(): one team's master plus all
            # ranks of the other team.
            comm_model_group_rank0 = \
                [0] + list(range(self.team_size, self.team_size * 2))
            comm_model_group_rank1 = \
                [self.team_size] + list(range(0, self.team_size))
            self.comm_model_group_ranks = [
                comm_model_group_rank0, comm_model_group_rank1
            ]

            if self.args.online_distillation == "logit":
                # Pair rank i of team 0 with rank i of team 1 so their
                # input batches can be equalized.
                for i in range(0, self.size // 2):
                    ranks = [i, i + self.size // 2]
                    grp = torch.distributed.new_group(ranks=ranks)
                    if self.rank in ranks:
                        self.equalize_data_group = grp
                # use different seeds in different teams
                self.args.data_seed = 12345
                self.args.seed += self.team * 12345
            else:
                # use different seeds in different teams
                self.args.seed += self.team * 12345

        # Per-rank batch size: the configured batch is divided across the team.
        self.args.train_batch_size //= self.team_size

        if not self.args.resume_from_checkpoint:
            chio.makedirs(self.args.output_dir, exist_ok=True)

    def prepare_model_and_optimizer(self):
        """Build the two BERT models and the (LAMB or Adam) optimizer.

        Side effects: sets self.config, self.model, self.another_model and
        self.optimizer. The learning rate and total-step budget depend on
        whether this is phase 2 of pre-training.
        """
        self.config = BertConfig.from_json_file(self.args.config_file)

        # Pad the vocabulary so its size is divisible by 8
        # (tensor-core friendly embedding shapes).
        if self.config.vocab_size % 8 != 0:
            self.config.vocab_size += 8 - (self.config.vocab_size % 8)
        self.model = BertForPreTraining(self.config)
        # Second model holds the peer team's weights during distillation.
        self.another_model = BertForPreTraining(self.config)

        self.model.to(self.device)
        self.another_model.to(self.device)

        # One optimizer group per parameter tensor; bias/normalization
        # parameters are exempt from weight decay. (The original code built
        # a parallel, never-read `names` list and used two complementary
        # `if` tests per parameter — removed/collapsed to a single branch.)
        no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']
        optimizer_grouped_parameters = []
        for name, param in self.model.named_parameters():
            decay = 0.0 if any(nd in name for nd in no_decay) else 0.01
            optimizer_grouped_parameters.append({
                'params': [param],
                'weight_decay': decay,
                'name': name
            })

        if self.args.phase2:
            max_steps = self.args.max_steps
            # Continue the linear decay started in phase 1: scale the base
            # LR by the fraction of the (10x) schedule already consumed.
            r = self.args.phase1_end_step / (max_steps * 10)
            lr = self.args.learning_rate * (1 - r)
        else:
            max_steps = int(self.args.max_steps / 9 * 10)
            lr = self.args.learning_rate
        # Phase 2 resumes mid-schedule, so warmup is disabled there (-1).
        warmup = self.args.warmup_proportion if not self.args.phase2 else -1
        if self.args.optimizer == "lamb":
            self.optimizer = BertLAMB(optimizer_grouped_parameters,
                                      lr=lr,
                                      warmup=warmup,
                                      t_total=max_steps)
        elif self.args.optimizer == "adam":
            self.optimizer = BertAdam(optimizer_grouped_parameters,
                                      lr=lr,
                                      warmup=warmup,
                                      t_total=max_steps)

    def prepare_snapshot(self):
        """Create the checkpoint helper and sync initial weights within the team."""
        self.snapshot = Snapshot(self.args, self.model, self.another_model,
                                 self.optimizer, self.team)
        # Broadcast the team master's parameters so every member of the team
        # starts training from identical weights.
        tensors = [p.data for p in self.model.parameters()]
        flat_dist_call(tensors, torch.distributed.broadcast,
                       (self.team_master, self.local_group))

    def forward(self, model, batch, calc_loss=True):
        input_ids, segment_ids, input_mask, \
            masked_lm_labels, next_sentence_labels = batch
        if calc_loss:
            return model(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                attention_mask=input_mask,
                masked_lm_labels=masked_lm_labels,
                next_sentence_label=next_sentence_labels,
                checkpoint_activations=self.args.checkpoint_activations)
        else:
            return model(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                attention_mask=input_mask,
                masked_lm_labels=None,
                next_sentence_label=None,
                checkpoint_activations=self.args.checkpoint_activations)

    def backward(self, loss):
        loss.backward()

    def comm_model(self):
        """Exchange model weights between the two distillation teams.

        For each team in turn, that team's parameters are broadcast from its
        first rank: ranks belonging to the broadcasting team send/refresh
        self.model, while ranks of the other team receive the peer weights
        into self.another_model.
        """
        for i in range(2):
            root = self.comm_model_group_ranks[i][0]
            # range supports O(1) membership tests — no need to materialize
            # a set on every iteration as the original did.
            members = range(root, root + self.team_size)
            if self.rank in members:
                params = [param.data for param in self.model.parameters()]
            else:
                params = [param.data for param in self.another_model.parameters()]
            flat_dist_call(params, torch.distributed.broadcast,
                           (i * self.team_size, ))

    def all_reduce(self, overflow_buf, accum=1):
        """Average gradients across the local team with one flat all-reduce.

        All parameter gradients are scaled into views of a single fp32
        buffer, summed across the team's ranks, and scaled back into the
        per-parameter .grad tensors. Pre-dividing by (team_size * accum)
        before the sum turns the all-reduce into a mean.

        Args:
            overflow_buf: int GPU tensor used by amp_C.multi_tensor_scale to
                flag inf/nan values; zeroed before each scaling pass.
            accum: number of gradient-accumulation micro-steps folded into
                the current gradients.
        """
        # Loss scale fixed at 1.0, so scale/unscale contribute only the
        # pre-division and its inverse.
        scaler = amp.scaler.LossScaler(1.0)

        # 1. allocate an uninitialized buffer for flattened gradient
        master_grads = [
            p.grad for p in amp.master_params(self.optimizer)
            if p.grad is not None
        ]
        flat_grad_size = sum(p.numel() for p in master_grads)
        allreduce_dtype = torch.float32
        flat_raw = torch.empty(flat_grad_size,
                               device='cuda',
                               dtype=allreduce_dtype)
        # 2. combine unflattening and predivision of unscaled 'raw' gradient
        allreduced_views = apex_C.unflatten(flat_raw, master_grads)
        overflow_buf.zero_()
        amp_C.multi_tensor_scale(
            65536, overflow_buf, [master_grads, allreduced_views],
            scaler.loss_scale() / (self.team_size * accum))
        # 3. sum gradient across ranks. Because of the predivision,
        #    this averages the gradient
        torch.distributed.all_reduce(flat_raw, group=self.local_group)
        # 4. combine unscaling and unflattening of allreduced gradient
        overflow_buf.zero_()
        amp_C.multi_tensor_scale(65536, overflow_buf,
                                 [allreduced_views, master_grads],
                                 1. / scaler.loss_scale())

    def take_optimizer_step(self, global_step):
        # 1. call optimizer step function
        self.optimizer.step()
        global_step += 1
        for param in self.model.parameters():
            param.grad = None

        return global_step

    def init_dataloader(self, epoch, pool, rng=None):
        """Kick off async loading of this rank's first shard for the epoch.

        Args:
            epoch: zero-based epoch index.
            pool: executor whose submit() schedules the dataset build.
            rng: optional random source; defaults to the random module.

        Returns:
            (dataset_future, f_start_id, files, data_file).
        """
        rng = rng or random
        fresh_start = (not self.args.resume_from_checkpoint or epoch > 0
                       or self.args.phase2)
        if fresh_start:
            with chio.open_as_container(self.args.input_file) as container:
                files = [name for name in container.list()
                         if "training" in name]
            # Sort before shuffling so a seeded rng gives the same order
            # everywhere.
            files.sort()
            num_files = len(files)
            rng.shuffle(files)
            f_start_id = 0
        else:
            # Resume exactly where the snapshot left off; the flag is
            # one-shot so later epochs start fresh.
            f_start_id = self.snapshot.f_id
            files = self.snapshot.files
            self.args.resume_from_checkpoint = False
            num_files = len(files)

        base = f_start_id * self.team_size + self.team_rank
        if torch.distributed.is_initialized() and \
                self.team_size > num_files:
            remainder = self.team_size % num_files
            data_file = files[(base + remainder * f_start_id) % num_files]
        else:
            data_file = files[base % num_files]

        future = pool.submit(create_pretraining_dataset, self.args.input_file,
                             data_file, self.args.max_predictions_per_seq,
                             self.args)
        return future, f_start_id, files, data_file

    def update_dataloader(self, pool, f_id, files):
        """Asynchronously load the shard assigned to this rank for index *f_id*.

        Returns:
            (dataset_future, data_file).
        """
        num_files = len(files)
        offset = f_id * self.team_size + self.team_rank
        if self.team_size > num_files:
            # More ranks than shards: add a remainder-based skew so shard
            # reuse rotates across file indices.
            remainder = self.team_size % num_files
            data_file = files[(offset + remainder * f_id) % num_files]
        else:
            data_file = files[offset % num_files]

        dataset_future = pool.submit(create_pretraining_dataset,
                                     self.args.input_file, data_file,
                                     self.args.max_predictions_per_seq,
                                     self.args)
        return dataset_future, data_file

    def loss(self, prediction_scores, seq_relationship_score, batch):
        _, _, _, masked_lm_labels, next_sentence_labels = batch
        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-1)
        masked_lm_loss = loss_fct(
            prediction_scores.view(-1, self.config.vocab_size),
            masked_lm_labels.view(-1))
        next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                      next_sentence_labels.view(-1))
        return masked_lm_loss + next_sentence_loss

    def compute_distillation_loss(self, output, another_output, target=None):
        c = output.shape[-1]
        output = output.view(-1, c)
        another_output = another_output.view(-1, c)
        with torch.no_grad():
            if target is None:
                mask = torch.ones(len(output),
                                  1,
                                  device=output.device,
                                  dtype=output.dtype)
            else:
                mask = (target != -1).long().view(-1, 1)
        if self.args.distillation_loss == 'cross_entropy':
            other_distr = torch.softmax(another_output, dim=1)
            return -torch.sum(
                mask *
                (torch.log_softmax(output, dim=1) * other_distr)) / sum(mask)
        elif self.args.distillation_loss == 'kl_divergence':
            return torch.sum(
                mask *
                (torch.softmax(output, dim=1) *
                 (torch.log_softmax(output, dim=1) -
                  torch.log_softmax(another_output, dim=1)))) / sum(mask)
        else:
            raise ValueError('unknown distillation loss: {}'.format(
                self.args.distillation_loss))

    def train_simple(self):
        """Baseline pre-training loop (no online distillation).

        Streams sharded training files through a single-thread prefetch
        pool, optionally accumulates gradients (phase 2 only), logs every
        ``args.log_freq`` optimizer steps and checkpoints every
        ``args.num_steps_per_checkpoint``.  Resumes the step counter from
        ``self.snapshot`` and returns ``self.args`` once ``global_step``
        reaches ``args.max_steps``.
        """
        global_step = self.snapshot.global_step or 0
        if self.args.phase2:
            # Phase 2: train with micro-batches of 8 and recover the
            # requested batch size via gradient accumulation
            # (presumably a memory limit for long sequences -- TODO confirm).
            self.args.accum = self.args.train_batch_size // 8
            self.args.train_batch_size = 8
        else:
            self.args.accum = 1

        if self.is_main_process():
            print("SEED {}".format(self.args.seed))
            logger.info("***** Running training *****")
            # logger.info("  Num examples = %d", len(train_data))
            logger.info("  Batch size = %d", self.args.train_batch_size)
            logger.info("  Accum = %d", self.args.accum)
            print("  LR = ", self.args.learning_rate)
            print("Training. . .")

        self.model.train()
        average_loss = 0.0  # averaged loss every self.args.log_freq steps
        epoch = 0

        # Note: We loop infinitely over epochs, termination is handled via
        #       iteration count
        begin = None  # wall-clock start, set on the very first batch
        with ThreadPoolExecutor(1) as pool:
            while True:
                dataset_future, f_start_id, files, data_file = \
                    self.init_dataloader(epoch, pool)
                previous_file = data_file
                train_dataloader, _ = dataset_future.result(timeout=None)

                # NOTE(review): looks like an fp16 overflow flag consumed by
                # self.all_reduce (apex-style fused all-reduce) -- confirm.
                overflow_buf = torch.cuda.IntTensor([0])

                # Train on the shard loaded above while shard f_id is
                # prefetched in the background.
                for f_id in range(f_start_id + 1, len(files)):
                    logger.info("file no %s file %s" % (f_id, previous_file))
                    dataset_future, data_file = \
                        self.update_dataloader(pool, f_id, files)
                    previous_file = data_file

                    it = 0  # micro-batches seen since last optimizer step
                    for batch in train_dataloader:
                        if begin is None:
                            begin = time.time()
                        it += 1
                        batch = [t.to(self.device) for t in batch]
                        loss = self.forward(self.model, batch)
                        self.backward(loss)
                        average_loss += loss.item()

                        # Step the optimizer only every `accum` micro-batches.
                        if it % self.args.accum == 0:
                            self.all_reduce(overflow_buf, self.args.accum)
                            global_step = self.take_optimizer_step(global_step)
                            it = 0

                            if global_step % self.args.log_freq == 0:
                                # Loss was summed per micro-batch, so divide
                                # by steps * accumulation factor.
                                divisor = self.args.log_freq * self.args.accum
                                if self.is_main_process():
                                    print(
                                        "Team: {} Step:{} Average Loss = {} ".
                                        format(self.team, global_step,
                                               average_loss / divisor))
                                average_loss = 0

                            # Checkpoint periodically and always right before
                            # terminating at max_steps.
                            if global_step >= self.args.max_steps or \
                                (global_step %
                                 self.args.num_steps_per_checkpoint) == 0:
                                if self.team_rank == 0:
                                    # Save a trained model
                                    logger.info("** ** Saving model ** **")
                                    self.snapshot.save(global_step, f_id,
                                                       files)

                            if global_step >= self.args.max_steps:
                                del train_dataloader
                                # Synchronize all ranks before reporting and
                                # returning.
                                torch.distributed.barrier()
                                if torch.distributed.get_rank() == 0:
                                    print("Total time taken {}".format(
                                        time.time() - begin))
                                return self.args

                    del train_dataloader
                    # Make sure pool has finished and switch train_dataloader
                    # NOTE: Will block until complete
                    train_dataloader, data_file = dataset_future.result(
                        timeout=None)

                epoch += 1

    def train_online_distillation_original(self):
        """Online-distillation training loop (synchronous variant).

        Identical shard streaming to ``train_simple`` (no gradient
        accumulation), but after ``args.burnin_steps`` each step adds a
        distillation term: the peer model (``self.another_model``) is run
        on the same batch under ``no_grad`` and the student is penalized
        for diverging from it on both heads, weighted by
        ``args.distillation_weight``.  Returns ``self.args`` at
        ``args.max_steps``.
        """
        global_step = self.snapshot.global_step or 0

        if self.is_main_process():
            print("SEED {}".format(self.args.seed))
            logger.info("***** Running training *****")
            # logger.info("  Num examples = %d", len(train_data))
            logger.info("  Batch size = %d", self.args.train_batch_size)
            print("  LR = ", self.args.learning_rate)
            print("  Online Distillation")
            print("Training. . .")

        self.model.train()
        average_loss = 0.0  # averaged loss every self.args.log_freq steps
        average_dloss_0 = 0.0  # averaged loss every self.args.log_freq steps
        average_dloss_1 = 0.0
        epoch = 0
        begin = None

        # Note: We loop infinitely over epochs, termination is handled via
        #       iteration count
        with ThreadPoolExecutor(1) as pool:
            while True:
                dataset_future, f_start_id, files, data_file = \
                    self.init_dataloader(epoch, pool)
                previous_file = data_file
                train_dataloader, _ = dataset_future.result(timeout=None)

                # NOTE(review): fp16 overflow flag for self.all_reduce --
                # confirm against the all_reduce implementation.
                overflow_buf = torch.cuda.IntTensor([0])

                for f_id in range(f_start_id + 1, len(files)):
                    logger.info("file no %s file %s" % (f_id, previous_file))
                    dataset_future, data_file = \
                        self.update_dataloader(pool, f_id, files)
                    previous_file = data_file

                    for batch in train_dataloader:
                        if begin is None:
                            begin = time.time()
                        # `step` counts across both phases so burn-in /
                        # distillation scheduling survives the phase switch.
                        step = global_step
                        if self.args.phase2:
                            step += self.args.phase1_end_step
                        if step >= self.args.burnin_steps and \
                                (step % self.args.distillation_steps) == 0:
                            # Periodically refresh the peer model
                            # (presumably pulls the other team's weights --
                            # TODO confirm comm_model semantics).
                            self.comm_model()

                        batch = [t.to(self.device) for t in batch]
                        _, _, _, masked_lm_labels, _ = batch
                        if step < self.args.burnin_steps:
                            # Burn-in: plain pre-training loss; keep zero
                            # dloss tensors so the logging below is uniform.
                            loss = self.forward(self.model, batch)
                            dloss0 = torch.zeros(())
                            dloss1 = torch.zeros(())
                        else:
                            out0, out1 = self.forward(self.model,
                                                      batch,
                                                      calc_loss=False)
                            # Teacher forward only provides targets; no
                            # gradients flow into the peer model.
                            with torch.no_grad():
                                aout0, aout1 = self.forward(self.another_model,
                                                            batch,
                                                            calc_loss=False)
                            loss = self.loss(out0, out1, batch)
                            # dloss0: MLM head, masked to labeled positions;
                            # dloss1: NSP head, unmasked.
                            dloss0 = \
                                self.compute_distillation_loss(
                                    out0, aout0, masked_lm_labels.view(-1))
                            dloss1 = \
                                self.compute_distillation_loss(out1, aout1)
                            dloss = dloss0 + dloss1
                            loss = loss + \
                                self.args.distillation_weight * dloss
                        self.backward(loss)
                        self.all_reduce(overflow_buf)
                        global_step = self.take_optimizer_step(global_step)
                        average_loss += loss.item()
                        average_dloss_0 += dloss0.item()
                        average_dloss_1 += dloss1.item()

                        if global_step % self.args.log_freq == 0:
                            divisor = self.args.log_freq
                            if self.is_main_process():
                                print(
                                    "Team: {} Step:{} Average Loss = {} Average dLoss = {} {}"
                                    .format(self.team, global_step,
                                            average_loss / divisor,
                                            average_dloss_0 / divisor,
                                            average_dloss_1 / divisor))
                            # Reset the running sums on every rank.
                            average_loss = 0
                            average_dloss_0 = 0
                            average_dloss_1 = 0

                        # Checkpoint periodically and right before exit.
                        if global_step >= self.args.max_steps or \
                            (global_step %
                             self.args.num_steps_per_checkpoint) == 0:
                            if self.team_rank == 0:
                                # Save a trained model
                                logger.info("** ** Saving model ** **")
                                self.snapshot.save(global_step, f_id, files)

                            if global_step >= self.args.max_steps:
                                del train_dataloader
                                torch.distributed.barrier()
                                if torch.distributed.get_rank() == 0:
                                    print("Total time taken {}".format(
                                        time.time() - begin))
                                return self.args

                    del train_dataloader
                    # Make sure pool has finished and switch train_dataloader
                    # NOTE: Will block until complete
                    train_dataloader, data_file = dataset_future.result(
                        timeout=None)

                epoch += 1

    def train_online_distillation_overlap(self):
        """Online distillation with CUDA-stream overlap.

        Same objective as ``train_online_distillation_original``, but the
        peer-model forward pass and the gradient all-reduce run on side
        CUDA streams so they overlap with the main model's work.  A
        one-batch pipeline is used: the peer forward runs on ``next_batch``
        and its output (``another_output``) is consumed on the following
        iteration.  Returns ``self.args`` at ``args.max_steps``.
        """
        global_step = self.snapshot.global_step or 0

        main_stream = torch.cuda.Stream()
        another_model_fwd_stream = torch.cuda.Stream()
        all_reduce_stream = torch.cuda.Stream()
        distillation_stream = torch.cuda.Stream()

        # NOTE(review): each event is recorded on the current stream and
        # then waited on inside a side stream (and vice versa), fencing the
        # side-stream work against the default stream -- confirm the
        # intended ordering, as Event.wait() applies to the current stream.
        fwd_event = torch.cuda.Event()
        bwd_event = torch.cuda.Event()
        another_model_fwd_event = torch.cuda.Event()
        all_reduce_event = torch.cuda.Event()
        distillation_event = torch.cuda.Event()

        if self.is_main_process():
            print("SEED {}".format(self.args.seed))
            logger.info("***** Running training *****")
            # logger.info("  Num examples = %d", len(train_data))
            logger.info("  Batch size = %d", self.args.train_batch_size)
            print("  LR = ", self.args.learning_rate)
            print("  Online Distillation")
            print("Training. . .")

        self.model.train()
        average_loss = 0.0  # averaged loss every self.args.log_freq steps
        average_dloss_0 = 0
        average_dloss_1 = 0
        epoch = 0
        begin = None

        # Note: We loop infinitely over epochs, termination is handled via
        #       iteration count
        batch = None  # batch currently being trained on (pipeline stage 1)
        another_output = None  # peer output computed last iteration
        with ThreadPoolExecutor(1) as pool:
            while True:
                dataset_future, f_start_id, files, data_file = \
                    self.init_dataloader(epoch, pool)
                previous_file = data_file
                train_dataloader, _ = dataset_future.result(timeout=None)

                # NOTE(review): fp16 overflow flag for self.all_reduce.
                overflow_buf = torch.cuda.IntTensor([0])

                for f_id in range(f_start_id + 1, len(files)):
                    logger.info("file no %s file %s" % (f_id, previous_file))
                    dataset_future, data_file = \
                        self.update_dataloader(pool, f_id, files)
                    previous_file = data_file

                    for next_batch in train_dataloader:
                        next_batch = [t.to(self.device) for t in next_batch]
                        if batch is None:
                            # Prime the pipeline: stash the first batch and
                            # start training from the second iteration.
                            batch = next_batch
                            continue
                        if begin is None:
                            begin = time.time()

                        # `step` counts across both phases.
                        step = global_step
                        if self.args.phase2:
                            step += self.args.phase1_end_step

                        _, _, _, masked_lm_labels, _ = batch
                        fwd_event.record()
                        distillation_event.record()
                        if step >= self.args.burnin_steps:
                            # Refresh the peer model on its own stream so
                            # the transfer overlaps with our forward pass.
                            with torch.cuda.stream(distillation_stream):
                                distillation_event.wait()
                                if (step % self.args.distillation_steps) \
                                        == 0:
                                    self.comm_model()
                                distillation_event.record()

                        with torch.cuda.stream(main_stream):
                            fwd_event.wait()
                            if another_output is None:
                                # Burn-in (or first distilled step): no peer
                                # output available yet.
                                loss = self.forward(self.model, batch)
                                dloss0 = torch.zeros(())
                                dloss1 = torch.zeros(())
                            else:
                                out0, out1 = self.forward(self.model,
                                                          batch,
                                                          calc_loss=False)
                                # Peer output for `batch`, computed on the
                                # previous iteration.
                                aout0, aout1 = another_output
                                loss = self.loss(out0, out1, batch)
                                dloss0 = \
                                    self.compute_distillation_loss(
                                        out0, aout0,
                                        masked_lm_labels.view(-1))
                                dloss1 = \
                                    self.compute_distillation_loss(out1,
                                                                   aout1)
                                dloss = dloss0 + dloss1

                                loss = loss + \
                                    self.args.distillation_weight * dloss
                            fwd_event.record()
                        fwd_event.wait()

                        bwd_event.record()
                        with torch.cuda.stream(main_stream):
                            bwd_event.wait()
                            self.backward(loss)
                            bwd_event.record()
                        bwd_event.wait()
                        distillation_event.wait()

                        all_reduce_event.record()
                        another_model_fwd_event.record()
                        # Gradient all-reduce and the peer forward for the
                        # NEXT batch run concurrently on separate streams.
                        with torch.cuda.stream(all_reduce_stream):
                            all_reduce_event.wait()
                            self.all_reduce(overflow_buf)
                            all_reduce_event.record()

                        if step >= self.args.burnin_steps:
                            with torch.cuda.stream(another_model_fwd_stream):
                                another_model_fwd_event.wait()
                                with torch.no_grad():
                                    another_output = self.forward(
                                        self.another_model,
                                        next_batch,
                                        calc_loss=False)
                                another_model_fwd_event.record()
                        all_reduce_event.wait()
                        another_model_fwd_event.wait()

                        global_step = self.take_optimizer_step(global_step)

                        average_loss += loss.item()
                        average_dloss_0 += dloss0.item()
                        average_dloss_1 += dloss1.item()
                        if global_step % self.args.log_freq == 0:
                            divisor = self.args.log_freq
                            if self.is_main_process():
                                print(
                                    "Team: {} Step:{} Average Loss = {} Average dLoss = {} {}"
                                    .format(self.team, global_step,
                                            average_loss / divisor,
                                            average_dloss_0 / divisor,
                                            average_dloss_1 / divisor))
                            # Reset the running sums on every rank.
                            average_loss = 0
                            average_dloss_0 = 0
                            average_dloss_1 = 0

                        # Checkpoint periodically and right before exit.
                        if global_step >= self.args.max_steps or \
                            (global_step %
                             self.args.num_steps_per_checkpoint) == 0:
                            if self.team_rank == 0:
                                # Save a trained model
                                logger.info("** ** Saving model ** **")
                                self.snapshot.save(global_step, f_id, files)

                        if global_step >= self.args.max_steps:
                            del train_dataloader
                            torch.distributed.barrier()
                            if torch.distributed.get_rank() == 0:
                                print(
                                    "Total time taken {}".format(time.time() -
                                                                 begin))
                            return self.args
                        # Advance the pipeline.
                        batch = next_batch

                    del train_dataloader
                    # Make sure pool has finished and switch train_dataloader
                    # NOTE: Will block until complete
                    train_dataloader, data_file = dataset_future.result(
                        timeout=None)

                epoch += 1

    def train_online_distillation_logit(self):
        """Online distillation by exchanging logits instead of weights.

        During burn-in each team trains on its own data order.  Afterwards
        every team is re-seeded identically (shared ``args.data_seed``) so
        all teams see the same batches; the teams then all-reduce their
        logits and each distills against the aggregate of the others'
        outputs.  Returns ``self.args`` at ``args.max_steps``.
        """
        global_step = self.snapshot.global_step or 0

        if self.is_main_process():
            print("SEED {}".format(self.args.seed))
            logger.info("***** Running training *****")
            # logger.info("  Num examples = %d", len(train_data))
            logger.info("  Batch size = %d", self.args.train_batch_size)
            print("  LR = ", self.args.learning_rate)
            print("  Online Distillation")
            print("Training. . .")

        self.model.train()
        average_loss = 0.0  # averaged loss every self.args.log_freq steps
        average_dloss_0 = 0.0
        average_dloss_1 = 0.0
        epoch = 0
        begin = None

        # Note: We loop infinitely over epochs, termination is handled via
        #       iteration count
        # Shared RNG: after burn-in, every team shuffles files identically.
        rng = random.Random(self.args.data_seed)
        cnt = 0  # epoch-restart counter, varies the shared seed per pass
        with ThreadPoolExecutor(1) as pool:
            while True:
                cnt += 1

                step = global_step
                if self.args.phase2:
                    step += self.args.phase1_end_step
                if step < self.args.burnin_steps:
                    # Burn-in: independent data order per team.
                    dataset_future, f_start_id, files, data_file = \
                        self.init_dataloader(epoch, pool)
                    use_same_data = False
                else:
                    # Distillation: deterministic, team-identical data order
                    # so logits line up batch-for-batch across teams.
                    torch.manual_seed(self.args.data_seed + cnt)
                    dataset_future, f_start_id, files, data_file = \
                        self.init_dataloader(epoch, pool, rng)
                    use_same_data = True
                previous_file = data_file
                train_dataloader, _ = dataset_future.result(timeout=None)

                # NOTE(review): fp16 overflow flag for self.all_reduce.
                overflow_buf = torch.cuda.IntTensor([0])

                for f_id in range(f_start_id + 1, len(files)):
                    logger.info("file no %s file %s" % (f_id, previous_file))
                    dataset_future, data_file = \
                        self.update_dataloader(pool, f_id, files)
                    previous_file = data_file

                    for batch in train_dataloader:
                        if begin is None:
                            begin = time.time()
                        step = global_step
                        if self.args.phase2:
                            step += self.args.phase1_end_step
                        # Burn-in just ended: abandon the independent data
                        # order and restart with the shared one (see the
                        # matching break at the bottom of the file loop).
                        if step == self.args.burnin_steps and \
                                not use_same_data:
                            break

                        batch = [t.to(self.device) for t in batch]
                        _, _, _, masked_lm_labels, _ = batch

                        aout0 = None
                        aout1 = None
                        if step < self.args.burnin_steps:
                            loss = self.forward(self.model, batch)
                            dloss0 = torch.zeros(())
                            dloss1 = torch.zeros(())
                        else:
                            out0, out1 = self.forward(self.model,
                                                      batch,
                                                      calc_loss=False)
                            mask = masked_lm_labels.view(-1)

                            c = out0.shape[-1]
                            # Only exchange logits at masked (labeled)
                            # positions to reduce communication volume.
                            dout0 = out0.view(-1, c)
                            dout0 = dout0[mask != -1]
                            with torch.no_grad():
                                aout0 = dout0.detach().clone()
                                aout1 = out1.detach().clone()
                                flat_dist_call([aout0, aout1],
                                               torch.distributed.all_reduce,
                                               (torch.distributed.ReduceOp.SUM,
                                                self.equalize_data_group))
                                # Recover the other teams' contribution by
                                # removing our own.  NOTE(review): the
                                # `* self.size` implies the reduced value is
                                # a mean, despite ReduceOp.SUM -- confirm
                                # whether flat_dist_call/all_reduce averages.
                                aout0 = aout0 * self.size - dout0
                                aout1 = aout1 * self.size - out1
                            loss = self.loss(out0, out1, batch)
                            dloss0 = \
                                self.compute_distillation_loss(dout0, aout0)
                            dloss1 = \
                                self.compute_distillation_loss(out1, aout1)
                            dloss = dloss0 + dloss1
                            loss = loss + \
                                self.args.distillation_weight * dloss
                        self.backward(loss)

                        self.all_reduce(overflow_buf)
                        global_step = self.take_optimizer_step(global_step)

                        average_loss += loss.item()
                        average_dloss_0 += dloss0.item()
                        average_dloss_1 += dloss1.item()
                        if global_step % self.args.log_freq == 0:
                            divisor = self.args.log_freq
                            if self.is_main_process():
                                print(
                                    "Team: {} Step:{} Average Loss = {} Average dLoss = {} {}"
                                    .format(self.team, global_step,
                                            average_loss / divisor,
                                            average_dloss_0 / divisor,
                                            average_dloss_1 / divisor))
                            # Reset the running sums on every rank.
                            average_loss = 0
                            average_dloss_0 = 0
                            average_dloss_1 = 0

                        # Checkpoint periodically and right before exit.
                        if global_step >= self.args.max_steps or \
                            (global_step %
                             self.args.num_steps_per_checkpoint) == 0:
                            if self.team_rank == 0:
                                # Save a trained model
                                logger.info("** ** Saving model ** **")
                                self.snapshot.save(global_step, f_id, files)

                        if global_step >= self.args.max_steps:
                            del train_dataloader
                            torch.distributed.barrier()
                            if torch.distributed.get_rank() == 0:
                                print(
                                    "Total time taken {}".format(time.time() -
                                                                 begin))
                            return self.args

                    del train_dataloader
                    # Make sure pool has finished and switch train_dataloader
                    # NOTE: Will block until complete
                    train_dataloader, data_file = dataset_future.result(
                        timeout=None)

                    # Propagate the burn-in-ended restart out of the file
                    # loop so a fresh (shared-order) epoch begins.
                    if step == self.args.burnin_steps and not use_same_data:
                        break

                epoch += 1