Python WarmupLinearSchedule Examples

Programming Language: Python

Namespace/Package Name: bert.optimization

Examples at hotexamples.com: 3

Python WarmupLinearSchedule - 3 examples found. These are the top-rated real-world Python examples of bert.optimization.WarmupLinearSchedule, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

WarmupLinearSchedule(3)

get_lr(3)

Example #1

Show file

def main():
    """Pre-train a BERT model from pregenerated epoch files.

    Parses the command line, selects the device (single process, DataParallel
    or distributed, optionally fp16), builds the tokenizer, model and
    optimizer, and then trains for ``--epochs`` epochs over the
    ``epoch_{i}.json`` files found in ``--pregenerated_data``.

    A checkpoint dict with the keys ``"state_dict"`` (model weights) and
    ``"optimizer"`` (optimizer state) is written to ``--output_dir`` every
    ``--save_step`` batches and once more at the end of every epoch.  Only the
    process whose ``local_rank`` is -1 or 0 writes checkpoints.

    With ``--load_model`` the checkpoint ``--load_model_name`` is read from
    ``--output_dir``; the epoch to resume from is parsed out of the trailing
    ``<N>epoch`` marker in that file name.

    Raises:
        AssertionError: if ``--pregenerated_data`` is not a directory.
        SystemExit: if no pregenerated data exists for epoch 0.
        ValueError: if ``--gradient_accumulation_steps`` is below 1, or if it
            exceeds ``--train_batch_size`` (per-step batch size would be 0).
        ImportError: if ``--fp16`` is requested but apex cannot be imported.
    """
    # "@file" on the command line expands to the arguments stored in that file.
    parser = argparse.ArgumentParser(fromfile_prefix_chars="@")

    parser.add_argument("--pregenerated_data",
                        type=Path,
                        required=True,
                        help="The input train corpus.")

    parser.add_argument("--epochs", type=int, required=True)

    parser.add_argument("--bert_model", type=str, required=True)

    parser.add_argument("--bert_config_file",
                        type=str,
                        default="bert_config.json")
    parser.add_argument("--vocab_file", type=str, default="senti_vocab.txt")

    parser.add_argument('--output_dir', type=Path, required=True)

    parser.add_argument("--model_name", type=str, default="senti_base_model")

    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )

    parser.add_argument("--world_size", type=int, default=4)
    parser.add_argument("--start_rank", type=int, default=0)
    parser.add_argument("--server", type=str, default="tcp://127.0.0.1:1234")

    parser.add_argument("--load_model", action="store_true")
    parser.add_argument("--load_model_name", type=str, default="large_model")

    parser.add_argument("--save_step", type=int, default=100000)
    parser.add_argument("--train_batch_size",
                        default=4,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=1e-4,
                        type=float,
                        help="The initial learning rate for Adam.")

    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")

    # NOTE(review): --do_lower_case is parsed but never read in this function.
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    print("local_rank : ", args.local_rank)

    # Count how many epochs of pregenerated data exist on disk and how many
    # training examples each of them holds.
    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                # Same effect as exit(msg), without depending on the
                # site-provided ``exit`` builtin being available.
                raise SystemExit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        # Loop finished without ``break``: data exists for every epoch.
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        # With --no_cuda the model stays on the CPU, so report zero GPUs;
        # otherwise a multi-GPU host would wrap the CPU model in DataParallel
        # further down.
        n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=args.server,
                                             rank=args.local_rank +
                                             args.start_rank,
                                             world_size=args.world_size)
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # --train_batch_size is the total batch per optimizer update; each
    # forward/backward pass processes this fraction of it.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    if args.train_batch_size < 1:
        # Fail here with a clear message instead of a ZeroDivisionError when
        # the number of optimization steps is computed below.
        raise ValueError(
            "Invalid train_batch_size parameter: it should be >= "
            "gradient_accumulation_steps ({})".format(
                args.gradient_accumulation_steps))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logger.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Honour --vocab_file (its default matches the previously hard-coded name).
    tokenizer = Tokenizer(
        os.path.join(args.bert_model, args.vocab_file),
        os.path.join(args.bert_model, "RoBERTa_Sentiment_kor"))

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = math.ceil(total_train_examples /
                                             args.train_batch_size /
                                             args.gradient_accumulation_steps)
    if args.local_rank != -1:
        # Each distributed process only sees its share of the data.
        num_train_optimization_steps = math.ceil(
            num_train_optimization_steps / torch.distributed.get_world_size())

    # Prepare model
    config = BertConfig.from_json_file(
        os.path.join(args.bert_model, args.bert_config_file))
    logger.info('{}'.format(config))
    ###############################################
    # Load Model
    loaded_states = None
    if args.load_model:
        # Read the checkpoint a single time and reuse it for the model weights
        # here and for the optimizer state further down.  map_location places
        # the tensors on this process's device rather than on whichever device
        # they were saved from.
        loaded_states = torch.load(os.path.join(args.output_dir,
                                                args.load_model_name),
                                   map_location=device)
        model = BertForPreTraining.from_pretrained(
            args.bert_model, state_dict=loaded_states["state_dict"])
    else:
        model = BertForPreTraining(config)
    ###############################################

    if args.fp16:
        model.half()
    model.to(device)

    if args.local_rank != -1:
        # Prefer apex's DistributedDataParallel; fall back to the torch one.
        try:
            from apex.parallel import DistributedDataParallel as DDP
            model = DDP(model)

        except ImportError:
            from torch.nn.parallel import DistributedDataParallel as DDP
            model = DDP(model,
                        device_ids=[args.local_rank],
                        output_device=args.local_rank)

    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Parameters whose name contains one of the ``no_decay`` fragments get no
    # weight decay; every other parameter uses 0.01.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
        # Only the fp16 path drives the schedule by hand (see training loop).
        warmup_linear = WarmupLinearSchedule(
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)
    epoch0 = 0
    # NOTE(review): global_step always starts at 0, also when resuming, so the
    # manual fp16 schedule below restarts from step 0 -- confirm this is intended.
    global_step = 0
    if args.load_model:
        ###############################################
        # Load Model
        logger.info(f"***** Load Model {args.load_model_name} *****")
        optimizer.load_state_dict(loaded_states["optimizer"])
        # The checkpoint is no longer needed; drop the reference so its
        # tensors can be freed before training starts.
        loaded_states = None

        # Checkpoints saved at epoch end are named "<model_name>_<N>epoch";
        # the last such marker in the file name is the epoch to resume from.
        regex = re.compile(r'\d+epoch')
        epoch0 = int(
            regex.findall(args.load_model_name)[-1].replace('epoch', ''))
        logger.info('extract {} -> epoch0 : {}'.format(args.load_model_name,
                                                       epoch0))

        ###############################################

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {total_train_examples}")
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    model.train()
    # model.eval()
    for epoch in range(epoch0, args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)

        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc='training..') as pbar:
            for step, batch in enumerate(train_dataloader):

                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, lm_label_ids = batch

                loss = model(input_ids, input_mask, lm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()

                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                # tr_loss holds losses already divided by the accumulation
                # factor; multiply back to report the per-batch mean.
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps

                if (step + 1) % 50 == 0:
                    pbar.set_description(
                        "Epoch = {}, global_step = {}, loss = {:.5f}".format(
                            epoch, global_step + 1, mean_loss))
                    logger.info(
                        "Epoch = {}, global_step = {}, loss = {:.5f}".format(
                            epoch, global_step + 1, mean_loss))

                # Apply an optimizer update once enough batches accumulated.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        # NOTE(review): confirm the second positional argument
                        # matches WarmupLinearSchedule.get_lr's signature.
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                # Intermediate checkpoint, counted in batches within the epoch.
                if (step + 1) % args.save_step == 0:
                    if args.local_rank == -1 or args.local_rank == 0:
                        logger.info(
                            "** ** * Saving {} - step model ** ** * ".format(
                                global_step))
                        output_model_file = os.path.join(
                            args.output_dir,
                            args.model_name + "_{}step".format(global_step))
                        # Unwrap (Distributed)DataParallel before saving.
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        state = {
                            "state_dict": model_to_save.state_dict(),
                            "optimizer": optimizer.state_dict()
                        }
                        torch.save(state, output_model_file)

        # End-of-epoch checkpoint; the file name uses the 1-based epoch count,
        # which is what the resume logic above parses back out.
        if args.local_rank == -1 or args.local_rank == 0:
            logger.info(
                "** ** * Saving {} - epoch model ** ** * ".format(epoch))
            output_model_file = os.path.join(
                args.output_dir,
                args.model_name + "_{}epoch".format(epoch + 1))
            model_to_save = model.module if hasattr(model, 'module') else model
            state = {
                "state_dict": model_to_save.state_dict(),
                "optimizer": optimizer.state_dict()
            }
            torch.save(state, output_model_file)

Example #2

Show file

def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--negative_weight", default=1., type=float)
    parser.add_argument("--neutral_words_file", default='data/identity.csv')

    # if true, use test data instead of val data
    parser.add_argument("--test", action='store_true')

    # Explanation specific arguments below

    # whether run explanation algorithms
    parser.add_argument("--explain",
                        action='store_true',
                        help='if true, explain test set predictions')
    parser.add_argument("--debug", action='store_true')

    # which algorithm to run
    parser.add_argument("--algo", choices=['soc'])

    # the output filename without postfix
    parser.add_argument("--output_filename", default='temp.tmp')

    # see utils/config.py
    parser.add_argument("--use_padding_variant", action='store_true')
    parser.add_argument("--mask_outside_nb", action='store_true')
    parser.add_argument("--nb_range", type=int)
    parser.add_argument("--sample_n", type=int)

    # whether use explanation regularization
    parser.add_argument("--reg_explanations", action='store_true')
    parser.add_argument("--reg_strength", type=float)
    parser.add_argument("--reg_mse", action='store_true')

    # whether discard other neutral words during regularization. default: False
    parser.add_argument("--discard_other_nw",
                        action='store_false',
                        dest='keep_other_nw')

    # whether remove neutral words when loading datasets
    parser.add_argument("--remove_nw", action='store_true')

    # if true, generate hierarchical explanations instead of word level outputs.
    # Only useful when the --explain flag is also added.
    parser.add_argument("--hiex", action='store_true')
    parser.add_argument("--hiex_tree_height", default=5, type=int)

    # whether add the sentence itself to the sample set in SOC
    parser.add_argument("--hiex_add_itself", action='store_true')

    # the directory where the lm is stored
    parser.add_argument("--lm_dir", default='runs/lm')

    # if configured, only generate explanations for instances with given line numbers
    parser.add_argument("--hiex_idxs", default=None)
    # if true, use absolute values of explanations for hierarchical clustering
    parser.add_argument("--hiex_abs", action='store_true')

    # if either of the two is true, only generate explanations for positive / negative instances
    parser.add_argument("--only_positive", action='store_true')
    parser.add_argument("--only_negative", action='store_true')

    # stop after generating x explanation
    parser.add_argument("--stop", default=100000000, type=int)

    # early stopping with decreasing learning rate. 0: direct exit when validation F1 decreases
    parser.add_argument("--early_stop", default=5, type=int)

    # other external arguments originally here in pytorch_transformers

    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--validate_steps",
                        default=200,
                        type=int,
                        help="validate once for how many steps")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    combine_args(configs, args)
    args = configs

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        'gab': GabProcessor,
        'ws': WSProcessor,
        'nyt': NytProcessor,
        'MT': MTProcessor,
        #'multi-label': multilabel_Processor,
    }

    output_modes = {
        'gab': 'classification',
        'ws': 'classification',
        'nyt': 'classification'
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    #if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #    raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # save configs
    f = open(os.path.join(args.output_dir, 'args.json'), 'w')
    json.dump(args.__dict__, f, indent=4)
    f.close()

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    processor = processors[task_name](configs, tokenizer=tokenizer)
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    if args.do_train:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, cache_dir=cache_dir, num_labels=num_labels)

    else:
        model = BertForSequenceClassification.from_pretrained(
            args.output_dir, num_labels=num_labels)
    model.to(device)

    if args.fp16:
        model.half()

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    # elif n_gpu > 1:
    #     model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
        warmup_linear = WarmupLinearSchedule(
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps)

    else:
        if args.do_train:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss, tr_reg_loss = 0, 0
    tr_reg_cnt = 0
    epoch = -1
    val_best_f1 = -1
    val_best_loss = 1e10
    early_stop_countdown = args.early_stop

    if args.reg_explanations:
        train_lm_dataloder = processor.get_dataloader('train',
                                                      configs.train_batch_size)
        dev_lm_dataloader = processor.get_dataloader('dev',
                                                     configs.train_batch_size)
        explainer = SamplingAndOcclusionExplain(
            model,
            configs,
            tokenizer,
            device=device,
            vocab=tokenizer.vocab,
            train_dataloader=train_lm_dataloder,
            dev_dataloader=dev_lm_dataloader,
            lm_dir=args.lm_dir,
            output_path=os.path.join(configs.output_dir,
                                     configs.output_filename),
        )
    else:
        explainer = None

    if args.do_train:
        epoch = 0
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer, output_mode,
                                                      configs)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        class_weight = torch.FloatTensor([args.negative_weight, 1]).to(device)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model(input_ids, segment_ids, input_mask, labels=None)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss(class_weight)
                    loss = loss_fct(logits.view(-1, num_labels),
                                    label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                # regularize explanations
                # NOTE: backward performed inside this function to prevent OOM

                if args.reg_explanations:
                    reg_loss, reg_cnt = explainer.compute_explanation_loss(
                        input_ids,
                        input_mask,
                        segment_ids,
                        label_ids,
                        do_backprop=True)
                    tr_reg_loss += reg_loss  # float
                    tr_reg_cnt += reg_cnt

                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if global_step % args.validate_steps == 0:
                    val_result = validate(args, model, processor, tokenizer,
                                          output_mode, label_list, device,
                                          num_labels, task_name, tr_loss,
                                          global_step, epoch, explainer)
                    val_acc, val_f1 = val_result['acc'], val_result['f1']
                    if val_f1 > val_best_f1:
                        val_best_f1 = val_f1
                        if args.local_rank == -1 or torch.distributed.get_rank(
                        ) == 0:
                            save_model(args, model, tokenizer, num_labels)
                    else:
                        # halve the learning rate
                        for param_group in optimizer.param_groups:
                            param_group['lr'] *= 0.5
                        early_stop_countdown -= 1
                        logger.info(
                            "Reducing learning rate... Early stop countdown %d"
                            % early_stop_countdown)
                    if early_stop_countdown < 0:
                        break
            if early_stop_countdown < 0:
                break
            epoch += 1

            # training finish ############################

    # if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
    #     if not args.explain:
    #         args.test = True
    #         validate(args, model, processor, tokenizer, output_mode, label_list, device, num_labels,
    #                  task_name, tr_loss, global_step=0, epoch=-1, explainer=explainer)
    #     else:
    #         args.test = True
    #         explain(args, model, processor, tokenizer, output_mode, label_list, device)
    if not args.explain:
        args.test = True
        print('--Test_args.test: %s' % str(args.test))  #Test_args.test: True
        validate(args,
                 model,
                 processor,
                 tokenizer,
                 output_mode,
                 label_list,
                 device,
                 num_labels,
                 task_name,
                 tr_loss,
                 global_step=888,
                 epoch=-1,
                 explainer=explainer)
        args.test = False
    else:
        print('--Test_args.test: %s' % str(args.test))  # Test_args.test: True
        args.test = True
        explain(args, model, processor, tokenizer, output_mode, label_list,
                device)
        args.test = False

Example #3

Show file

File: run_nlpcc_dp.py Project: yufish/cddp

def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help="The output directory where the model checkpoints will be written."
    )

    parser.add_argument("--train_file", default=None, type=str)
    parser.add_argument("--val_file", default=None, type=str)
    parser.add_argument("--test_file", default=None, type=str)
    parser.add_argument("--test_output", default=None, type=str)
    parser.add_argument("--label_vocab", default=None, type=str, required=True)
    parser.add_argument("--punc_set", default='PU', type=str)
    parser.add_argument("--has_confidence", action='store_true')
    parser.add_argument("--only_save_bert", action='store_true')

    parser.add_argument("--arc_space", default=512, type=int)
    parser.add_argument("--type_space", default=128, type=int)

    parser.add_argument("--log_file", default=None, type=str)

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action='store_true',
                        help="Whether to run predict on the test set.")
    parser.add_argument("--do_greedy_predict",
                        action='store_true',
                        help="Whether to run predict on the test set.")
    parser.add_argument("--do_ensemble_predict",
                        action='store_true',
                        help="Whether to run predict on the test set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--test_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for test.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.log_file is None:
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
            datefmt='%m/%d/%Y %H:%M:%S',
            level=logging.INFO)
    else:
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
            filename=args.log_file,
            filemode='w',
            datefmt='%m/%d/%Y %H:%M:%S',
            level=logging.INFO)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict and not args.do_greedy_predict and not args.do_ensemble_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        assert args.output_dir is not None

    if args.do_train and os.path.exists(args.output_dir) and os.listdir(
            args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if args.do_train and not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    label_vocab, label_vocab2idx = load_label_vocab(args.label_vocab)

    punc_set = set(
        args.punc_set.split(',')) if args.punc_set is not None else None

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        assert args.train_file is not None
        train_examples = read_conll_examples(
            args.train_file,
            is_training=True,
            has_confidence=args.has_confidence)

        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    if args.do_train or args.do_predict or args.do_greedy_predict:
        # load the pretrained model
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
        model = BertForDependencyParsing.from_pretrained(
            args.bert_model,
            cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                                   'distributed_{}'.format(args.local_rank)),
            arc_space=args.arc_space,
            type_space=args.type_space,
            num_labels=len(label_vocab))

        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        #
        parser = model.module if hasattr(model, 'module') else model
    elif args.do_ensemble_predict:
        bert_models = args.bert_model.split(',')
        assert len(bert_models) > 1
        tokenizer = BertTokenizer.from_pretrained(
            bert_models[0], do_lower_case=args.do_lower_case)
        models = []
        for bm in bert_models:
            model = BertForDependencyParsing.from_pretrained(
                bm,
                cache_dir=os.path.join(
                    str(PYTORCH_PRETRAINED_BERT_CACHE),
                    'distributed_{}'.format(args.local_rank)),
                arc_space=args.arc_space,
                type_space=args.type_space,
                num_labels=len(label_vocab))
            model.to(device)
            model.eval()
            models.append(model)
        parser = models[0].module if hasattr(models[0],
                                             'module') else models[0]

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # so it produces None gradients that break apex
        # !!! NOTE why?
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    # start training loop
    if args.do_train:
        global_step = 0
        train_features = convert_examples_to_features(
            train_examples,
            tokenizer,
            args.max_seq_length,
            label_vocab2idx,
            True,
            has_confidence=args.has_confidence)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.float32)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_lengths = torch.tensor([f.seq_len for f in train_features],
                                   dtype=torch.long)
        all_heads = torch.tensor([f.heads for f in train_features],
                                 dtype=torch.long)
        all_labels = torch.tensor([f.labels for f in train_features],
                                  dtype=torch.long)

        if args.has_confidence:
            all_confidence = torch.tensor(
                [f.confidence for f in train_features], dtype=torch.float32)
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_lengths, all_heads,
                                       all_labels, all_confidence)
        else:
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_lengths, all_heads,
                                       all_labels)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        if args.do_eval:
            assert args.val_file is not None
            eval_examples = read_conll_examples(args.val_file,
                                                is_training=False,
                                                has_confidence=False)
            eval_features = convert_examples_to_features(eval_examples,
                                                         tokenizer,
                                                         args.max_seq_length,
                                                         label_vocab2idx,
                                                         False,
                                                         has_confidence=False)
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)

            all_example_ids = torch.tensor(
                [f.example_id for f in eval_features], dtype=torch.long)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.float32)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_lengths = torch.tensor([f.seq_len for f in eval_features],
                                       dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_lengths,
                                      all_example_ids)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

        best_uas = 0
        best_las = 0
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            logger.info("Training epoch: {}".format(epoch))
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            model.train()
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                if args.has_confidence:
                    input_ids, input_mask, segment_ids, lengths, heads, label_ids, confidence = batch
                else:
                    confidence = None
                    input_ids, input_mask, segment_ids, lengths, heads, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, heads,
                             label_ids, confidence)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if global_step % 100 == 0:
                    logger.info("Training loss: {}, global step: {}".format(
                        tr_loss / nb_tr_steps, global_step))

            # we eval every epoch
            if args.do_eval and (args.local_rank == -1
                                 or torch.distributed.get_rank() == 0):
                logger.info("***** Running evaluation *****")

                model.eval()

                eval_predict_words, eval_predict_postags, eval_predict_heads, eval_predict_labels = [],[],[],[]

                for input_ids, input_mask, segment_ids, lengths, example_ids in tqdm(
                        eval_dataloader, desc="Evaluating"):
                    example_ids = example_ids.numpy()

                    batch_words = [
                        eval_features[eid].example.sentence
                        for eid in example_ids
                    ]
                    batch_postags = [
                        eval_features[eid].example.postags
                        for eid in example_ids
                    ]
                    batch_word_index = [
                        eval_features[eid].word_index for eid in example_ids
                    ]  # token -> word
                    batch_token_starts = [
                        eval_features[eid].token_starts for eid in example_ids
                    ]  # word -> token start
                    batch_heads = [
                        eval_features[eid].example.heads for eid in example_ids
                    ]

                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    heads = heads.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        # tmp_eval_loss = model(input_ids, segment_ids, input_mask, heads, label_ids)
                        energy = model(input_ids, segment_ids, input_mask)

                    heads_pred, labels_pred = parser.decode_MST(
                        energy.cpu().numpy(),
                        lengths.numpy(),
                        leading_symbolic=0,
                        labeled=True)

                    # map the subword-level predictions back to word level using the
                    # token -> word index and the word -> token-start map
                    pred_heads = []
                    pred_labels = []
                    for i in range(len(batch_word_index)):
                        word_index = batch_word_index[i]
                        token_starts = batch_token_starts[i]
                        hpd = []
                        lpd = []
                        for j in range(len(token_starts)):
                            if j == 0:  #[CLS]
                                continue
                            elif j == len(token_starts) - 1:  # [SEP]
                                continue
                            else:
                                hpd.append(
                                    word_index[heads_pred[i, token_starts[j]]])
                                lpd.append(
                                    label_vocab[labels_pred[i,
                                                            token_starts[j]]])
                        pred_heads.append(hpd)
                        pred_labels.append(lpd)

                    eval_predict_words += batch_words
                    eval_predict_postags += batch_postags
                    eval_predict_heads += pred_heads
                    eval_predict_labels += pred_labels

                eval_output_file = os.path.join(args.output_dir, 'eval.pred')

                write_conll_examples(eval_predict_words, eval_predict_postags,
                                     eval_predict_heads, eval_predict_labels,
                                     eval_output_file)

                eval_f = os.popen(
                    "python scripts/eval_nlpcc_dp.py " + args.val_file + " " +
                    eval_output_file, "r")
                result_text = eval_f.read().strip()
                logger.info("***** Eval results *****")
                logger.info(result_text)
                eval_f.close()
                eval_res = re.findall(
                    r'UAS = \d+/\d+ = ([\d\.]+), LAS = \d+/\d+ = ([\d\.]+)',
                    result_text)
                assert len(eval_res) > 0
                eval_res = eval_res[0]

                eval_uas = float(eval_res[0])
                eval_las = float(eval_res[1])

                # save model
                if best_las < eval_las or (eval_las == best_las
                                           and best_uas < eval_uas):
                    best_uas = eval_uas
                    best_las = eval_las

                    logger.info(
                        "new best uas  %.2f%% las %.2f%%, saving models.",
                        best_uas, best_las)

                    # Save a trained model, configuration and tokenizer
                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model it-self

                    # If we save using the predefined names, we can load using `from_pretrained`
                    output_model_file = os.path.join(args.output_dir,
                                                     WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir,
                                                      CONFIG_NAME)

                    model_dict = model_to_save.state_dict()
                    if args.only_save_bert:
                        model_dict = {
                            k: v
                            for k, v in model_dict.items() if 'bert.' in k
                        }

                    torch.save(model_dict, output_model_file)
                    model_to_save.config.to_json_file(output_config_file)
                    tokenizer.save_vocabulary(args.output_dir)

    # start predict
    if args.do_predict:
        model.eval()
        assert args.test_file is not None
        test_examples = read_conll_examples(args.test_file,
                                            is_training=False,
                                            has_confidence=False)
        test_features = convert_examples_to_features(test_examples,
                                                     tokenizer,
                                                     args.max_seq_length,
                                                     label_vocab2idx,
                                                     False,
                                                     has_confidence=False)
        logger.info("***** Running prediction *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.test_batch_size)
        all_example_ids = torch.tensor([f.example_id for f in test_features],
                                       dtype=torch.long)
        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.float32)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_lengths = torch.tensor([f.seq_len for f in test_features],
                                   dtype=torch.long)

        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_lengths,
                                  all_example_ids)

        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.test_batch_size)

        test_predict_words, test_predict_postags, test_predict_heads, test_predict_labels = [],[],[],[]
        for batch_id, batch in enumerate(
                tqdm(test_dataloader, desc="Predicting")):
            input_ids, input_mask, segment_ids, lengths, example_ids = batch
            example_ids = example_ids.numpy()
            batch_words = [
                test_features[eid].example.sentence for eid in example_ids
            ]
            batch_postags = [
                test_features[eid].example.postags for eid in example_ids
            ]
            batch_word_index = [
                test_features[eid].word_index for eid in example_ids
            ]  # token -> word
            batch_token_starts = [
                test_features[eid].token_starts for eid in example_ids
            ]  # word -> token start

            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            lengths = lengths.numpy()

            with torch.no_grad():
                energy = model(input_ids, segment_ids, input_mask)

            heads_pred, labels_pred = parser.decode_MST(energy.cpu().numpy(),
                                                        lengths,
                                                        leading_symbolic=0,
                                                        labeled=True)

            pred_heads = []
            pred_labels = []
            for i in range(len(batch_word_index)):
                word_index = batch_word_index[i]
                token_starts = batch_token_starts[i]
                hpd = []
                lpd = []
                for j in range(len(token_starts)):
                    if j == 0:  #[CLS]
                        continue
                    elif j == len(token_starts) - 1:  # [SEP]
                        continue
                    else:
                        hpd.append(word_index[heads_pred[i, token_starts[j]]])
                        lpd.append(label_vocab[labels_pred[i,
                                                           token_starts[j]]])
                pred_heads.append(hpd)
                pred_labels.append(lpd)

            test_predict_words += batch_words
            test_predict_postags += batch_postags
            test_predict_heads += pred_heads
            test_predict_labels += pred_labels

        assert args.test_output is not None
        write_conll_examples(test_predict_words, test_predict_postags,
                             test_predict_heads, test_predict_labels,
                             args.test_output)

    if args.do_greedy_predict:
        # Greedy prediction branch: the model is called with
        # greedy_inference=True and returns head/label predictions directly;
        # these are mapped back to word level and written to
        # args.test_output.
        model.eval()
        # Check both paths up front so that a missing output path is reported
        # before the test set is featurized and decoded, not afterwards.
        assert args.test_file is not None
        assert args.test_output is not None

        test_examples = read_conll_examples(args.test_file,
                                            is_training=False,
                                            has_confidence=False)
        test_features = convert_examples_to_features(test_examples,
                                                     tokenizer,
                                                     args.max_seq_length,
                                                     label_vocab2idx,
                                                     False,
                                                     has_confidence=False)
        logger.info("***** Running prediction *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.test_batch_size)

        # Pack the per-example features into tensors. example_id travels with
        # every row and is used below to index back into test_features.
        all_example_ids = torch.tensor([f.example_id for f in test_features],
                                       dtype=torch.long)
        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.float32)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_lengths = torch.tensor([f.seq_len for f in test_features],
                                   dtype=torch.long)

        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_lengths,
                                  all_example_ids)

        # Run prediction for full data, in dataset order.
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.test_batch_size)

        test_predict_words, test_predict_postags = [], []
        test_predict_heads, test_predict_labels = [], []
        for batch in tqdm(test_dataloader, desc="Predicting"):
            # The lengths column is not used by greedy inference.
            input_ids, input_mask, segment_ids, _, example_ids = batch
            example_ids = example_ids.numpy()
            batch_features = [test_features[eid] for eid in example_ids]
            batch_words = [feat.example.sentence for feat in batch_features]
            batch_postags = [feat.example.postags for feat in batch_features]
            # token -> word
            batch_word_index = [feat.word_index for feat in batch_features]
            # word -> token start
            batch_token_starts = [
                feat.token_starts for feat in batch_features
            ]

            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)

            with torch.no_grad():
                heads_pred, labels_pred = model(input_ids,
                                                segment_ids,
                                                input_mask,
                                                greedy_inference=True)

            pred_heads = []
            pred_labels = []
            for i, (word_index, token_starts) in enumerate(
                    zip(batch_word_index, batch_token_starts)):
                # The first and last entries of token_starts belong to [CLS]
                # and [SEP]; only the entries in between produce output.
                inner_starts = token_starts[1:-1]
                # Predictions are read at each word's start token; the
                # predicted head is translated through word_index.
                pred_heads.append([
                    word_index[heads_pred[i, start]] for start in inner_starts
                ])
                pred_labels.append([
                    label_vocab[labels_pred[i, start]]
                    for start in inner_starts
                ])

            test_predict_words += batch_words
            test_predict_postags += batch_postags
            test_predict_heads += pred_heads
            test_predict_labels += pred_labels

        write_conll_examples(test_predict_words, test_predict_postags,
                             test_predict_heads, test_predict_labels,
                             args.test_output)

    if args.do_ensemble_predict:
        # Ensemble prediction branch: the energies produced by every model in
        # `models` are averaged, decoded with parser.decode_MST, mapped back
        # to word level and written to args.test_output.
        #
        # Check both paths up front so that a missing output path is reported
        # before the test set is featurized and decoded, not afterwards.
        assert args.test_file is not None
        assert args.test_output is not None
        # NOTE(review): unlike the greedy branch, no .eval() call is made
        # here; presumably the members of `models` are switched to eval mode
        # where they are loaded -- confirm.

        test_examples = read_conll_examples(args.test_file,
                                            is_training=False,
                                            has_confidence=False)
        test_features = convert_examples_to_features(test_examples,
                                                     tokenizer,
                                                     args.max_seq_length,
                                                     label_vocab2idx,
                                                     False,
                                                     has_confidence=False)
        # NOTE(review): the banner says "evaluation" although this branch
        # writes predictions; the text is kept unchanged so log output stays
        # stable.
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.test_batch_size)

        # Pack the per-example features into tensors. example_id travels with
        # every row and is used below to index back into test_features.
        all_example_ids = torch.tensor([f.example_id for f in test_features],
                                       dtype=torch.long)
        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.float32)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_lengths = torch.tensor([f.seq_len for f in test_features],
                                   dtype=torch.long)

        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_lengths,
                                  all_example_ids)

        # Run prediction for full data, in dataset order.
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.test_batch_size)

        test_predict_words, test_predict_postags = [], []
        test_predict_heads, test_predict_labels = [], []
        for batch in tqdm(test_dataloader, desc="Predicting"):
            input_ids, input_mask, segment_ids, lengths, example_ids = batch
            example_ids = example_ids.numpy()
            batch_features = [test_features[eid] for eid in example_ids]
            batch_words = [feat.example.sentence for feat in batch_features]
            batch_postags = [feat.example.postags for feat in batch_features]
            # token -> word
            batch_word_index = [feat.word_index for feat in batch_features]
            # word -> token start
            batch_token_starts = [
                feat.token_starts for feat in batch_features
            ]

            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            # The decoder below is fed NumPy arrays, so lengths is converted
            # here and never moved to the device.
            lengths = lengths.numpy()

            with torch.no_grad():
                # Average the energies of all ensemble members.
                # NOTE: this loop rebinds the enclosing name `model`.
                energy_sum = None
                for model in models:
                    energy = model(input_ids, segment_ids, input_mask)
                    energy_sum = (energy if energy_sum is None else
                                  energy_sum + energy)

                energy_sum = energy_sum / len(models)

            heads_pred, labels_pred = parser.decode_MST(
                energy_sum.cpu().numpy(),
                lengths,
                leading_symbolic=0,
                labeled=True)

            pred_heads = []
            pred_labels = []
            for i, (word_index, token_starts) in enumerate(
                    zip(batch_word_index, batch_token_starts)):
                # The first and last entries of token_starts belong to [CLS]
                # and [SEP]; only the entries in between produce output.
                inner_starts = token_starts[1:-1]
                # Predictions are read at each word's start token; the
                # predicted head is translated through word_index.
                pred_heads.append([
                    word_index[heads_pred[i, start]] for start in inner_starts
                ])
                pred_labels.append([
                    label_vocab[labels_pred[i, start]]
                    for start in inner_starts
                ])

            test_predict_words += batch_words
            test_predict_postags += batch_postags
            test_predict_heads += pred_heads
            test_predict_labels += pred_labels

        write_conll_examples(test_predict_words, test_predict_postags,
                             test_predict_heads, test_predict_labels,
                             args.test_output)