Example #1
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument(
        "--max_choices_num",
        default=4,
        type=int,
        help=
        "The maximum number of cadicate answer, shorter than this will be padded."
    )

    tokenizer_opts(parser)

    adv_opts(parser)

    args = parser.parse_args()
    args.labels_num = args.max_choices_num

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build multiple choice model.
    model = MultipleChoice(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    # Get logger.
    args.logger = init_logger(args)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_path)
    instances_num = len(trainset)
    batch_size = args.batch_size

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info("Batch size: {}".format(batch_size))
    args.logger.info(
        "The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    if args.use_adv:
        args.adv_method = str2adv[args.adv_type](model)

    total_loss, result, best_result = 0.0, 0.0, 0.0

    args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(trainset)
        src = torch.LongTensor([example[0] for example in trainset])
        tgt = torch.LongTensor([example[1] for example in trainset])
        seg = torch.LongTensor([example[2] for example in trainset])

        model.train()
        for i, (src_batch, tgt_batch, seg_batch,
                _) in enumerate(batch_loader(batch_size, src, tgt, seg)):

            loss = train_model(args, model, optimizer, scheduler, src_batch,
                               tgt_batch, seg_batch)
            total_loss += loss.item()

            if (i + 1) % args.report_steps == 0:
                args.logger.info(
                    "Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                    format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        result = evaluate(args, read_dataset(args, args.dev_path))
        if result[0] > best_result:
            best_result = result[0]
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        args.logger.info("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            args.model.module.load_state_dict(
                torch.load(args.output_model_path))
        else:
            args.model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, read_dataset(args, args.test_path))
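
The loop above consumes batches from `batch_loader(batch_size, src, tgt, seg)`. A minimal sketch of such a loader, assuming it simply slices the pre-built tensors into consecutive mini-batches and yields an unused fourth slot for soft targets (the actual UER-py helper may differ):

def batch_loader(batch_size, src, tgt, seg):
    # Slice the full tensors into consecutive mini-batches;
    # the trailing partial batch is yielded as well.
    instances_num = src.size(0)
    for i in range(0, instances_num, batch_size):
        yield (src[i: i + batch_size],
               tgt[i: i + batch_size],
               seg[i: i + batch_size],
               None)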
Example #2
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build machine reading comprehension model.
    model = MachineReadingComprehension(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    # Get logger.
    args.logger = init_logger(args)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    batch_size = args.batch_size
    args.logger.info("Batch size: {}".format(batch_size))
    trainset, _ = read_dataset(args, args.train_path)
    instances_num = len(trainset)

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info(
        "The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss = 0.0
    result = 0.0
    best_result = 0.0

    args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(trainset)
        src = torch.LongTensor([sample[0] for sample in trainset])
        seg = torch.LongTensor([sample[1] for sample in trainset])
        start_position = torch.LongTensor([sample[2] for sample in trainset])
        end_position = torch.LongTensor([sample[3] for sample in trainset])

        model.train()

        for i, (src_batch, seg_batch, start_position_batch,
                end_position_batch) in enumerate(
                    batch_loader(batch_size, src, seg, start_position,
                                 end_position)):
            loss = train(args, model, optimizer, scheduler, src_batch,
                         seg_batch, start_position_batch, end_position_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                args.logger.info(
                    "Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                    format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        result = evaluate(args, *read_dataset(args, args.dev_path))
        if result > best_result:
            best_result = result
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        args.logger.info("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            args.model.module.load_state_dict(
                torch.load(args.output_model_path))
        else:
            args.model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, *read_dataset(args, args.test_path))
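
Example #2 registers `--doc_stride` for splitting long passages into overlapping chunks. A minimal sketch of that sliding-window idea; `chunk_document` is a hypothetical helper for illustration, not the repository's `read_dataset`:

def chunk_document(tokens, max_len=384, doc_stride=128):
    # Take windows of max_len tokens, advancing doc_stride tokens
    # per step so consecutive chunks overlap.
    chunks = []
    start = 0
    while start < len(tokens):
        chunks.append(tokens[start: start + max_len])
        if start + max_len >= len(tokens):
            break
        start += doc_stride
    return chunks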
Example #3
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--dataset_path_list",
                        default=[],
                        nargs='+',
                        type=str,
                        help="Dataset path list.")
    parser.add_argument("--output_model_path",
                        default="models/multitask_classifier_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--config_path",
                        default="models/bert/base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    model_opts(parser)
    parser.add_argument("--pooling",
                        choices=["mean", "max", "first", "last"],
                        default="first",
                        help="Pooling type.")

    # Tokenizer options.
    tokenizer_opts(parser)

    # Optimizer options.
    optimization_opts(parser)

    # Training options.
    training_opts(parser)

    adv_opts(parser)

    args = parser.parse_args()

    args.soft_targets = False

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Count the number of labels.
    args.labels_num_list = [
        count_labels_num(os.path.join(path, "train.tsv"))
        for path in args.dataset_path_list
    ]

    args.datasets_num = len(args.dataset_path_list)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build multi-task classification model.
    model = MultitaskClassifier(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    # Get logger.
    args.logger = init_logger(args)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)
    args.model = model

    if args.use_adv:
        args.adv_method = str2adv[args.adv_type](model)

    # Training phase.
    dataset_list = [
        read_dataset(args, os.path.join(path, "train.tsv"))
        for path in args.dataset_path_list
    ]
    packed_dataset_list = [
        pack_dataset(dataset, i, args.batch_size)
        for i, dataset in enumerate(dataset_list)
    ]

    packed_dataset_all = []
    for packed_dataset in packed_dataset_list:
        packed_dataset_all += packed_dataset

    instances_num = sum([len(dataset) for dataset in dataset_list])
    batch_size = args.batch_size

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info("Batch size: {}".format(batch_size))
    args.logger.info(
        "The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    total_loss, result, best_result = 0.0, 0.0, 0.0

    args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(packed_dataset_all)

        model.train()
        for i, (dataset_id, src_batch, tgt_batch,
                seg_batch) in enumerate(packed_dataset_all):
            if hasattr(model, "module"):
                model.module.change_dataset(dataset_id)
            else:
                model.change_dataset(dataset_id)
            loss = train_model(args, model, optimizer, scheduler, src_batch,
                               tgt_batch, seg_batch, None)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                args.logger.info(
                    "Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                    format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        for dataset_id, path in enumerate(args.dataset_path_list):
            args.labels_num = args.labels_num_list[dataset_id]
            if hasattr(model, "module"):
                model.module.change_dataset(dataset_id)
            else:
                model.change_dataset(dataset_id)
            result = evaluate(
                args, read_dataset(args, os.path.join(path, "dev.tsv")))

    save_model(model, args.output_model_path)
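
The multi-task loop iterates over `packed_dataset_all`, a flat list of `(dataset_id, src_batch, tgt_batch, seg_batch)` tuples. A minimal sketch of how `pack_dataset` could build those tuples, assuming each instance is an `(src, tgt, seg)` triple; the real UER-py helper may differ:

import torch

def pack_dataset(dataset, dataset_id, batch_size):
    # Batch one task's instances and tag every batch with the task id,
    # so batches from different tasks can be shuffled together.
    src = torch.LongTensor([sample[0] for sample in dataset])
    tgt = torch.LongTensor([sample[1] for sample in dataset])
    seg = torch.LongTensor([sample[2] for sample in dataset])
    packed = []
    for i in range(0, len(dataset), batch_size):
        packed.append((dataset_id,
                       src[i: i + batch_size],
                       tgt[i: i + batch_size],
                       seg[i: i + batch_size]))
    return packed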
Example #4
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    tokenizer_opts(parser)

    parser.add_argument("--temperature", type=float, default=0.05)
    parser.add_argument("--eval_steps",
                        type=int,
                        default=200,
                        help="Evaluate frequency.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build the SimCSE model.
    model = SimCSE(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    # Get logger.
    args.logger = init_logger(args)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_path)
    instances_num = len(trainset)
    batch_size = args.batch_size

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info("Batch size: {}".format(batch_size))
    args.logger.info(
        "The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss, result, best_result = 0.0, 0.0, 0.0

    args.logger.info("Start training.")
    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(trainset)
        src_a = torch.LongTensor([example[0][0] for example in trainset])
        src_b = torch.LongTensor([example[0][1] for example in trainset])
        tgt = torch.FloatTensor([example[1] for example in trainset])
        seg_a = torch.LongTensor([example[2][0] for example in trainset])
        seg_b = torch.LongTensor([example[2][1] for example in trainset])

        model.train()
        for i, (src_batch, tgt_batch, seg_batch) in enumerate(
                batch_loader(batch_size, (src_a, src_b), tgt, (seg_a, seg_b))):
            model.zero_grad()

            src_a_batch, src_b_batch = src_batch
            seg_a_batch, seg_b_batch = seg_batch

            src_a_batch = src_a_batch.to(args.device)
            src_b_batch = src_b_batch.to(args.device)

            seg_a_batch = seg_a_batch.to(args.device)
            seg_b_batch = seg_b_batch.to(args.device)

            features_0, features_1 = model((src_a_batch, src_b_batch),
                                           (seg_a_batch, seg_b_batch))

            similarity_matrix = similarity(features_0, features_1,
                                           args.temperature)
            tgt_batch = torch.arange(similarity_matrix.size(0),
                                     device=similarity_matrix.device,
                                     dtype=torch.long)
            loss = nn.CrossEntropyLoss()(similarity_matrix, tgt_batch)

            if args.fp16:
                with args.amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                args.logger.info(
                    "Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                    format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

            if (i + 1) % args.eval_steps == 0 or (i + 1) == math.ceil(
                    instances_num / batch_size):
                result = evaluate(args, read_dataset(args, args.dev_path))
                args.logger.info(
                    "Epoch id: {}, Training steps: {}, Evaluate result: {}, Best result: {}"
                    .format(epoch, i + 1, result, best_result))
                if result > best_result:
                    best_result = result
                    save_model(model, args.output_model_path)
                    args.logger.info(
                        "It is the best model until now. Save it to {}".format(
                            args.output_model_path))
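
Example #4 builds an in-batch contrastive target: the `similarity` matrix is compared against `torch.arange`, so the diagonal entries (matching sentence pairs) are the positives. A minimal sketch of such a function, assuming temperature-scaled cosine similarity as in SimCSE; the repository's own implementation may differ:

import torch
import torch.nn as nn

def similarity(x, y, temperature):
    # Pairwise cosine similarity between rows of x and y,
    # scaled by the temperature; result is (batch_size, batch_size).
    x = nn.functional.normalize(x, dim=-1)
    y = nn.functional.normalize(y, dim=-1)
    return torch.matmul(x, y.transpose(0, 1)) / temperature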
Example #5
def worker(proc_id, gpu_ranks, args, model):
    """
    Args:
        proc_id: The GPU id in single-GPU mode;
                 the process (and GPU) id in multiprocessing distributed mode.
        gpu_ranks: List of ranks of each process.
    """
    set_seed(args.seed)

    # Get logger
    args.logger = init_logger(args)

    if args.deepspeed:
        import deepspeed
        deepspeed.init_distributed(dist_backend=args.backend)
        rank = dist.get_rank()
        gpu_id = proc_id
    elif args.dist_train:
        rank = gpu_ranks[proc_id]
        gpu_id = proc_id
    elif args.single_gpu:
        rank = None
        gpu_id = proc_id
    else:
        rank = None
        gpu_id = None

    if args.dist_train:
        train_loader = str2dataloader[args.data_processor](
            args, args.dataset_path, args.batch_size, rank, args.world_size,
            True)
    else:
        train_loader = str2dataloader[args.data_processor](args,
                                                           args.dataset_path,
                                                           args.batch_size, 0,
                                                           1, True)

    # Build optimizer.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "gamma", "beta"]
    optimizer_grouped_parameters = [{
        "params":
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay":
        0.01
    }, {
        "params":
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay":
        0.0
    }]

    if args.optimizer in ["adamw"]:
        custom_optimizer = str2optimizer[args.optimizer](
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            correct_bias=False)
    else:
        custom_optimizer = str2optimizer[args.optimizer](
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            scale_parameter=False,
            relative_step=False)
    if args.scheduler in ["constant"]:
        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer)
    elif args.scheduler in ["constant_with_warmup"]:
        custom_scheduler = str2scheduler[args.scheduler](
            custom_optimizer, args.total_steps * args.warmup)
    else:
        custom_scheduler = str2scheduler[args.scheduler](
            custom_optimizer, args.total_steps * args.warmup, args.total_steps)

    if args.deepspeed:
        model, optimizer, _, scheduler = deepspeed.initialize(
            model=model,
            model_parameters=optimizer_grouped_parameters,
            args=args,
            optimizer=custom_optimizer,
            lr_scheduler=custom_scheduler,
            mpu=None,
            dist_init_required=False)
    else:
        if gpu_id is not None:
            model.cuda(gpu_id)
        optimizer = custom_optimizer
        scheduler = custom_scheduler
        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)
            args.amp = amp

        if args.dist_train:
            # Initialize multiprocessing distributed training environment.
            dist.init_process_group(backend=args.backend,
                                    init_method=args.master_ip,
                                    world_size=args.world_size,
                                    rank=rank)
            model = DistributedDataParallel(model,
                                            device_ids=[gpu_id],
                                            find_unused_parameters=True)
            args.logger.info("Worker %d is training ... " % rank)
        else:
            args.logger.info("Worker is training ...")

    trainer = str2trainer[args.data_processor](args)
    trainer.train(args, gpu_id, rank, train_loader, model, optimizer,
                  scheduler)
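
`worker` above is written to be started once per process. A minimal launch sketch using `torch.multiprocessing.spawn` for the distributed case, assuming `args.ranks_num` local processes and a `model` built beforehand; `launch` is a hypothetical wrapper, not part of the script:

import torch.multiprocessing as mp

def launch(args, model):
    if args.dist_train:
        # Start one process per local GPU; each process receives its
        # index as proc_id and looks up its global rank in gpu_ranks.
        mp.spawn(worker, nprocs=args.ranks_num,
                 args=(args.gpu_ranks, args, model))
    elif args.single_gpu:
        worker(args.gpu_id, None, args, model)
    else:
        worker(None, None, args, model)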
Example #6
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--label2id_path",
                        type=str,
                        required=True,
                        help="Path of the label2id file.")
    parser.add_argument(
        "--crf_target",
        action="store_true",
        help="Use CRF loss as the target function or not, default False.")

    args = parser.parse_args()

    # Load the hyperparameters of the config file.
    args = load_hyperparam(args)

    # Get logger.
    args.logger = init_logger(args)

    set_seed(args.seed)

    args.begin_ids = []

    with open(args.label2id_path, mode="r", encoding="utf-8") as f:
        l2i = json.load(f)
        args.logger.info("Labels: ", l2i)
        l2i["[PAD]"] = len(l2i)
        for label in l2i:
            if label.startswith("B"):
                args.begin_ids.append(l2i[label])

    args.l2i = l2i

    args.labels_num = len(l2i)

    args.tokenizer = SpaceTokenizer(args)

    # Build sequence labeling model.
    model = NerTagger(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    instances = read_dataset(args, args.train_path)
    instances_num = len(instances)
    batch_size = args.batch_size
    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info("Batch size: {}".format(batch_size))
    args.logger.info(
        "The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss, f1, best_f1 = 0.0, 0.0, 0.0

    args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(instances)
        src = torch.LongTensor([ins[0] for ins in instances])
        tgt = torch.LongTensor([ins[1] for ins in instances])
        seg = torch.LongTensor([ins[2] for ins in instances])

        model.train()
        for i, (src_batch, tgt_batch,
                seg_batch) in enumerate(batch_loader(batch_size, src, tgt,
                                                     seg)):
            loss = train(args, model, optimizer, scheduler, src_batch,
                         tgt_batch, seg_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                args.logger.info(
                    "Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                    format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        f1 = evaluate(args, read_dataset(args, args.dev_path))
        if f1 > best_f1:
            best_f1 = f1
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        args.logger.info("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            args.model.module.load_state_dict(
                torch.load(args.output_model_path))
        else:
            args.model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, read_dataset(args, args.test_path))
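
Example #6 derives `begin_ids` from labels that start with "B". A short illustration with a hypothetical label2id file for a BIO-tagged NER task (the file content below is made up for the example):

import json

# Hypothetical content of the --label2id_path file.
l2i = json.loads('{"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4}')
l2i["[PAD]"] = len(l2i)

# Ids of labels that open an entity span ("B-" labels).
begin_ids = [l2i[label] for label in l2i if label.startswith("B")]
print(begin_ids)  # [1, 3]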
Example #7
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--dataset_path",
                        type=str,
                        default="dataset.pt",
                        help="Path of the preprocessed dataset.")
    parser.add_argument("--pretrained_model_path",
                        type=str,
                        default=None,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        type=str,
                        required=True,
                        help="Path of the output model.")
    parser.add_argument("--config_path",
                        type=str,
                        default="models/bert/base_config.json",
                        help="Config file of model hyper-parameters.")

    # Training and saving options.
    parser.add_argument("--total_steps",
                        type=int,
                        default=100000,
                        help="Total training steps.")
    parser.add_argument("--save_checkpoint_steps",
                        type=int,
                        default=10000,
                        help="Specific steps to save model checkpoint.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--accumulation_steps",
                        type=int,
                        default=1,
                        help="Specific steps to accumulate gradient.")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help=
        "Training batch size. The actual batch_size is [batch_size x world_size x accumulation_steps]."
    )
    parser.add_argument("--instances_buffer_size",
                        type=int,
                        default=25600,
                        help="The buffer size of instances in memory.")
    parser.add_argument("--labels_num",
                        type=int,
                        required=False,
                        help="Number of prediction labels.")
    parser.add_argument("--dropout",
                        type=float,
                        default=0.1,
                        help="Dropout value.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    # Preprocess options.
    tokenizer_opts(parser)
    tgt_tokenizer_opts(parser)

    # Model options.
    model_opts(parser)
    parser.add_argument(
        "--tgt_embedding",
        choices=["word", "word_pos", "word_pos_seg", "word_sinusoidalpos"],
        default="word_pos_seg",
        help="Target embedding type.")
    parser.add_argument("--decoder",
                        choices=["transformer"],
                        default="transformer",
                        help="Decoder type.")
    parser.add_argument("--pooling",
                        choices=["mean", "max", "first", "last"],
                        default="first",
                        help="Pooling type.")
    parser.add_argument("--target",
                        choices=[
                            "bert", "lm", "mlm", "bilm", "albert", "seq2seq",
                            "t5", "cls", "prefixlm", "gsg", "bart"
                        ],
                        default="bert",
                        help="The training target of the pretraining model.")
    parser.add_argument("--tie_weights",
                        action="store_true",
                        help="Tie the word embedding and softmax weights.")
    parser.add_argument("--has_lmtarget_bias",
                        action="store_true",
                        help="Add bias on output_layer for lm target.")
    parser.add_argument(
        "--deep_init",
        action="store_true",
        help="Scaling initialization of projection layers by a "
        "factor of 1/sqrt(2N). Necessary to large models.")

    # Masking options.
    parser.add_argument("--whole_word_masking",
                        action="store_true",
                        help="Whole word masking.")
    parser.add_argument("--span_masking",
                        action="store_true",
                        help="Span masking.")
    parser.add_argument(
        "--span_geo_prob",
        type=float,
        default=0.2,
        help="Hyperparameter of geometric distribution for span masking.")
    parser.add_argument("--span_max_length",
                        type=int,
                        default=10,
                        help="Max length for span masking.")

    # Optimizer options.
    optimization_opts(parser)

    # GPU options.
    parser.add_argument("--world_size",
                        type=int,
                        default=1,
                        help="Total number of processes (GPUs) for training.")
    parser.add_argument(
        "--gpu_ranks",
        default=[],
        nargs='+',
        type=int,
        help="List of ranks of each process."
        " Each process has a unique integer rank whose value is in the interval [0, world_size), and runs in a single GPU."
    )
    parser.add_argument("--master_ip",
                        default="tcp://localhost:12345",
                        type=str,
                        help="IP-Port of master for training.")
    parser.add_argument("--backend",
                        choices=["nccl", "gloo"],
                        default="nccl",
                        type=str,
                        help="Distributed backend.")

    # Deepspeed options.
    deepspeed_opts(parser)

    # Log options.
    log_opts(parser)

    args = parser.parse_args()

    if args.target == "cls":
        assert args.labels_num is not None, "The cls target requires the number of labels (--labels_num)."

    # Load hyper-parameters from config file.
    if args.config_path:
        args = load_hyperparam(args)

    # Get logger
    args.logger = init_logger(args)

    ranks_num = len(args.gpu_ranks)

    if args.deepspeed:
        if args.world_size > 1:
            args.dist_train = True
        else:
            args.dist_train = False
    else:
        if args.world_size > 1:
            # Multiprocessing distributed mode.
            assert torch.cuda.is_available(), "No available GPUs."
            assert ranks_num <= args.world_size, "Started processes exceed `world_size` upper limit."
            assert ranks_num <= torch.cuda.device_count(
            ), "Started processes exceeds the available GPUs."
            args.dist_train = True
            args.ranks_num = ranks_num
            args.logger.info("Using distributed mode for training.")
        elif args.world_size == 1 and ranks_num == 1:
            # Single GPU mode.
            assert torch.cuda.is_available(), "No available GPUs."
            args.gpu_id = args.gpu_ranks[0]
            assert args.gpu_id < torch.cuda.device_count(
            ), "Invalid specified GPU device."
            args.dist_train = False
            args.single_gpu = True
            args.logger.info("Using GPU %d for training." % args.gpu_id)
        else:
            # CPU mode.
            assert ranks_num == 0, "GPUs are specified, please check the arguments."
            args.dist_train = False
            args.single_gpu = False
            args.logger.info("Using CPU mode for training.")

    trainer.train_and_validate(args)
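
The `--batch_size` help above notes that the effective batch size is batch_size x world_size x accumulation_steps. A short worked example of that arithmetic, with illustrative numbers:

batch_size = 32          # per-process micro-batch (--batch_size)
world_size = 8           # total training processes/GPUs (--world_size)
accumulation_steps = 4   # gradient accumulation (--accumulation_steps)

# Instances consumed per optimizer step across all processes.
effective_batch_size = batch_size * world_size * accumulation_steps
print(effective_batch_size)  # 1024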