Example #1
def main(corpus_dir,
         corpus_name,
         model_dir,
         trained_model_savedir,
         create_tokenizer=False,
         train_model_name='gpt2',
         train_spm=True,
         save_tokenized=False,
         dotraining=False,
         model_name=None,
         resume=False,
         vocab_name='vocab',
         resume_iters=0,
         spm_vocab_size=2000,
         spm_max_sentence_length=4098,
         spm_model_name='spm_id',
         block_size=512,
         spm_model_type='unigram',
         train_batch_size=1,
         num_epoch=1000,
         fp16=False,
         do_lower_case=True,
         trained_tensor_name='bert_traintensor_wikiall',
         tensor_from_pretrained=False):
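    # Pretrain a BERT model with the masked-LM objective on a SentencePiece-tokenized
    # corpus. The training tensor is either built from the raw corpus or, when
    # tensor_from_pretrained is True, loaded from a previously saved tensor file.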
    ###################################################################################
    # set torch device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device("cpu")
    n_gpu = torch.cuda.device_count()

    set_seed(seed=1332, n_gpu=n_gpu)

    max_grad_norm = 1.0
    gradient_accumulation_steps = 5
    warmup_steps = 30

    tr_loss, logging_loss = 0.0, 0.0

    logging_steps = 50
    max_steps = 1000

    mlm_probability = 0.15
    local_rank = -1

    if not tensor_from_pretrained:
        ## create cache of training dataset
        train_dataset, tokenizer = bertDataProcessing(
            corpus_dir,
            corpus_name,
            model_dir,
            "{}.model".format(spm_model_name),
            "{}.vocab".format(spm_model_name),
            do_lower_case=True,
            save_filename='bert_traintensor_wikiall',
            save_directory="./")
    else:
        train_dataset, tokenizer = bertDataLoader(
            corpus_dir,
            tokenizer_dir=model_dir,
            trained_tensor_name=trained_tensor_name,
            spm_model_name="{}.model".format(spm_model_name),
            spm_vocab_name="{}.vocab".format(spm_model_name),
            do_lower_case=do_lower_case)

    if dotraining:
        dataset = train_dataset
        print("Loading train_dataset done...")

        if max_steps > 0:
            t_total = max_steps
        else:
            t_total = len(dataset) // gradient_accumulation_steps * num_epoch
        print("t_total: {}".format(t_total))

        config = BertConfig(vocab_size_or_config_json_file=spm_vocab_size)
        model = BertModel(config)
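        # vocab_size_or_config_json_file is the constructor argument of the older
        # pytorch-transformers releases this example appears to target.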

        ## resume iters:
        if resume:
            model = restoreModel(model,
                                 resume_iters=resume_iters,
                                 model_name=model_name,
                                 model_save_dir=model_dir +
                                 trained_model_savedir,
                                 base_model_prefix='bert')

        model.to(device)

        num_params = 0
        for p in model.parameters():
            num_params += p.numel()
        print(model)
        print("The number of model_parameters: {}".format(num_params))

        weight_decay = 0.1
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=0.00025, eps=1e-8)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=warmup_steps,
                                         t_total=t_total)

        doTraining(model,
                   config,
                   train_dataset,
                   tokenizer,
                   optimizer,
                   scheduler,
                   tr_loss,
                   logging_loss,
                   gradient_accumulation_steps,
                   mlm_probability,
                   device,
                   local_rank,
                   train_batch_size,
                   n_gpu=n_gpu,
                   num_epoch=num_epoch,
                   start_iters=resume_iters,
                   max_grad_norm=max_grad_norm,
                   fp16=fp16,
                   logging_steps=logging_steps,
                   save_dir=model_dir + trained_model_savedir,
                   train_model_name=train_model_name)
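
A hypothetical driver for this example might look like the following; the corpus, model directory, and file names are placeholders, and only a subset of the keyword arguments from the signature above is overridden.

if __name__ == "__main__":
    # Hypothetical invocation of the main() above; all paths and names are placeholders.
    main("./data/",                  # corpus_dir
         "wiki_corpus.txt",          # corpus_name
         "./models/",                # model_dir
         "bert_pretrain/",           # trained_model_savedir
         dotraining=True,
         spm_model_name="spm_id",
         spm_vocab_size=2000,
         train_batch_size=1,
         num_epoch=1000,
         tensor_from_pretrained=False)
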
Example #2
def main(corpus_dir,
         corpus_name,
         model_dir,
         trained_model_savedir,
         create_tokenizer=False,
         train_model_name='gpt2',
         train_spm=True,
         save_tokenized=False,
         dotraining=False,
         model_name=None,
         resume=False,
         vocab_name='vocab',
         resume_iters=0,
         spm_vocab_size=2000,
         spm_max_sentence_length=4098,
         spm_model_name='spm_id',
         block_size=512,
         spm_model_type='unigram',
         is_finetune=False,
         from_pretrained=False,
         train_batch_size=1,
         num_epoch=1000,
         fp16=False):
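    # Pretrain a GPT-2 language-model head on a corpus tokenized with the project's
    # TokenizerId/SentencePiece tokenizer; the tokenizer is either created from the
    # corpus (create_tokenizer=True) or loaded from model_dir.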
    ###################################################################################
    # set torch device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device("cpu")
    n_gpu = torch.cuda.device_count()

    set_seed(seed=1332, n_gpu=n_gpu)

    max_grad_norm = 1.0
    gradient_accumulation_steps = 50
    warmup_steps = 500

    tr_loss, logging_loss = 0.0, 0.0

    logging_steps = 50
    max_steps = -1

    mlm_probability = 0.15
    local_rank = -1

    ## loading tokenizer
    tokenizer = TokenizerId(spm_vocab_size=spm_vocab_size)

    ## prepare dataset
    _dataset = corpus_dir + corpus_name
    if create_tokenizer:
        data_list = ['<unk>', '<sep>', '<cls>']
        with open(_dataset, encoding="utf-8") as fp:
            # collect every line of the corpus as vocabulary training data
            for line in fp:
                data_list.append(line)
        tokenizer.createVocab(data_list,
                              spm_text_file=_dataset,
                              data_dir=model_dir,
                              train_spm=train_spm,
                              spm_max_sentence_length=spm_max_sentence_length,
                              spm_model_name=spm_model_name,
                              spm_model_type=spm_model_type)
    else:
        tokenizer.from_pretrained(
            model_dir,
            use_spm=train_spm,
            spm_model_name=spm_model_name,
            spm_max_sentence_length=spm_max_sentence_length,
            std_vocab_name=vocab_name)
    print("tokenizer.vocab_size: {}".format(tokenizer.vocab_size))

    ## saving tokenized object for consistent use
    if save_tokenized:
        tokenizer.save_pretrained(model_dir, vocab_name=vocab_name)

    ## create cache of training dataset
    train_dataset = loadAndCacheExamples(_dataset,
                                         block_size,
                                         tokenizer,
                                         evaluate=False,
                                         use_spm=train_spm)

    if dotraining:
        dataset = train_dataset
        print("Loading train_dataset done...")

        if max_steps > 0:
            t_total = max_steps
        else:
            t_total = len(dataset) // gradient_accumulation_steps * num_epoch
        print("t_total: {}".format(t_total))

        ## Prepare model and training
        models = [
            (GPT2LMHeadModel, GPT2Config),
            #(XLNetModel, XLNetConfig)
            (XLNetLMHeadModel, XLNetConfig)
        ]
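        # only models[0] (the GPT-2 config/model pair) is instantiated below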
        config = models[0][1](
            vocab_size_or_config_json_file=tokenizer.vocab_size)
        model = models[0][0](config)

        ## resume iters:
        if resume:
            model = restoreModel(model,
                                 resume_iters=resume_iters,
                                 model_name=model_name,
                                 model_save_dir=model_dir +
                                 trained_model_savedir,
                                 is_finetune=is_finetune,
                                 from_pretrained=from_pretrained)

        model.to(device)

        num_params = 0
        for p in model.parameters():
            num_params += p.numel()
        print(model)
        print("The number of model_parameters: {}".format(num_params))

        weight_decay = 0.1
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=0.00025, eps=1e-8)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=warmup_steps,
                                         t_total=t_total)

        doTraining(model,
                   train_dataset,
                   tokenizer,
                   optimizer,
                   scheduler,
                   tr_loss,
                   logging_loss,
                   gradient_accumulation_steps,
                   mlm_probability,
                   device,
                   local_rank,
                   train_batch_size,
                   num_epoch=num_epoch,
                   start_iters=resume_iters,
                   max_grad_norm=max_grad_norm,
                   fp16=fp16,
                   logging_steps=logging_steps,
                   save_dir=model_dir + trained_model_savedir,
                   train_model_name=train_model_name)
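
Both examples above build the optimizer from the same two parameter groups: ordinary weights receive weight decay, while biases and LayerNorm weights are excluded. A minimal, standalone sketch of that grouping, using a tiny stand-in module instead of the actual model (and torch.optim.AdamW standing in for the AdamW class used above):

import torch
from torch import nn

# Tiny stand-in module whose attribute names mirror the BERT convention, so the
# 'LayerNorm.weight' / 'bias' substring checks behave as they do in the examples.
class TinyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(8, 8)
        self.LayerNorm = nn.LayerNorm(8)

model = TinyBlock()

no_decay = ['bias', 'LayerNorm.weight']   # substrings matched against parameter names
optimizer_grouped_parameters = [
    {   # regular weights: decayed
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.1,
    },
    {   # biases and LayerNorm weights: no decay
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
    },
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2.5e-4, eps=1e-8)
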
Example #3

def main(corpus_dir,
         corpus_name,
         model_dir,
         trained_model_savedir,
         create_tokenizer=False,
         train_model_name='gpt2',
         train_spm=True,
         save_tokenized=False,
         dotraining=False,
         model_name=None,
         resume=False,
         vocab_name='vocab',
         resume_iters=0,
         spm_vocab_size=2000,
         spm_max_sentence_length=4098,
         spm_model_name='spm_id',
         block_size=512,
         spm_model_type='unigram',
         train_batch_size=1,
         num_epoch=1000,
         fp16=False):
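    # Same pretraining flow as Example #2, but wrapped in an argparse CLI and using
    # the bare GPT2Model configured to also return per-layer hidden states.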
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--corpus_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The directory where the corpus located.")
    parser.add_argument("--corpus_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The input training data file (a text file).")
    parser.add_argument(
        "--model_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--trained_model_savedir",
        default=None,
        type=str,
        required=True,
        help=
        "The subdirectory under --model_dir where trained model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--create_tokenizer",
        action='store_true',
        help=
        "Whether to create the tokenizer object. Defaults to False, which assumes you already have a trained tokenizer model."
    )
    parser.add_argument(
        "--save_tokenized",
        action='store_true',
        help=
        "If --create_tokenizer is set, whether to save the resulting tokenizer model. Defaults to False."
    )
    parser.add_argument(
        "--train_spm",
        action='store_true',
        help=
        "Whether to train the SentencePiece model. Defaults to False, which assumes you already have a trained SentencePiece model."
    )

    parser.add_argument("--dotraining",
                        action='store_true',
                        help="Whether to execute the training gpt2 or not")
    parser.add_argument(
        "--resume",
        action='store_true',
        help="Whether to resume training from selected epoch or not.")
    parser.add_argument(
        "--resume_iters",
        default=0,
        type=int,
        help="Specify what is choosen starting epoch for --resume training.")

    parser.add_argument("--train_model_name",
                        default="gpt2",
                        type=str,
                        help="Name of trained model when saving/dump.")
    parser.add_argument("--model_name",
                        default=None,
                        type=str,
                        help="Specify model name to use on --resume training")
    parser.add_argument(
        "--vocab_name",
        default="vocab",
        type=str,
        help="Specify vocab model name from process --create_tokenizer")

    parser.add_argument("--spm_vocab_size",
                        default=2000,
                        type=int,
                        help="Specify sentencepiece vocab size")
    parser.add_argument(
        "--spm_max_sentence_length",
        default=4098,
        type=int,
        help=
        "Specify sentencepiece max_sentence_length that use for training. Only used when firttime training the spm model"
    )
    parser.add_argument("--spm_model_name",
                        default="spm_id",
                        type=str,
                        help="Specify spm model name for saving ")
    parser.add_argument(
        "--spm_model_type",
        default="unigram",
        type=str,
        help=
        "specify method that u want to use as sub-word method in training spm object. Available is: unigram, word and byte-pair-encoding(bpe)"
    )

    parser.add_argument(
        "--block_size",
        default=512,
        type=int,
        help="Specify the block_size for the GPT-2 network configuration")
    parser.add_argument("--train_batch_size",
                        default=2,
                        type=int,
                        help="Batch size used during pre-training")
    parser.add_argument("--num_epoch",
                        default=1000,
                        type=int,
                        help="Number of epochs to iterate over during training")
    parser.add_argument("--fp16",
                        action='store_true',
                        help="Whether to use apex (fp16) for training or not")
    args = parser.parse_args()
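    # NOTE: the parsed args namespace is not referenced below; the function
    # parameters above are what actually drive this run.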

    ###################################################################################
    # set torch device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device("cpu")
    n_gpu = torch.cuda.device_count()

    set_seed(seed=1332, n_gpu=n_gpu)

    max_grad_norm = 1.0
    gradient_accumulation_steps = 50
    warmup_steps = 500

    tr_loss, logging_loss = 0.0, 0.0

    logging_steps = 50
    max_steps = 1000

    mlm_probability = 0.15
    local_rank = -1

    ## loading tokenizer
    tokenizer = TokenizerId(spm_vocab_size=spm_vocab_size)

    ## prepare dataset
    _dataset = corpus_dir + corpus_name
    if create_tokenizer:
        data_list = ['<unk>', '<sep>', '<cls>', '<mask>']
        with open(_dataset, encoding="utf-8") as fp:
            # collect every line of the corpus as vocabulary training data
            for line in fp:
                data_list.append(line)
        tokenizer.createVocab(data_list,
                              spm_text_file=_dataset,
                              data_dir=model_dir,
                              train_spm=train_spm,
                              spm_max_sentence_length=spm_max_sentence_length,
                              spm_model_name=spm_model_name,
                              spm_model_type=spm_model_type)
    else:
        tokenizer.from_pretrained(
            model_dir,
            use_spm=train_spm,
            spm_model_name=spm_model_name,
            spm_max_sentence_length=spm_max_sentence_length,
            std_vocab_name=vocab_name)
    print("tokenizer.vocab_size: {}".format(tokenizer.vocab_size))

    ## saving tokenized object for consistent use
    if save_tokenized:
        tokenizer.save_pretrained(model_dir, vocab_name=vocab_name)

    ## create cache of training dataset
    train_dataset = loadAndCacheExamples(_dataset,
                                         block_size,
                                         tokenizer,
                                         evaluate=False,
                                         use_spm=train_spm)

    if dotraining:
        dataset = train_dataset
        print("Loading train_dataset done...")

        if max_steps > 0:
            t_total = max_steps
        else:
            t_total = len(dataset) // gradient_accumulation_steps * num_epoch
        print("t_total: {}".format(t_total))

        config = GPT2Config(
            vocab_size_or_config_json_file=tokenizer.vocab_size)

        # return hidden states from every layer in the model outputs
        config.output_hidden_states = True

        model = GPT2Model(config)

        ## resume iters:
        if resume:
            model = restoreModel(model,
                                 resume_iters=resume_iters,
                                 model_name=model_name,
                                 model_save_dir=model_dir +
                                 trained_model_savedir,
                                 base_model_prefix='gpt2')

        model.to(device)

        num_params = 0
        for p in model.parameters():
            num_params += p.numel()
        print(model)
        print("The number of model_parameters: {}".format(num_params))

        weight_decay = 0.1
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=0.00025, eps=1e-8)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=warmup_steps,
                                         t_total=t_total)

        doTraining(model,
                   config,
                   train_dataset,
                   tokenizer,
                   optimizer,
                   scheduler,
                   tr_loss,
                   logging_loss,
                   gradient_accumulation_steps,
                   mlm_probability,
                   device,
                   local_rank,
                   train_batch_size,
                   n_gpu=n_gpu,
                   num_epoch=num_epoch,
                   start_iters=resume_iters,
                   max_grad_norm=max_grad_norm,
                   fp16=fp16,
                   logging_steps=logging_steps,
                   save_dir=model_dir + trained_model_savedir,
                   train_model_name=train_model_name)
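
The t_total handed to WarmupLinearSchedule is either the hard max_steps cap or the number of optimizer updates implied by the dataset size, the gradient-accumulation factor, and the epoch count. A small sketch of that arithmetic, with an assumed dataset length of 100,000 examples:

def total_training_steps(dataset_len, gradient_accumulation_steps, num_epoch, max_steps=-1):
    # Mirrors the t_total computation in the examples above: a positive max_steps
    # acts as a hard cap, otherwise the count follows from the dataset length,
    # the accumulation factor, and the number of epochs.
    if max_steps > 0:
        return max_steps
    return dataset_len // gradient_accumulation_steps * num_epoch

print(total_training_steps(100_000, 50, 1000))        # 2000000
print(total_training_steps(100_000, 50, 1000, 1000))  # capped at 1000
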
                        num_labels=num_labels,
                        hidden_size=600,
                        num_attention_heads=12,
                        intermediate_size=2048)

    #model = Ner(config)
    model = BertForTokenClassification(config)
    if eval_only:
        model_name = model_eval_name
        from_pretrained = False
        print("Loading evaluation only with model name: {}".format(model_name))

    model = restoreModel(model,
                         resume_iters=start_iters,
                         model_name=model_name,
                         model_save_dir=model_dir + trained_model_savedir,
                         base_model_prefix='bert',
                         from_pretrained=from_pretrained,
                         is_finetune=is_finetune)

    model.to(device)

    num_params = 0
    for p in model.parameters():
        num_params += p.numel()
    print(model)
    print("The number of model_parameters: {}".format(num_params))

    weight_decay = 0.1
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{