def main(corpus_dir, corpus_name, model_dir, trained_model_savedir, create_tokenizer=False,
         train_model_name='gpt2', train_spm=True, save_tokenized=False, dotraining=False,
         model_name=None, resume=False, vocab_name='vocab', resume_iters=0,
         spm_vocab_size=2000, spm_max_sentence_length=4098, spm_model_name='spm_id',
         block_size=512, spm_model_type='unigram', train_batch_size=1, num_epoch=1000,
         fp16=False, do_lower_case=True, trained_tensor_name='bert_traintensor_wikiall',
         tensor_from_pretrained=False):
    ###################################################################################
    # set torch device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device("cpu")
    n_gpu = torch.cuda.device_count()

    set_seed(seed=1332, n_gpu=n_gpu)

    # training hyper-parameters
    max_grad_norm = 1.0
    gradient_accumulation_steps = 5
    warmup_steps = 30
    tr_loss, logging_loss = 0.0, 0.0
    logging_steps = 50
    max_steps = 1000
    mlm_probability = 0.15
    local_rank = -1

    if not tensor_from_pretrained:
        ## create cache of training dataset
        train_dataset, tokenizer = bertDataProcessing(
            corpus_dir, corpus_name, model_dir,
            "{}.model".format(spm_model_name),
            "{}.vocab".format(spm_model_name),
            do_lower_case=do_lower_case,
            save_filename=trained_tensor_name,
            save_directory="./")
    else:
        ## load previously cached training tensor and tokenizer
        train_dataset, tokenizer = bertDataLoader(
            corpus_dir,
            tokenizer_dir=model_dir,
            trained_tensor_name=trained_tensor_name,
            spm_model_name="{}.model".format(spm_model_name),
            spm_vocab_name="{}.vocab".format(spm_model_name),
            do_lower_case=do_lower_case)

    if dotraining:
        dataset = train_dataset
        print("Loading train_dataset done...")

        if max_steps > 0:
            t_total = max_steps
        else:
            t_total = len(dataset) // gradient_accumulation_steps * num_epoch
        print("t_total: {}".format(t_total))

        config = BertConfig(vocab_size_or_config_json_file=spm_vocab_size)
        model = BertModel(config)

        ## resume training from a saved checkpoint
        if resume:
            model = restoreModel(model,
                                 resume_iters=resume_iters,
                                 model_name=model_name,
                                 model_save_dir=model_dir + trained_model_savedir,
                                 base_model_prefix='bert')

        model.to(device)

        num_params = sum(p.numel() for p in model.parameters())
        print(model)
        print("The number of model_parameters: {}".format(num_params))

        ## exclude bias and LayerNorm weights from weight decay
        weight_decay = 0.1
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': weight_decay},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=0.00025, eps=1e-8)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=warmup_steps,
                                         t_total=t_total)

        doTraining(model, config, train_dataset, tokenizer, optimizer, scheduler,
                   tr_loss, logging_loss, gradient_accumulation_steps,
                   mlm_probability, device, local_rank, train_batch_size,
                   n_gpu=n_gpu, num_epoch=num_epoch, start_iters=resume_iters,
                   max_grad_norm=max_grad_norm, fp16=fp16,
                   logging_steps=logging_steps,
                   save_dir=model_dir + trained_model_savedir,
                   train_model_name=train_model_name)
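# Example invocation of the BERT masked-LM pre-training entry point above. This is a
# minimal sketch kept as a comment so the module stays import-safe; the directory and
# file names are placeholders, not files shipped with this code:
#
#   main(corpus_dir='./corpus/', corpus_name='wiki_id.txt',
#        model_dir='./models/', trained_model_savedir='bert_checkpoints/',
#        spm_model_name='spm_id', spm_vocab_size=2000,
#        trained_tensor_name='bert_traintensor_wikiall', tensor_from_pretrained=False,
#        dotraining=True, train_batch_size=1, num_epoch=1000, fp16=False)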
def main(corpus_dir, corpus_name, model_dir, trained_model_savedir, create_tokenizer=False,
         train_model_name='gpt2', train_spm=True, save_tokenized=False, dotraining=False,
         model_name=None, resume=False, vocab_name='vocab', resume_iters=0,
         spm_vocab_size=2000, spm_max_sentence_length=4098, spm_model_name='spm_id',
         block_size=512, spm_model_type='unigram', is_finetune=False, from_pretrained=False,
         train_batch_size=1, num_epoch=1000, fp16=False):
    ###################################################################################
    # set torch device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device("cpu")
    n_gpu = torch.cuda.device_count()

    set_seed(seed=1332, n_gpu=n_gpu)

    # training hyper-parameters
    max_grad_norm = 1.0
    gradient_accumulation_steps = 50
    warmup_steps = 500
    tr_loss, logging_loss = 0.0, 0.0
    logging_steps = 50
    max_steps = -1
    mlm_probability = 0.15
    local_rank = -1

    ## loading tokenizer
    tokenizer = TokenizerId(spm_vocab_size=spm_vocab_size)

    ## prepare dataset
    _dataset = corpus_dir + corpus_name
    if create_tokenizer:
        data_list = ['<unk>', '<sep>', '<cls>']
        with open(_dataset, encoding="utf-8") as fp:
            line = fp.readline()
            while line:
                data_list.append(line)
                line = fp.readline()
        tokenizer.createVocab(data_list,
                              spm_text_file=_dataset,
                              data_dir=model_dir,
                              train_spm=train_spm,
                              spm_max_sentence_length=spm_max_sentence_length,
                              spm_model_name=spm_model_name,
                              spm_model_type=spm_model_type)
    else:
        tokenizer.from_pretrained(model_dir,
                                  use_spm=train_spm,
                                  spm_model_name=spm_model_name,
                                  spm_max_sentence_length=spm_max_sentence_length,
                                  std_vocab_name=vocab_name)
    print("tokenizer.vocab_size: {}".format(tokenizer.vocab_size))

    ## saving tokenized object for consistent use
    if save_tokenized:
        tokenizer.save_pretrained(model_dir, vocab_name=vocab_name)

    ## create cache of training dataset
    train_dataset = loadAndCacheExamples(_dataset,
                                         block_size,
                                         tokenizer,
                                         evaluate=False,
                                         use_spm=train_spm)

    if dotraining:
        dataset = train_dataset
        print("Loading train_dataset done...")

        if max_steps > 0:
            t_total = max_steps
        else:
            t_total = len(dataset) // gradient_accumulation_steps * num_epoch
        print("t_total: {}".format(t_total))

        ## Prepare model and training
        models = [
            (GPT2LMHeadModel, GPT2Config),
            #(XLNetModel, XLNetConfig)
            (XLNetLMHeadModel, XLNetConfig)
        ]
        config = models[0][1](vocab_size_or_config_json_file=tokenizer.vocab_size)
        model = models[0][0](config)

        ## resume training from a saved checkpoint
        if resume:
            model = restoreModel(model,
                                 resume_iters=resume_iters,
                                 model_name=model_name,
                                 model_save_dir=model_dir + trained_model_savedir,
                                 is_finetune=is_finetune,
                                 from_pretrained=from_pretrained)

        model.to(device)

        num_params = sum(p.numel() for p in model.parameters())
        print(model)
        print("The number of model_parameters: {}".format(num_params))

        ## exclude bias and LayerNorm weights from weight decay
        weight_decay = 0.1
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': weight_decay},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=0.00025, eps=1e-8)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=warmup_steps,
                                         t_total=t_total)

        doTraining(model, train_dataset, tokenizer, optimizer, scheduler,
                   tr_loss, logging_loss, gradient_accumulation_steps,
                   mlm_probability, device, local_rank, train_batch_size,
                   num_epoch=num_epoch, start_iters=resume_iters,
                   max_grad_norm=max_grad_norm, fp16=fp16,
                   logging_steps=logging_steps,
                   save_dir=model_dir + trained_model_savedir,
                   train_model_name=train_model_name)
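# Note: the `models` list in the function above also registers (XLNetLMHeadModel, XLNetConfig)
# at index 1. A minimal sketch of selecting that variant instead of GPT-2 (assuming doTraining
# and the tokenizer work unchanged for that architecture) would replace the two model-building
# lines with:
#
#   config = models[1][1](vocab_size_or_config_json_file=tokenizer.vocab_size)
#   model = models[1][0](config)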
def main(corpus_dir, corpus_name, model_dir, trained_model_savedir, create_tokenizer=False,
         train_model_name='gpt2', train_spm=True, save_tokenized=False, dotraining=False,
         model_name=None, resume=False, vocab_name='vocab', resume_iters=0,
         spm_vocab_size=2000, spm_max_sentence_length=4098, spm_model_name='spm_id',
         block_size=512, spm_model_type='unigram', train_batch_size=1, num_epoch=1000,
         fp16=False):
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--corpus_dir", default=None, type=str, required=True,
                        help="The directory where the corpus is located.")
    parser.add_argument("--corpus_name", default=None, type=str, required=True,
                        help="The input training data file (a text file).")
    parser.add_argument("--model_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--trained_model_savedir", default=None, type=str, required=True,
                        help="The subdirectory, appended to --model_dir, where trained model checkpoints will be saved.")

    ## Other parameters
    parser.add_argument("--create_tokenizer", action='store_true',
                        help="Whether to create the tokenizer object. Default is False, which assumes you already have a trained tokenizer model.")
    parser.add_argument("--save_tokenized", action='store_true',
                        help="If --create_tokenizer is set, whether to save the tokenizer model. Default is False.")
    parser.add_argument("--train_spm", action='store_true',
                        help="Whether to train the sentencepiece object. Default is False, which assumes you already have a trained sentencepiece model.")
    parser.add_argument("--dotraining", action='store_true',
                        help="Whether to execute GPT-2 training.")
    parser.add_argument("--resume", action='store_true',
                        help="Whether to resume training from a selected epoch.")
    parser.add_argument("--resume_iters", default=0, type=int,
                        help="Specify the starting epoch for --resume training.")
    parser.add_argument("--train_model_name", default="gpt2", type=str,
                        help="Name of the trained model when saving/dumping.")
    parser.add_argument("--model_name", default=None, type=str,
                        help="Specify the model name to use for --resume training.")
    parser.add_argument("--vocab_name", default="vocab", type=str,
                        help="Specify the vocab model name produced by --create_tokenizer.")
    parser.add_argument("--spm_vocab_size", default=2000, type=int,
                        help="Specify the sentencepiece vocab size.")
    parser.add_argument("--spm_max_sentence_length", default=4098, type=int,
                        help="Specify the sentencepiece max_sentence_length used for training. Only used the first time the spm model is trained.")
    parser.add_argument("--spm_model_name", default="spm_id", type=str,
                        help="Specify the spm model name used when saving.")
    parser.add_argument("--spm_model_type", default="unigram", type=str,
                        help="Specify the sub-word method used to train the spm object. Available: unigram, word, and byte-pair encoding (bpe).")
    parser.add_argument("--block_size", default=512, type=int,
                        help="Specify the block_size for the GPT-2 network configuration.")
    parser.add_argument("--train_batch_size", default=2, type=int,
                        help="Batch size used during pre-training.")
    parser.add_argument("--num_epoch", default=1000, type=int,
                        help="Number of epochs to iterate over for training.")
    parser.add_argument("--fp16", action='store_true',
                        help="Whether to use apex (fp16) for training.")

    args = parser.parse_args()

    ###################################################################################
    # set torch device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device("cpu")
    n_gpu = torch.cuda.device_count()

    set_seed(seed=1332, n_gpu=n_gpu)

    # training hyper-parameters
    max_grad_norm = 1.0
    gradient_accumulation_steps = 50
    warmup_steps = 500
    tr_loss, logging_loss = 0.0, 0.0
    logging_steps = 50
    max_steps = 1000
    mlm_probability = 0.15
    local_rank = -1

    ## loading tokenizer
    tokenizer = TokenizerId(spm_vocab_size=spm_vocab_size)

    ## prepare dataset
    _dataset = corpus_dir + corpus_name
    if create_tokenizer:
        data_list = ['<unk>', '<sep>', '<cls>', '<mask>']
        with open(_dataset, encoding="utf-8") as fp:
            line = fp.readline()
            while line:
                data_list.append(line)
                line = fp.readline()
        tokenizer.createVocab(data_list,
                              spm_text_file=_dataset,
                              data_dir=model_dir,
                              train_spm=train_spm,
                              spm_max_sentence_length=spm_max_sentence_length,
                              spm_model_name=spm_model_name,
                              spm_model_type=spm_model_type)
    else:
        tokenizer.from_pretrained(model_dir,
                                  use_spm=train_spm,
                                  spm_model_name=spm_model_name,
                                  spm_max_sentence_length=spm_max_sentence_length,
                                  std_vocab_name=vocab_name)
    print("tokenizer.vocab_size: {}".format(tokenizer.vocab_size))

    ## saving tokenized object for consistent use
    if save_tokenized:
        tokenizer.save_pretrained(model_dir, vocab_name=vocab_name)

    ## create cache of training dataset
    train_dataset = loadAndCacheExamples(_dataset,
                                         block_size,
                                         tokenizer,
                                         evaluate=False,
                                         use_spm=train_spm)

    if dotraining:
        dataset = train_dataset
        print("Loading train_dataset done...")

        if max_steps > 0:
            t_total = max_steps
        else:
            t_total = len(dataset) // gradient_accumulation_steps * num_epoch
        print("t_total: {}".format(t_total))

        config = GPT2Config(vocab_size_or_config_json_file=tokenizer.vocab_size)
        # prepare output_attentions and hidden_states
        config.output_hidden_states = True
        model = GPT2Model(config)

        ## resume training from a saved checkpoint
        if resume:
            model = restoreModel(model,
                                 resume_iters=resume_iters,
                                 model_name=model_name,
                                 model_save_dir=model_dir + trained_model_savedir,
                                 base_model_prefix='gpt2')

        model.to(device)

        num_params = sum(p.numel() for p in model.parameters())
        print(model)
        print("The number of model_parameters: {}".format(num_params))

        ## exclude bias and LayerNorm weights from weight decay
        weight_decay = 0.1
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': weight_decay},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=0.00025, eps=1e-8)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=warmup_steps,
                                         t_total=t_total)

        doTraining(model, config, train_dataset, tokenizer, optimizer, scheduler,
                   tr_loss, logging_loss, gradient_accumulation_steps,
                   mlm_probability, device, local_rank, train_batch_size,
                   n_gpu=n_gpu, num_epoch=num_epoch, start_iters=resume_iters,
                   max_grad_norm=max_grad_norm, fp16=fp16,
                   logging_steps=logging_steps,
                   save_dir=model_dir + trained_model_savedir,
                   train_model_name=train_model_name)
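# Example command line for the argparse-based entry point above (illustrative only; the
# script name and paths are placeholders, while the flags are the ones defined by the
# parser):
#
#   python train_gpt2.py \
#       --corpus_dir ./corpus/ --corpus_name wiki_id.txt \
#       --model_dir ./models/ --trained_model_savedir gpt2_checkpoints/ \
#       --create_tokenizer --save_tokenized --train_spm --dotraining \
#       --spm_model_name spm_id --spm_vocab_size 2000 \
#       --block_size 512 --train_batch_size 2 --num_epoch 1000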
                        num_labels=num_labels,
                        hidden_size=600,
                        num_attention_heads=12,
                        intermediate_size=2048)
    #model = Ner(config)
    model = BertForTokenClassification(config)

    if eval_only:
        model_name = model_eval_name
        from_pretrained = False
        print("Loading evaluation only with model name: {}".format(model_name))

    model = restoreModel(model,
                         resume_iters=start_iters,
                         model_name=model_name,
                         model_save_dir=model_dir + trained_model_savedir,
                         base_model_prefix='bert',
                         from_pretrained=from_pretrained,
                         is_finetune=is_finetune)

    model.to(device)

    num_params = sum(p.numel() for p in model.parameters())
    print(model)
    print("The number of model_parameters: {}".format(num_params))

    weight_decay = 0.1
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{