def main():
    parser = ArgumentParser()
    parser.add_argument("--model-config", type=str, default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--outlens", type=int, default=30)
    parser.add_argument("--beam", type=int, default=1)
    parser.add_argument("--checkpoints", type=str)
    parser.add_argument("--data", type=str, default="file")
    args = parser.parse_args()
    args.load_model = True

    # Build the model and restore the converted checkpoint weights.
    model = BertModel(None, args)
    state_dict = convert_model(torch.load(args.checkpoints)['sd'])
    model.load_state_dict(state_dict)
    model.to(args.device)

    tokenizer = BertWordPieceTokenizer("bert-base-chinese",
                                       cache_dir="temp_cache_dir")
    # Note: top_k, beam_size and outlens are hardcoded here and override
    # the --beam/--outlens command-line flags.
    generate(model, tokenizer, args.device, args.data,
             sample=True, top_k=5, beam_size=6, outlens=30)
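# The script above assumes the checkpoint file is a dict whose 'sd' key holds
# the raw state dict that convert_model remaps. A minimal sketch for inspecting
# such a checkpoint before loading (the path here is hypothetical):
ckpt = torch.load("checkpoints/gpt.pt", map_location="cpu")
print(ckpt.keys())  # expected to contain at least 'sd'
for name, tensor in list(ckpt['sd'].items())[:5]:
    print(name, tuple(tensor.shape))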
def model_init(app):
    # Dynamically created container for everything the app needs at serving time.
    ArgsSet = type('ArgsSet', (object,), {})
    client = ArgsSet()

    parser = ArgumentParser()
    parser.add_argument("--model-config", type=str, default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--outlens", type=int, default=30)
    parser.add_argument("--beam", type=int, default=1)
    parser.add_argument("--gpt-checkpoints", type=str)
    parser.add_argument("--port", type=int, default=8866)
    args = parser.parse_args()
    args.load_model = True
    args.fp32_embedding = False
    args.fp32_layernorm = False
    args.fp32_tokentypes = False
    args.layernorm_epsilon = 1e-12

    gpt = BertModel(None, args)
    state_dict = convert_model(torch.load(args.gpt_checkpoints)['sd'])
    gpt.load_state_dict(state_dict)
    gpt.to(args.device)
    gpt.eval()
    tokenizer = BertWordPieceTokenizer("bert-base-chinese",
                                       cache_dir="temp_cache_dir")
    print("| Load model from {}".format(args.gpt_checkpoints))

    client.tokenizer = tokenizer
    client.gpt = gpt
    client.gpt_beam = SequenceGenerator(gpt, tokenizer,
                                        beam_size=args.beam,
                                        max_lens=args.outlens)
    client.device = args.device
    client.port = args.port
    client.generator = sample_sequence
    return client
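# type('ArgsSet', (object,), {}) above creates an empty class on the fly so
# arbitrary attributes can be hung off an instance; types.SimpleNamespace is
# an equivalent stdlib alternative. A minimal, self-contained comparison:
from types import SimpleNamespace

ArgsSet = type('ArgsSet', (object,), {})
client = ArgsSet()
client.port = 8866
client2 = SimpleNamespace(port=8866)
assert client.port == client2.port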
old_vocab = train_dataset.vocab
vocab = torchtext.legacy.vocab.Vocab(counter=old_vocab.freqs,
                                     specials=['<unk>', '<pad>', '<MASK>'])
with open(args.save_vocab, 'wb') as f:
    torch.save(vocab, f)
pad_id = vocab.stoi['<pad>']
# '<sep>' and '<cls>' are not declared as specials above; this assumes they
# already occur in old_vocab.freqs, otherwise stoi falls back to 0 ('<unk>').
sep_id = vocab.stoi['<sep>']
cls_id = vocab.stoi['<cls>']

train_dataset, dev_dataset = SQuAD1(vocab=vocab)
train_dataset = process_raw_data(train_dataset)
dev_dataset = process_raw_data(dev_dataset)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embed_layer = BertEmbedding(len(vocab), args.emsize)
pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead, args.nhid,
                            args.nlayers, embed_layer, args.dropout)
pretrained_bert.load_state_dict(torch.load(args.bert_model))
model = QuestionAnswerTask(pretrained_bert).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

best_f1 = None
train_loss_log, val_loss_log = [], []
for epoch in range(1, args.epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss, val_exact, val_f1 = evaluate(dev_dataset, vocab)
    val_loss_log.append(val_loss)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'exact {:8.3f}% | '
          'f1 {:8.3f}%'.format(epoch, (time.time() - epoch_start_time),
                               val_loss, val_exact, val_f1))
    print('-' * 89)
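# Note that '<sep>' and '<cls>' are looked up even though only '<unk>', '<pad>'
# and '<MASK>' are declared as specials; in torchtext.legacy, Vocab.stoi is a
# defaultdict that maps unknown tokens to index 0 ('<unk>'). A small sketch,
# assuming a torchtext version that still ships torchtext.legacy:
from collections import Counter
import torchtext

v = torchtext.legacy.vocab.Vocab(counter=Counter({'hello': 3, 'world': 2}),
                                 specials=['<unk>', '<pad>', '<MASK>'])
print(v.stoi['<pad>'])  # 1: specials come first in the vocab
print(v.stoi['<sep>'])  # 0: unknown tokens silently map to '<unk>'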
def main(train_file, dev_file, target_dir,
         epochs=10, batch_size=32, lr=2e-05, patience=3,
         max_grad_norm=10.0, checkpoint=None):
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                                   do_lower_case=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints are saved
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = DataPrecessForSentence(bert_tokenizer, train_file)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = DataPrecessForSentence(bert_tokenizer, dev_file)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    model = BertModel().to(device)
    # -------------------- Preparation for training ------------------- #
    # Parameters to be optimized: no weight decay for biases and LayerNorm weights.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    # optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, auc = validate(model, dev_loader)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
          .format(valid_loss, (valid_accuracy * 100), auc))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=",
          "Training Bert model on device: {}".format(device), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, epoch,
                                                       max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(model,
                                                                     dev_loader)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100), epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the optimizer state as well, since the resume path above
            # reads checkpoint["optimizer"].
            torch.save({"epoch": epoch,
                        "model": model.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses},
                       os.path.join(target_dir, "best.pth.tar"))
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
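# The optimizer_grouped_parameters split above disables weight decay for biases
# and LayerNorm parameters by substring-matching parameter names; this matches
# HuggingFace-style models where the LayerNorm submodule is literally named
# "LayerNorm". A minimal runnable sketch with a toy module:
import torch.nn as nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(4, 4)
        self.LayerNorm = nn.LayerNorm(4)

toy = Toy()
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
decayed = [n for n, _ in toy.named_parameters()
           if not any(nd in n for nd in no_decay)]
not_decayed = [n for n, _ in toy.named_parameters()
               if any(nd in n for nd in no_decay)]
print(decayed)      # ['dense.weight'] -> weight_decay=0.01
print(not_decayed)  # ['dense.bias', 'LayerNorm.weight', 'LayerNorm.bias']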
# We use only ~1% of the data to fine-tune the model.
train, dev = SQuAD1()
raw_train = list(train)[:1024]
raw_dev = list(dev)[:128]
convert_to_arrow(raw_train, "train_arrow")
convert_to_arrow(raw_dev, "dev_arrow")

base_url = 'https://pytorch.s3.amazonaws.com/models/text/torchtext_bert_example/'
vocab_path = download_from_url(base_url + 'bert_vocab.txt')
data_module = QuestionAnswerDataModule(train_arrow_path='train_arrow',
                                       dev_arrow_path='dev_arrow',
                                       vocab_filepath=vocab_path,
                                       batch_size=BATCH_SIZE)

# Load the pretrained model and build the task.
# Default parameters from the pretrained model.
vocab_size, emsize, nhead, nhid, nlayers, dropout = 99230, 768, 12, 3072, 12, 0.2
pretrained_bert = BertModel(vocab_size, emsize, nhead, nhid, nlayers, dropout)
pretrained_model_path = download_from_url(base_url + 'ns_bert.pt')
pretrained_bert.load_state_dict(
    torch.load(pretrained_model_path, map_location='cpu'))
qa_model = QuestionAnswerModel(pretrained_bert)
task = QuestionAnswerTask(qa_model, LR)
trainer = Trainer(gpus=0, max_epochs=EPOCH,
                  progress_bar_refresh_rate=40, fast_dev_run=True)
trainer.fit(task, data_module)
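# With fast_dev_run=True, Lightning runs a single batch through the training
# (and validation) loop as a smoke test, so max_epochs is effectively ignored.
# A self-contained toy sketch of the same flag (ToyTask and the random data
# are stand-ins, not part of the example above):
import torch
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl

class ToyTask(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

loader = DataLoader(TensorDataset(torch.randn(8, 4), torch.randn(8, 1)),
                    batch_size=4)
pl.Trainer(fast_dev_run=True).fit(ToyTask(), loader)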
def run_main(args, rank=None):
    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    if args.parallel == 'DDP':
        n = torch.cuda.device_count() // args.world_size
        device = list(range(rank * n, (rank + 1) * n))
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    vocab = torch.load(args.save_vocab)
    cls_id = vocab.stoi['<cls>']
    pad_id = vocab.stoi['<pad>']
    sep_id = vocab.stoi['<sep>']

    if args.dataset == 'WikiText103':
        from torchtext.experimental.datasets import WikiText103
        train_dataset, valid_dataset, test_dataset = WikiText103(vocab=vocab)
    elif args.dataset == 'BookCorpus':
        from data import BookCorpus
        train_dataset, valid_dataset, test_dataset = BookCorpus(vocab, min_sentence_len=60)

    # Shard the training data evenly across DDP workers.
    if rank is not None:
        chunk_len = len(train_dataset.data) // args.world_size
        train_dataset.data = train_dataset.data[(rank * chunk_len):((rank + 1) * chunk_len)]

    if args.checkpoint != 'None':
        model = torch.load(args.checkpoint)
    else:
        pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead,
                                    args.nhid, args.nlayers, args.dropout)
        pretrained_bert.load_state_dict(torch.load(args.bert_model))
        model = NextSentenceTask(pretrained_bert)

    if args.parallel == 'DDP':
        model = model.to(device[0])
        model = DDP(model, device_ids=device)
    else:
        model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    best_val_loss = None
    train_loss_log, val_loss_log = [], []

    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train(process_raw_data(train_dataset, args), model, train_loss_log,
              device, optimizer, criterion, epoch, scheduler, cls_id, sep_id,
              pad_id, args, rank)
        val_loss = evaluate(process_raw_data(valid_dataset, args), model,
                            device, criterion, cls_id, sep_id, pad_id, args)
        val_loss_log.append(val_loss)
        if (rank is None) or (rank == 0):
            print('-' * 89)
            print('| end of epoch {:3d} | time: {:5.2f}s '
                  '| valid loss {:8.5f} | '.format(epoch,
                                                   (time.time() - epoch_start_time),
                                                   val_loss))
            print('-' * 89)
        # Keep the checkpoint with the best validation loss; otherwise decay the lr.
        if not best_val_loss or val_loss < best_val_loss:
            if rank is None:
                with open(args.save, 'wb') as f:
                    torch.save(model, f)
            elif rank == 0:
                with open(args.save, 'wb') as f:
                    torch.save(model.state_dict(), f)
            best_val_loss = val_loss
        else:
            scheduler.step()

    if args.parallel == 'DDP':
        # Remap the rank-0 checkpoint devices onto this rank's devices.
        rank0_devices = [x - rank * len(device) for x in device]
        device_pairs = zip(rank0_devices, device)
        map_location = {'cuda:%d' % x: 'cuda:%d' % y for x, y in device_pairs}
        model.load_state_dict(torch.load(args.save, map_location=map_location))
        test_loss = evaluate(process_raw_data(test_dataset, args), model,
                             device, criterion, cls_id, sep_id, pad_id, args)
        if rank == 0:
            wrap_up(train_loss_log, val_loss_log, test_loss, args,
                    model.module, 'ns_loss.txt', 'ns_model.pt')
    else:
        with open(args.save, 'rb') as f:
            model = torch.load(f)
        test_loss = evaluate(process_raw_data(test_dataset, args), model,
                             device, criterion, cls_id, sep_id, pad_id, args)
        wrap_up(train_loss_log, val_loss_log, test_loss, args, model,
                'ns_loss.txt', 'ns_model.pt')
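# The per-rank slice above uses integer division, so when the dataset size is
# not divisible by world_size the remainder is silently dropped. The same
# sharding on a plain list:
data = list(range(10))
world_size = 3
chunk_len = len(data) // world_size  # 3; element 9 is dropped
shards = [data[rank * chunk_len:(rank + 1) * chunk_len]
          for rank in range(world_size)]
print(shards)  # [[0, 1, 2], [3, 4, 5], [6, 7, 8]]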
def model_init(app):
    ArgsSet = type('ArgsSet', (object,), {})
    client = ArgsSet()

    parser = ArgumentParser()
    parser.add_argument("--model-config", type=str, default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--outlens", type=int, default=30)
    parser.add_argument("--beam", type=int, default=1)
    parser.add_argument("--fuse-checkpoints", type=str)
    parser.add_argument("--gpt-checkpoints", type=str)
    parser.add_argument("--qa-style-checkpoints", type=str)
    parser.add_argument("--multi-task", type=str)
    parser.add_argument("--split-sentence-with-task-embedding-checkpoints", type=str)
    parser.add_argument("--special-cls-checkpoints", type=str)
    parser.add_argument("--port", type=int, default=8866)
    args = parser.parse_args()
    args.load_model = True
    args.fp32_embedding = False
    args.fp32_layernorm = False
    args.fp32_tokentypes = False
    args.layernorm_epsilon = 1e-12

    fuse_model = BertModel(None, args)
    state_dict = convert_model(torch.load(args.fuse_checkpoints)['sd'])
    fuse_model.load_state_dict(state_dict)
    fuse_model.to(args.device)
    fuse_model.eval()
    print("| Load model from {}".format(args.fuse_checkpoints))

    gpt = BertModel(None, args)
    state_dict = convert_model(torch.load(args.gpt_checkpoints)['sd'])
    gpt.load_state_dict(state_dict)
    gpt.to(args.device)
    gpt.eval()
    tokenizer = BertWordPieceTokenizer("bert-base-chinese",
                                       cache_dir="temp_cache_dir")
    print("| Load model from {}".format(args.gpt_checkpoints))

    # Load bert checkpoints
    args.load_model = False
    args.fp32_embedding = False
    args.fp32_layernorm = False
    args.fp32_tokentypes = False
    args.layernorm_epsilon = 1e-12
    bert = BertModel(None, args)
    bert.to(args.device)
    bert.eval()

    client.tokenizer = tokenizer
    client.fuse_model = fuse_model
    client.fuse_beam = SequenceGenerator(fuse_model, tokenizer,
                                         beam_size=args.beam,
                                         max_lens=args.outlens)
    client.gpt = gpt
    client.gpt_beam = SequenceGenerator(gpt, tokenizer,
                                        beam_size=args.beam,
                                        max_lens=args.outlens)
    client.bert = bert
    client.device = args.device
    client.port = args.port
    client.generator = sample_sequence

    # Multi-task model
    multi_task = BertModel(None, args)
    state_dict = convert_model(torch.load(args.multi_task)['sd'])
    multi_task.load_state_dict(state_dict)
    multi_task.to(args.device)
    multi_task.eval()
    print("| Load model from {}".format(args.multi_task))
    client.multi_task_model = multi_task
    client.multi_task_beam = SequenceGenerator(multi_task, tokenizer,
                                               beam_size=args.beam,
                                               max_lens=args.outlens)

    # QA-style model
    qa_style = BertModel(None, args)
    state_dict = convert_model(torch.load(args.qa_style_checkpoints)['sd'])
    qa_style.load_state_dict(state_dict)
    qa_style.to(args.device)
    qa_style.eval()
    print("| Load model from {}".format(args.qa_style_checkpoints))
    client.qa_task_model = qa_style

    # Model with special cls tokens
    special_cls_model = BertModel(None, args)
    state_dict = convert_model(torch.load(args.special_cls_checkpoints)['sd'])
    special_cls_model.load_state_dict(state_dict)
    special_cls_model.to(args.device)
    special_cls_model.eval()
    print("| Load model from {}".format(args.special_cls_checkpoints))
    client.special_cls_model = special_cls_model
    client.special_beam = SequenceGenerator(special_cls_model, tokenizer,
                                            beam_size=args.beam,
                                            max_lens=args.outlens)

    # Split-sentence model with task embedding
    split_sentence_model = BertModel(None, args)
    state_dict = convert_model(
        torch.load(args.split_sentence_with_task_embedding_checkpoints)['sd'])
    split_sentence_model.load_state_dict(state_dict)
    split_sentence_model.to(args.device)
    split_sentence_model.eval()
    print("| Load model from {}".format(
        args.split_sentence_with_task_embedding_checkpoints))
    client.split_sentence_model = split_sentence_model
    client.split_sentence_beam = SequenceGenerator(split_sentence_model,
                                                   tokenizer,
                                                   beam_size=args.beam,
                                                   max_lens=args.outlens)
    return client
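# model_init repeats the same convert/load/move/eval sequence for every model;
# a small helper would remove most of the duplication. A sketch under that
# assumption (load_converted_bert is a new name, not part of the original code):
def load_converted_bert(checkpoint_path, args):
    """Build a BertModel and restore a converted checkpoint onto args.device."""
    model = BertModel(None, args)
    state_dict = convert_model(torch.load(checkpoint_path)['sd'])
    model.load_state_dict(state_dict)
    model.to(args.device)
    model.eval()
    print("| Load model from {}".format(checkpoint_path))
    return model

# e.g. gpt = load_converted_bert(args.gpt_checkpoints, args)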
import re

import torch
import sentencepiece as spm

from model import BertModel

sp = spm.SentencePieceProcessor()
sp.load('resource/sentencepiece.unigram.35000.model')
vocab_size = sp.get_piece_size()
n_embedding = 512
n_layer = 8

model = BertModel(vocab_size, n_embedding, n_layer)
model.eval()
model.load_state_dict(
    torch.load('resource/model.{}.{}.th'.format(n_embedding, n_layer),
               map_location='cpu'))
# You should enable cuda if it is available:
# model.cuda()
# If you are using a GPU with tensor cores (NVIDIA Volta or Turing architecture),
# you can enable half-precision inference and training; we recommend the official
# NVIDIA apex package to keep this as clean as possible:
# from apex import amp
# [model] = amp.initialize([model], opt_level="O2")
device = model.embedding.weight.data.device


def clean_text(txt):
    # Lowercase and strip all whitespace.
    txt = txt.lower()
    txt = re.sub(r'\s+', '', txt)
    return txt
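# A minimal usage sketch of the pieces above; encode_as_ids/encode_as_pieces
# are standard sentencepiece API, and the input sentence is arbitrary:
text = clean_text("你好， 世界 Hello World")
ids = sp.encode_as_ids(text)        # piece ids for the model
tokens = sp.encode_as_pieces(text)  # the corresponding pieces, for inspection
print(tokens, ids)
input_ids = torch.tensor([ids], device=device)  # batch of one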