def train_model(cuda, vocab_file, data_pkls, save_pretrain_file):
    config = cfg.Config.load("config.json")
    vocab = global_data.load_vocab(vocab_file)

    config.device = torch.device(cuda if torch.cuda.is_available() else "cpu")
    config.n_vocab = len(vocab)
    config.i_pad = global_data.PAD_ID
    config.n_batch = 24
    config.n_epoch = 3
    print(config)

    offset = 0
    model = albert_model.AlBertPretrain(config)
    if os.path.isfile(save_pretrain_file):
        offset = model.bert.load(save_pretrain_file) + 1
        print(">>>> load state dict from: ", save_pretrain_file)
    model.to(config.device)

    train_loader = None

    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)

    optimizer = None
    scheduler = None

    for step in trange(config.n_epoch, desc="Epoch"):
        epoch = step + offset
        if train_loader is not None:
            del train_loader
        data_pkl = data_pkls[epoch % len(data_pkls)]
        print(f"load pretrain data from {data_pkl}")
        train_loader = data.build_pretrain_loader(data_pkl, vocab, config.n_batch)

        if optimizer is None or scheduler is None:
            t_total = len(train_loader) * config.n_epoch
            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay},
                {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
            optimizer = optim.RAdam(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
            scheduler = optim.WarmupLinearSchedule(optimizer, warmup_steps=config.warmup_steps, t_total=t_total)

        train_epoch(config, epoch, model, loss_fn, optimizer, scheduler, train_loader)
        model.bert.save(epoch, save_pretrain_file)
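# Hypothetical driver, not part of the original source: a minimal sketch of how the ALBERT
# train_model above could be invoked. The device string and all file paths / shard names are
# placeholder assumptions; data_pkls is a list because the loop above cycles one pickle shard
# per epoch via data_pkls[epoch % len(data_pkls)].
if __name__ == '__main__':
    data_pkls = [f"../data/albert_pretrain_{i}.pkl" for i in range(3)]  # assumed shard file names
    train_model("cuda:0", "../data/m_snli_8000.model", data_pkls, "save_albert_pretrain.pth")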
def train_model(cuda, vocab_file, data_pkl, save_pretrain_file):
    config = cfg.Config.load("config.json")
    vocab = global_data.load_vocab(vocab_file)
    token_ids = data.load_pretrain(data_pkl)

    config.device = torch.device(cuda if torch.cuda.is_available() else "cpu")
    config.n_vocab = len(vocab)
    config.n_enc_vocab = len(vocab)
    config.n_dec_vocab = len(vocab)
    config.i_pad = global_data.PAD_ID
    config.n_batch = 64
    config.n_epoch = 3
    print(config)

    offset = 0
    model = txl_model.TXLPretrain(config)
    if os.path.isfile(save_pretrain_file):
        offset = model.decoder.load(save_pretrain_file) + 1
        print(">>>> load state dict from: ", save_pretrain_file)
    model.to(config.device)

    train_iter = data.TXLIterator(config, token_ids)

    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)

    t_total = len(train_iter) * config.n_epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
    scheduler = optim.WarmupLinearSchedule(optimizer, warmup_steps=config.warmup_steps, t_total=t_total)

    for step in trange(config.n_epoch, desc="Epoch"):
        epoch = step + offset
        train_epoch(config, epoch, model, loss_fn, optimizer, scheduler, train_iter)
        model.decoder.save(epoch, save_pretrain_file)
def demp_pretrain(vocab_file, corpus, file):
    args = cfg.Config({
        "max_seq_len": 512,
        "short_seq_prob": 0.1,
        "masked_lm_prob": 0.15,
        "max_predictions_per_seq": 20,
    })
    vocab = global_data.load_vocab(vocab_file)

    print(f"read {corpus}, write {file}")

    docs = []
    with open(corpus) as f:
        doc = []
        for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
            line = line.strip()
            if line == "":
                if doc:
                    docs.append(doc)
                doc = []
            else:
                tokens = vocab.encode_as_ids(line.lower())
                if tokens:
                    doc.append(tokens)
        if doc:
            docs.append(doc)
    if len(docs) <= 1:
        exit("ERROR: at least two documents are required so that random NextSentence pairs are not drawn from the same document")

    vocab_list = []
    for id in range(vocab.get_piece_size()):
        if not vocab.is_unknown(id):
            vocab_list.append(id)

    with open(file, "w") as f:
        with tqdm(total=len(docs), desc="Document") as pbar:
            for doc_idx in range(len(docs)):
                timestamp1 = time.time()
                doc_instances = create_instances_from_document(
                    docs, doc_idx,
                    max_seq_length=args.max_seq_len,
                    short_seq_prob=args.short_seq_prob,
                    masked_lm_prob=args.masked_lm_prob,
                    max_predictions_per_seq=args.max_predictions_per_seq,
                    vocab=vocab,
                    vocab_list=vocab_list)
                for instance in doc_instances:
                    f.write(json.dumps(instance))
                    f.write("\n")
                timestamp2 = time.time()
                if 60 < (timestamp2 - timestamp1):
                    print(f">>>> {(timestamp2 - timestamp1)}: {len(doc_instances)}")
                pbar.update(1)
                pbar.set_postfix_str(f"Instances: {len(doc_instances)}")
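# Minimal read-back sketch (assumption, not in the original source): demp_pretrain above writes
# one JSON-encoded instance per line, so the output file can be consumed as JSON Lines. The keys
# inside each instance come from create_instances_from_document and are not assumed here.
def load_instances(path):
    instances = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                instances.append(json.loads(line))
    return instances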
def train_model(cuda, vocab_file, data_pkl, save_file, save_pretrain_file):
    config = cfg.Config.load("config.json")
    vocab = global_data.load_vocab(vocab_file)
    train_label, train_sentence1, train_sentence2, valid_label, valid_sentence1, valid_sentence2, test_label, test_sentence1, test_sentence2 = global_data.load_snli(data_pkl)

    # cuda or cpu
    config.device = torch.device(cuda if torch.cuda.is_available() else "cpu")
    config.n_vocab = len(vocab)
    config.i_pad = global_data.PAD_ID
    print(config)

    model = gpt_model.SNLI(config)
    if os.path.isfile(save_file):
        model.load(save_file)
        print(">>>> load state dict from: ", save_file)
    elif os.path.isfile(save_pretrain_file):
        epoch = model.decoder.load(save_pretrain_file)
        print(">>>> load state dict from: ", save_pretrain_file, "epoch:", epoch)
    model.to(config.device)

    train_loader = data.build_data_loader(train_label, train_sentence1, train_sentence2, config.n_batch)
    # train_loader = data.build_data_loader(test_label, test_sentence1, test_sentence2, config.n_batch)  ## only for fast test
    valid_loader = data.build_data_loader(valid_label, valid_sentence1, valid_sentence2, config.n_batch)
    test_loader = data.build_data_loader(test_label, test_sentence1, test_sentence2, config.n_batch)

    lm_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=config.i_pad, reduction='mean')
    snli_loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')

    t_total = len(train_loader) * config.n_epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.RAdam(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
    scheduler = optim.WarmupLinearSchedule(optimizer, warmup_steps=config.warmup_steps, t_total=t_total)

    best_epoch, best_loss, best_val, best_test = None, None, None, None
    for epoch in trange(config.n_epoch, desc="Epoch"):
        score_loss = train_epoch(config, epoch, model, config.lm_coef, lm_loss_fn, snli_loss_fn, optimizer, scheduler, train_loader)
        score_val = eval_epoch(config, epoch, model, valid_loader, "Valid")
        score_test = eval_epoch(config, epoch, model, test_loader, "Test")

        if best_test is None or best_test < score_test:
            model.save(epoch, score_loss, score_val, score_test, save_file)
            best_epoch, best_loss, best_val, best_test = epoch, score_loss, score_val, score_test
            print(f">>>>>>> model saved at {save_file} {best_epoch} {best_loss:.3f} {best_val:.3f} {best_test:.3f}")
        else:
            print(f">>>>>>> model not saved, best so far: {best_epoch} {best_loss:.3f} {best_val:.3f} {best_test:.3f}")
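# Hypothetical sketch of the loss combination implied by passing config.lm_coef together with
# lm_loss_fn and snli_loss_fn into train_epoch. train_epoch itself is defined elsewhere, so the
# tensor names and shapes below are assumptions (a GPT-style auxiliary LM objective added to the
# classification loss).
def combined_loss(lm_coef, lm_loss_fn, snli_loss_fn, lm_logits, lm_labels, snli_logits, snli_labels):
    # lm_logits: (batch, seq, n_vocab) flattened against next-token labels padded with i_pad
    lm_loss = lm_loss_fn(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
    # snli_logits: (batch, n_classes) scored against the entailment labels
    snli_loss = snli_loss_fn(snli_logits, snli_labels)
    return snli_loss + lm_coef * lm_loss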
def demp_pretrain(vocab_file, file):
    in_file = "../data/corpus.book.middle.txt"
    vocab = global_data.load_vocab(vocab_file)

    features = _create_data(vocab=vocab,
                            filename=in_file,
                            seq_len=256,
                            reuse_len=128,
                            num_predict=43,
                            mask_alpha=6,
                            mask_beta=1,
                            perm_size=128)

    with open(file, 'wb') as f:
        pickle.dump(features, f)
def demp_pretrain(vocab_file, file):
    in_file = "../data/corpus.book.large.txt"
    vocab = global_data.load_vocab(vocab_file)

    token_ids = []
    with open(in_file) as f:
        for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
            line = line.strip()
            if line != "":
                token_ids.extend(vocab.encode_as_ids(line.lower()))
    token_ids = np.array(token_ids)

    with open(file, 'wb') as f:
        pickle.dump(token_ids, f)
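# Minimal loader sketch (assumed helper, not in the original source): the Transformer-XL trainer
# above consumes this pickle through data.load_pretrain; something along these lines is all that
# is needed to recover the token id array.
def load_token_ids(path):
    with open(path, 'rb') as f:
        token_ids = pickle.load(f)
    return token_ids  # numpy array of vocabulary ids over the whole corpus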
def demp_pretrain(vocab_file, file):
    args = cfg.Config({
        "reduce_memory": False,
        "train_corpus": Path("../data/corpus.book.large.txt"),
        "output_dir": Path("data"),
        "max_seq_len": 256,
        "short_seq_prob": 0.1,
        "masked_lm_prob": 0.15,
        "max_predictions_per_seq": 20,
        "do_whole_word_mask": True,
        "save_filename": file,
    })
    vocab = global_data.load_vocab(vocab_file)

    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        with args.train_corpus.open() as f:
            doc = []
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                line = line.strip()
                if line == "":
                    docs.add_document(doc)
                    doc = []
                else:
                    tokens = vocab.encode_as_pieces(line.lower())
                    doc.append(tokens)
            if doc:
                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added
        if len(docs) <= 1:
            exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                 "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                 "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                 "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                 "sections or paragraphs.")

        vocab_list = []
        for id in range(vocab.get_piece_size()):
            if not vocab.is_unknown(id):
                vocab_list.append(vocab.id_to_piece(id))

        create_training_file(docs, vocab_list, args)
            torch.zeros([pad_len], dtype=torch.float32)
        ], dim=0)
        feature["target_mask"] = torch.reshape(target_mask, [num_predict])
    else:
        feature["target"] = torch.reshape(target, [seq_len])
        feature["target_mask"] = torch.reshape(target_mask, [seq_len])

    # reshape back to fixed shape
    # (seq,)
    feature["seg_id"] = torch.IntTensor(feature["seg_id"])
    # (seq, seq)
    feature["perm_mask"] = torch.reshape(perm_mask, [seq_len, seq_len])
    # (seq,)
    feature["input_k"] = torch.reshape(input_k, [seq_len])
    # (seq,)
    feature["input_q"] = torch.reshape(input_q, [seq_len])

    return feature


if __name__ == '__main__':
    vocab = global_data.load_vocab("../data/m_snli_8000.model")
    _create_data(sp=vocab,
                 input_paths="data.txt",
                 seq_len=512,
                 reuse_len=256,
                 bi_data=False,
                 num_predict=85,
                 mask_alpha=6,
                 mask_beta=1)