# neg_train_data_loader = DataLoader(
#     neg_train_dataset, batch_size=args.batch_size * args.duplicate,
#     num_workers=args.num_workers, collate_fn=my_collate)
# test_data_loader = DataLoader(
#     test_dataset, batch_size=args.batch_size,
#     num_workers=args.num_workers, collate_fn=my_collate) \
# neg_test_data_loader = DataLoader(
#     neg_test_dataset, batch_size=args.batch_size * args.duplicate,
#     num_workers=args.num_workers, collate_fn=my_collate) \
# labeled_data_loader = DataLoader(
#     labeled_dataset, batch_size=args.batch_size,
#     num_workers=args.num_workers, collate_fn=my_collate)
# if test_dataset is not None else None
# assert False
# dataset is not None else None
# assert False

print("Building BERT model")
bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads)

print("Creating BERT Trainer")
# trainer = BERTTrainer(bert, len(vocab),
#                       train_dataloader=train_data_loader, test_dataloader=test_data_loader,
#                       lr=args.lr, betas=(args.adam_beta1, args.adam_beta2),
#                       weight_decay=args.adam_weight_decay,
#                       with_cuda=args.with_cuda, cuda_devices=args.cuda_devices,
#                       log_freq=args.log_freq, pad_index=vocab.pad_index)
temp_data_loader = DataLoader(temp_dataset, batch_size=args.batch_size,
                              num_workers=args.num_workers, collate_fn=my_collate)
trainer = ReconstructionBERTTrainer(bert, len(vocab), len(markdown_vocab), args.markdown_emb_size,
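
# `my_collate` is passed to every DataLoader above but is not defined in this
# snippet. Below is a minimal sketch of a padding collate function for
# variable-length token sequences; the name and (token_ids, label) batch layout
# are assumptions for illustration, not the project's actual implementation.
import torch
from torch.nn.utils.rnn import pad_sequence


def my_collate_sketch(batch, pad_index=0):
    """Pad a list of (token_ids, label) pairs to the longest sequence in the batch."""
    token_ids = [torch.as_tensor(sample[0], dtype=torch.long) for sample in batch]
    labels = torch.as_tensor([sample[1] for sample in batch], dtype=torch.long)
    padded = pad_sequence(token_ids, batch_first=True, padding_value=pad_index)
    return padded, labels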
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c", "--train_dataset", required=True, type=str,
                        help="train dataset for training BERT")
    parser.add_argument("-t", "--test_dataset", type=str, default=None,
                        help="test set for evaluating the trained model")
    parser.add_argument("-v", "--vocab_path", required=True, type=str,
                        help="path to the vocab model built with bert-vocab")
    parser.add_argument("-o", "--output_path", required=True, type=str,
                        help="ex) output/bert.model")
    parser.add_argument("-hs", "--hidden", type=int, default=256,
                        help="hidden size of transformer model")
    parser.add_argument("-l", "--layers", type=int, default=8,
                        help="number of layers")
    parser.add_argument("-a", "--attn_heads", type=int, default=8,
                        help="number of attention heads")
    parser.add_argument("-s", "--seq_len", type=int, default=20,
                        help="maximum sequence length")
    parser.add_argument("-b", "--batch_size", type=int, default=64,
                        help="batch size")
    parser.add_argument("-e", "--epochs", type=int, default=10,
                        help="number of epochs")
    parser.add_argument("-w", "--num_workers", type=int, default=5,
                        help="dataloader worker size")
    parser.add_argument("--with_cuda", type=bool, default=True,
                        help="training with CUDA: true or false")
    parser.add_argument("--log_freq", type=int, default=10,
                        help="print loss every n iterations")
    parser.add_argument("--corpus_lines", type=int, default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=None,
                        help="CUDA device ids")
    parser.add_argument("--on_memory", type=bool, default=True,
                        help="load the corpus into memory: true or false")
    parser.add_argument("--lr", type=float, default=1e-3,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999,
                        help="adam second beta value")
    parser.add_argument("--multi_segment", type=bool, default=False,
                        help="whether to use multiple segment_labels for entity types")
    parser.add_argument("--sep_label", type=bool, default=False,
                        help="whether to insert <sep>")

    args = parser.parse_args()

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines, on_memory=args.on_memory,
                                multi_segment=args.multi_segment, sep=args.sep_label)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len,
                               on_memory=args.on_memory, multi_segment=args.multi_segment,
                               sep=args.sep_label) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Building BERT model")
    bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert, len(vocab),
                          train_dataloader=train_data_loader, test_dataloader=test_data_loader,
                          lr=args.lr, betas=(args.adam_beta1, args.adam_beta2),
                          weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda, cuda_devices=args.cuda_devices,
                          log_freq=args.log_freq)

    print("Training Start")
    for epoch in range(args.epochs):
        trainer.train(epoch)
        trainer.save(epoch, args.output_path)

        if test_data_loader is not None:
            trainer.test(epoch)
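
# Hypothetical invocation sketch (the file name and data paths below are
# assumptions, not part of the original script):
#
#   python train.py -c data/corpus.train.txt -v data/vocab.pkl -o output/bert.model \
#       -hs 256 -l 8 -a 8 -s 20 -b 64 -e 10
#
# -t/--test_dataset is optional; when it is omitted, test_data_loader stays None
# and trainer.test() is never called.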
                        format='csv', skip_header=True, fields=rev_field)
novel = TabularDataset(path=data_novelty_csv_path,
                       format='csv', skip_header=True, fields=nov_field)

review_iter = Iterator(review, batch_size=1, device=device, sort=False,
                       sort_within_batch=False, repeat=False, shuffle=False)
novel_iter = Iterator(novel, batch_size=1, device=device, sort=False,
                      sort_within_batch=False, repeat=False, shuffle=False)

model = BERT(feature_len).to(device)

print("Computing deep features...")
review_features = []
for x in tqdm(review_iter):
    text = x.comment_text.type(torch.LongTensor)
    text = text.to(device)
    feature = model(text)
    review_features.append(feature.detach().cpu().numpy())
review_features = np.vstack(review_features)
print(review_features.shape)

novel_features = []
for x in tqdm(novel_iter):
    text = x.novel.type(torch.LongTensor)
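    # Hypothetical continuation (the original snippet is truncated here); the
    # novel-side loop presumably mirrors the review-side one above:
    #
    #     text = text.to(device)
    #     feature = model(text)
    #     novel_features.append(feature.detach().cpu().numpy())
    # novel_features = np.vstack(novel_features)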
def get_models():
    from model import BERT
    return {'BERT': BERT()}
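
# Minimal usage sketch (the loop below is an assumption, not part of this
# module): get_models() returns a name -> nn.Module mapping, so callers can
# iterate over every registered model uniformly.
#
#   models = get_models()
#   for name, model in models.items():
#       print(name, sum(p.numel() for p in model.parameters()))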
def main(args):
    assert torch.cuda.is_available(), "need to use GPUs"

    use_cuda = torch.cuda.is_available()
    cuda_devices = list(map(int, args.cuda_devices.split(",")))
    is_multigpu = len(cuda_devices) > 1
    device = "cuda"

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    if is_multigpu:
        torch.cuda.manual_seed_all(args.seed)

    data = torch.load(args.data)
    dataset = BERTDataSet(data['word'], data['max_len'], data["dict"],
                          args.batch_size * args.steps)
    training_data = DataLoader(dataset, batch_size=args.batch_size,
                               num_workers=args.num_cpus)

    model = BERT(dataset.word_size, data["max_len"], args.n_stack_layers,
                 args.d_model, args.d_ff, args.n_head, args.dropout)
    print(f"BERT has {sum(x.numel() for x in model.parameters())} parameters in total")

    # Only the model is wrapped in DataParallel (below); the optimizer is not a
    # module and cannot be wrapped.
    optimizer = ScheduledOptim(
        torch.optim.Adam(model.get_trainable_parameters(), lr=args.lr,
                         betas=(0.9, 0.999), eps=1e-09, weight_decay=0.01),
        args.d_model, args.n_warmup_steps)

    w_criterion = WordCrossEntropy()
    w_criterion = w_criterion.to(device)
    s_criterion = torch.nn.CrossEntropyLoss()

    model = model.to(device)
    model = torch.nn.DataParallel(model, device_ids=cuda_devices)
    model.train()

    for step, datas in enumerate(training_data):
        inp, pos, sent_label, word_label, segment_label = list(
            map(lambda x: x.to(device), datas))
        sent_label = sent_label.view(-1)
        optimizer.zero_grad()

        word, sent = model(inp, pos, segment_label)
        w_loss, w_corrects, tgt_sum = w_criterion(word, word_label)
        s_loss = s_criterion(sent, sent_label)
        if is_multigpu:
            w_loss, s_loss = w_loss.mean(), s_loss.mean()
        loss = w_loss + s_loss
        loss.backward()
        optimizer.step()

        s_corrects = (torch.max(sent, 1)[1].data == sent_label.data).sum()

        print(
            f"[Step {step+1}/{args.steps}] "
            f"[word_loss: {w_loss:.5f}, sent_loss: {s_loss:.5f}, loss: {loss:.5f}, "
            f"w_pre: {w_corrects/tgt_sum*100:.2f}% {w_corrects}/{tgt_sum}, "
            f"s_pre: {float(s_corrects)/args.batch_size*100:.2f}% {s_corrects}/{args.batch_size}]"
        )

        if tf is not None:
            add_summary_value("Word loss", w_loss, step)
            add_summary_value("Sent loss", s_loss, step)
            add_summary_value("Loss", loss, step)
            add_summary_value("Word predict", w_corrects / tgt_sum, step)
            add_summary_value("Sent predict", float(s_corrects) / args.batch_size, step)
            tf_summary_writer.flush()
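
# `ScheduledOptim` is used above but not defined in this snippet. Below is a
# minimal sketch of what such a wrapper commonly looks like (the "Noam" warm-up
# schedule keyed on d_model and n_warmup_steps); the class name and body are
# assumptions for illustration, not the project's actual implementation.
class ScheduledOptimSketch:
    """Wrap an optimizer and scale its learning rate with a warm-up schedule."""

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self.optimizer = optimizer
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0

    def step(self):
        self._update_learning_rate()
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()

    def _update_learning_rate(self):
        # lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        self.n_current_steps += 1
        lr = (self.d_model ** -0.5) * min(
            self.n_current_steps ** -0.5,
            self.n_current_steps * self.n_warmup_steps ** -1.5)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

# Hypothetical usage:
#   optimizer = ScheduledOptimSketch(torch.optim.Adam(model.parameters()),
#                                    d_model=512, n_warmup_steps=4000)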
        preds = output.argmax(dim=1)
        for j in range(len(preds)):
            total += 1
            if preds[j] == target[j]:
                total_correct += 1
    return total_correct / total


if __name__ == '__main__':
    mnli = BERTMNLI(TRAIN_DATA_DIR, bert_type=BERT_TYPE)
    match = BERTMNLI(MATCH_DATA_DIR, bert_type=BERT_TYPE)
    mismatch = BERTMNLI(MISMATCH_DATA_DIR, bert_type=BERT_TYPE)

    checkpoint = torch.load('storage/bert-base-msrp.pt')

    model = BERT(bert_type=BERT_TYPE)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)

    ###
    optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    criterion = nn.CrossEntropyLoss()

    best_acc = 0
    for epoch in range(1, NUM_EPOCHS + 1):
        train_loss = train(mnli, model, criterion, optimizer, device)
        match_acc = eval(match, model, device)
        mismatch_acc = eval(mismatch, model, device)
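        # Hypothetical continuation (not part of the original snippet): keep the
        # checkpoint with the best matched accuracy so far, mirroring the dict
        # layout read by torch.load() above. The output path is an assumption.
        if match_acc > best_acc:
            best_acc = match_acc
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'match_acc': match_acc,
                'mismatch_acc': mismatch_acc,
            }, 'storage/bert-base-mnli-best.pt')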
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c", "--train_dataset", required=True, type=str,
                        help="train dataset for training BERT")
    parser.add_argument("-t", "--test_dataset", type=str, default=None,
                        help="test set for evaluating the trained model")
    parser.add_argument("-v", "--vocab_path", required=True, type=str,
                        help="path to the vocab model built with bert-vocab")
    parser.add_argument("-o", "--output_path", required=True, type=str,
                        help="ex) output/bert.model")
    parser.add_argument("-hs", "--hidden", type=int, default=256,
                        help="hidden size of transformer model")
    parser.add_argument("-l", "--layers", type=int, default=8,
                        help="number of layers")
    parser.add_argument("-a", "--attn_heads", type=int, default=8,
                        help="number of attention heads")
    parser.add_argument("-s", "--seq_len", type=int, default=512,
                        help="maximum sequence length")
    parser.add_argument("-b", "--batch_size", type=int, default=64,
                        help="batch size")
    parser.add_argument("-e", "--epochs", type=int, default=10,
                        help="number of epochs")
    parser.add_argument("-w", "--num_workers", type=int, default=8,
                        help="dataloader worker size")
    parser.add_argument("--with_cuda", type=bool, default=True,
                        help="training with CUDA: true or false")
    parser.add_argument("--log_freq", type=int, default=500,
                        help="print loss every n iterations")
    parser.add_argument("--corpus_lines", type=int, default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=None,
                        help="CUDA device ids")
    parser.add_argument("--on_memory", type=bool, default=True,
                        help="load the corpus into memory: true or false")
    parser.add_argument("--lr", type=float, default=1e-3,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999,
                        help="adam second beta value")
    parser.add_argument("-restore_model_path", "--restore_model_path", type=str, default=None,
                        help="trained model path")
    parser.add_argument("-restart_epoch", "--restart_epoch", type=int, default=0,
                        help="restart epoch")

    args = parser.parse_args()

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines, on_memory=args.on_memory)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len,
                               on_memory=args.on_memory) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Building BERT model")
    bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads)

    print("Creating BERT Trainer")
    # import pdb; pdb.set_trace()
    trainer = BERTTrainer(bert, len(vocab),
                          train_dataloader=train_data_loader, test_dataloader=test_data_loader,
                          lr=args.lr, betas=(args.adam_beta1, args.adam_beta2),
                          weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda, cuda_devices=args.cuda_devices,
                          log_freq=args.log_freq, restore_model_path=args.restore_model_path)

    print("Training Start")
    loss_epoch_train = {}
    loss_epoch_test = {}
    best_loss = 1e10
    for epoch in range(args.epochs + args.restart_epoch):
        loss_train = trainer.train(epoch)
        loss_epoch_train[epoch] = loss_train
        if loss_train < best_loss:
            best_loss = loss_train
            trainer.save(epoch, args.output_path)

        if test_data_loader is not None:
            loss_test = trainer.test(epoch)
            loss_epoch_test[epoch] = loss_test

    print('to get embeddings use bert_pytorch/trainer/extract_embeddings.py')
    # trainer.extract(0)
    print(loss_epoch_train, loss_epoch_test)
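
    # Hypothetical follow-up (not part of the original script): persist the
    # per-epoch loss curves next to the checkpoints instead of only printing
    # them. The file name below is an assumption.
    import json
    with open(args.output_path + ".losses.json", "w") as f:
        json.dump({"train": loss_epoch_train, "test": loss_epoch_test}, f, indent=2)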
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
bert_model = BertModel.from_pretrained("bert-base-uncased")
# tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1', do_lower_case=True)
# bert_model = AlbertModel.from_pretrained("albert-base-v1")

model = BERT(2, bert_model)
model = model.to(device)

train_dataloader, validation_dataloader, test_dataloader = get_baseline_dataloader(
    args.data_file, args.batch_size, tokenizer)

optimizer = AdamW(model.parameters(), lr=args.lr)
total_steps = len(train_dataloader) * args.epochs

if new_version:
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        # warmup_steps=0,  # default value in run_glue.py
        num_training_steps=total_steps)
        # t_total=total_steps)
else:
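    # The original snippet is truncated here; this branch presumably builds the
    # scheduler with the older pytorch-transformers API.

# Hedged sketch (an assumption, not the original code) of how the optimizer and
# scheduler built above are typically used together; the batch layout and the
# model's call signature below are hypothetical:
#
#   model.train()
#   for batch in train_dataloader:
#       input_ids, attention_mask, labels = (t.to(device) for t in batch)
#       optimizer.zero_grad()
#       logits = model(input_ids, attention_mask)      # custom BERT wrapper defined elsewhere
#       loss = torch.nn.functional.cross_entropy(logits, labels)
#       loss.backward()
#       torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#       optimizer.step()
#       scheduler.step()  # advance the linear warm-up/decay schedule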