def run_evaluation_bert(args, checkpoint, test_loader, vocab_size):
    device = args.device
    model = BERT().to(device)
    # model = nn.DataParallel(model)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    answer_file = open(args.result_path + '/answer.txt', "w")
    # For ensemble
    logit_file = open(args.result_path + '/logit.txt', "w")

    for i, batch in enumerate(test_loader):
        text, context = batch.text, batch.context
        text = text.type(torch.LongTensor).to(device)
        output = model.run_eval(text)
        pred = torch.argmax(output, 1).tolist()
        assert len(pred) == 1  # the test loader is expected to yield one example per batch
        if pred[0] == 1:
            label = 'SARCASM'
        elif pred[0] == 0:
            label = 'NOT_SARCASM'
        else:
            raise NotImplementedError("Strange pred.")
        answer_file.write("twitter_{},{}".format(i + 1, label))
        answer_file.write('\n')
        logit_file.write("{},{}".format(output[0][0], output[0][1]))
        logit_file.write("\n")

    answer_file.close()
    logit_file.close()
def __init__(self, lsz, args):
    super().__init__()
    self.bert = BERT(args)
    self.sent_predict = nn.Linear(args.d_model, lsz)
    # Initialize weights with mean 0 and std INIT_RANGE; passing INIT_RANGE
    # positionally would set the mean rather than the standard deviation.
    self.sent_predict.weight.data.normal_(mean=0.0, std=INIT_RANGE)
    self.sent_predict.bias.data.zero_()
def __init__(self, lsz, args):
    super().__init__()
    self.bert = BERT(args)
    self.sent_predict = nn.Linear(args.d_model, lsz)
    self.word_predict = nn.Linear(args.d_model, args.vsz)
    self.reset_parameters()
def main(args):
    train_loader, test_loader = load_data(args)
    if not os.path.isdir('checkpoints'):
        os.mkdir('checkpoints')
    args.vocab_len = len(args.vocab['stoi'].keys())

    model = BERT(args.vocab_len, args.max_len, args.heads, args.embedding_dim, args.N)
    if args.cuda:
        model = model.cuda()

    if args.task:
        print('Start Down Stream Task')
        args.epochs = 3
        args.lr = 3e-5
        state_dict = torch.load(args.checkpoints)
        model.load_state_dict(state_dict['model_state_dict'])
        criterion = {'mlm': None, 'nsp': nn.CrossEntropyLoss()}
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
        for epoch in range(1, args.epochs + 1):
            train_mlm_loss, train_nsp_loss, train_loss, train_mlm_acc, train_nsp_acc = _train(
                epoch, train_loader, model, optimizer, criterion, args)
            test_mlm_loss, test_nsp_loss, test_loss, test_mlm_acc, test_nsp_acc = _eval(
                epoch, test_loader, model, criterion, args)
            save_checkpoint(model, optimizer, args, epoch)
    else:
        print('Start Pre-training')
        criterion = {
            'mlm': nn.CrossEntropyLoss(ignore_index=0),
            'nsp': nn.CrossEntropyLoss()
        }
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
        # Match the fine-tuning branch and run the full number of epochs
        # (the original used range(1, args.epochs) and dropped the last one).
        for epoch in range(1, args.epochs + 1):
            train_mlm_loss, train_nsp_loss, train_loss, train_mlm_acc, train_nsp_acc = _train(
                epoch, train_loader, model, optimizer, criterion, args)
            test_mlm_loss, test_nsp_loss, test_loss, test_mlm_acc, test_nsp_acc = _eval(
                epoch, test_loader, model, criterion, args)
            save_checkpoint(model, optimizer, args, epoch)
def main():
    random.seed(rdn_seed)
    np.random.seed(rdn_seed)
    torch.manual_seed(rdn_seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device: {}, n_gpu: {}".format(device, n_gpu))
    # Compare the device type, not the torch.device object, to the string "cuda".
    if device.type == "cuda":
        torch.cuda.manual_seed_all(rdn_seed)

    tokenizer = BehaviorsBERTTokenizer(vocab_file)
    print("Vocab size:", tokenizer.vocab_size)

    train_dataset = BERTDataset(corpus_path, tokenizer, max_seq_len, corpus_lines=corpus_lines)
    # Guard against n_gpu == 0 on CPU-only machines, which would give batch_size 0.
    batch_size = per_gpu_batch_size * max(n_gpu, 1)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

    bert = BERT(vocab_size=tokenizer.vocab_size, hidden=hidden, n_layers=layers,
                attn_heads=attn_heads, max_seq_len=max_seq_len)
    trainer = BERTTrainer(bert, tokenizer.vocab_size, epochs,
                          tensorboard_log_dir=tensorboard_log_dir,
                          output_path=output_path,
                          train_dataloader=train_dataloader,
                          with_cuda=torch.cuda.is_available(),
                          log_freq=100, save_steps=100000)
    trainer.train()
def run_training_bert(args, dataset, train_loader, val_loader, vocab_size):
    checkpoint_path = os.path.join(args.checkpoint_path, args.checkpoint)
    device = torch.device("cuda:" + args.device if torch.cuda.is_available() else "cpu")

    model = BERT().to(device)
    # criterion = nn.BCEWithLogitsLoss()  # unused: the model returns its own loss
    # Set up the Adam optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5)

    model.train()  # turn on training mode

    # Training loop
    print("Starting Training Loop...")
    for epoch in range(args.epochs):
        losses = []
        running_corrects = 0
        # For each batch in the dataloader
        for i, batch in enumerate(train_loader):
            # format batch
            text, context, label = batch.text, batch.context, batch.label
            # print(text.tolist()[0])
            # print(label.tolist()[0])
            label = label.type(torch.LongTensor).to(device)
            text = text.type(torch.LongTensor).to(device)

            output = model(text, label)
            loss, _ = output

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

        epoch_loss = sum(losses) / len(losses)
        print('Epoch: {}, Training Loss: {:.4f}'.format(epoch, epoch_loss))

        # save model
        if epoch % 1 == 0 or epoch == args.epochs - 1:
            torch.save(
                {
                    'epoch': epoch + 1,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'vocab_size': vocab_size,
                    'args': vars(args)
                }, checkpoint_path)

        if args.eval:
            model.eval()
            with torch.no_grad():
                preds = []
                labels = []
                eval_losses = []
                for i, batch in enumerate(val_loader if val_loader is not None else train_loader):
                    text, context, label = batch.text, batch.context, batch.label
                    label = label.type(torch.LongTensor).to(device)
                    text = text.type(torch.LongTensor).to(device)

                    output = model(text, label)
                    loss, output = output
                    pred = torch.argmax(output, 1).tolist()
                    preds.extend(pred)
                    labels.extend(label.tolist())
                    eval_losses.append(loss.item())

            print("{} Precision: {}, Recall: {}, F1: {}, Loss: {}".format(
                "Train" if val_loader is None else "Valid",
                sklearn.metrics.precision_score(np.array(labels).astype('int32'), np.array(preds)),
                sklearn.metrics.recall_score(np.array(labels).astype('int32'), np.array(preds)),
                sklearn.metrics.f1_score(np.array(labels).astype('int32'), np.array(preds)),
                np.average(eval_losses)))
            model.train()  # switch back to training mode for the next epoch
                            seq_len=args.seq_len,
                            corpus_lines=args.corpus_lines,
                            on_memory=args.on_memory)

print("Creating Dataloader")
train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                               num_workers=args.num_workers)

print("Reading Word Vectors")
weights_matrix = ReadWordVec(args.emb_path, args.emb_filename, args.emb_dim)

print("Building Model")
bert = BERT(len(vocab), weights_matrix, hidden=args.emb_dim,
            n_layers=args.layers, attn_heads=args.attn_heads)

print("Creating Trainer")
trainer = BERTTrainer(bert, len(vocab), args.seq_len,
                      train_dataloader=train_data_loader,
                      lr=args.lr, betas=(args.adam_beta1, args.adam_beta2),
                      weight_decay=args.adam_weight_decay,
                      with_cuda=args.with_cuda, cuda_devices=args.cuda_devices,
                      log_freq=args.log_freq)
def train(): parser = argparse.ArgumentParser() parser.add_argument("-c", "--train_dataset", required=True, type=str, help="train dataset for train bert") parser.add_argument("-t", "--test_dataset", type=str, default=None, help="test set for evaluate train set") parser.add_argument("-v", "--vocab_path", required=True, type=str, help="built vocab model path with bert-vocab") parser.add_argument("-o", "--output_path", required=True, type=str, help="ex)output/bert.model") parser.add_argument("-hs", "--hidden", type=int, default=256, help="hidden size of transformer model") parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers") parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads") parser.add_argument("-s", "--seq_len", type=int, default=20, help="maximum sequence len") parser.add_argument("-b", "--batch_size", type=int, default=64, help="number of batch_size") parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs") parser.add_argument("-w", "--num_workers", type=int, default=5, help="dataloader worker size") parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false") parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n") parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus") parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids") parser.add_argument("--on_memory", type=bool, default=True, help="Loading on memory: true or false") parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam") parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam") parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value") parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam first beta value") args = parser.parse_args() print("Loading Vocab", args.vocab_path) vocab = WordVocab.load_vocab(args.vocab_path) print("Vocab Size: ", len(vocab)) print("Loading Train Dataset", args.train_dataset) train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len, corpus_lines=args.corpus_lines, on_memory=args.on_memory) print("Loading Test Dataset", args.test_dataset) test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len, on_memory=args.on_memory) \ if args.test_dataset is not None else None print("Creating Dataloader") train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers) test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \ if test_dataset is not None else None print("Building BERT model") bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads) print("Creating BERT Trainer") trainer = BERTTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader, lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay, with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq) print("Training Start") for epoch in range(args.epochs): print("eee") trainer.train(epoch) trainer.save(epoch, args.output_path) if test_data_loader is not None: trainer.test(epoch)
    # TODO: Load dataset
    train_dataset = MyDataset(args.train_file)
    word2vec = train_dataset.symbol2id
    test_dataset = MyDataset(args.test_file,
                             vocab=(train_dataset.symbol2id, train_dataset.id2sybmol))
    test_loader = DataLoader(test_dataset, batch_size=hyperparams['batch_size'])

    num_tokens = len(train_dataset.id2sybmol)
    print('num tokens', num_tokens)
    print('size', test_dataset.seq.size())

    model = BERT(num_tokens).to(device)
    if args.load:
        # print("Model's state_dict:")
        # for param_tensor in torch.load('./model.pt', map_location=torch.device(device)):
        #     print(param_tensor, "\t", model.state_dict()[param_tensor].size())
        model.load_state_dict(
            torch.load('./model.pt', map_location=torch.device(device)))
    if args.train:
        train(model, train_dataset, experiment, hyperparams)
    if args.save:
        torch.save(model.state_dict(), './model.pt')
    if args.test:
        test(model, test_loader, experiment, hyperparams)
    if args.analysis:
        embedding_analysis(model, experiment, train_dataset, test_dataset)
def train(): parser = argparse.ArgumentParser() parser.add_argument("-c", "--dataset", required=True, type=str, help="dataset") # parser.add_argument("-c", "--train_dataset", required=True, # type=str, help="train dataset for train bert") # parser.add_argument("-t", "--test_dataset", type=str, # default=None, help="test set for evaluate train set") # parser.add_argument("-v", "--vocab_path", required=True, # type=str, help="built vocab model path with bert-vocab") parser.add_argument("-o", "--output_path", required=True, type=str, help="ex)output/bert.model") parser.add_argument("-hs", "--hidden", type=int, default=256, help="hidden size of transformer model") parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers") parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads") parser.add_argument("-s", "--seq_len", type=int, default=64, help="maximum sequence len") parser.add_argument("-b", "--batch_size", type=int, default=64, help="number of batch_size") parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs") parser.add_argument("-w", "--num_workers", type=int, default=5, help="dataloader worker size") parser.add_argument("--duplicate", type=int, default=5, help="dataloader worker size") parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false") parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n") parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus") parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids") parser.add_argument("--on_memory", type=bool, default=True, help="Loading on memory: true or false") parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam") parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam") parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value") parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam first beta value") parser.add_argument("--dropout", type=float, default=0.2, help="dropout value") args = parser.parse_args() print("Load Data", args.dataset) data_reader = DataReader(args.dataset, seq_len=args.seq_len) neg_data_reader = DataReader(args.dataset, graphs=data_reader.graphs, shuffle=True, duplicate=args.duplicate, seq_len=args.seq_len) # print("Loading Vocab", args.vocab_path) print("Loading Vocab") vocab = Vocab(data_reader.graphs) # vocab = WordVocab.load_vocab(args.vocab_path) print("Vocab Size: ", len(vocab)) print("Shuffle Data") 'TODO' print("Loading Train Dataset", args.dataset) train_dataset = CustomBERTDataset( data_reader.graphs[:int(len(data_reader) * 0.8)], vocab, seq_len=args.seq_len, on_memory=args.on_memory, n_neg=args.duplicate) # pdb.set_trace() neg_train_dataset = CustomBERTDataset( neg_data_reader.graphs[:args.duplicate * len(train_dataset)], vocab, seq_len=args.seq_len, on_memory=args.on_memory, n_neg=args.duplicate) # pdb.set_trace() assert len(neg_train_dataset) == args.duplicate * len(train_dataset) # print("Loading Test Dataset", args.test_dataset) print("Loading Dev Dataset", args.dataset) test_dataset = CustomBERTDataset( data_reader.graphs[int(len(data_reader) * 0.8):], vocab, seq_len=args.seq_len, on_memory=args.on_memory, n_neg=args.duplicate) # \ neg_test_dataset = CustomBERTDataset( neg_data_reader.graphs[-args.duplicate * 
len(test_dataset):], vocab, seq_len=args.seq_len, on_memory=args.on_memory, n_neg=args.duplicate) # \ assert len(neg_test_dataset) == args.duplicate * len(test_dataset) # if args.test_dataset is not None else None # pdb.set_trace() print("Creating Dataloader") train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=my_collate) neg_train_data_loader = DataLoader(neg_train_dataset, batch_size=args.batch_size * args.duplicate, num_workers=args.num_workers, collate_fn=my_collate) test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=my_collate) # \ neg_test_data_loader = DataLoader(neg_test_dataset, batch_size=args.batch_size * args.duplicate, num_workers=args.num_workers, collate_fn=my_collate) # \ # if test_dataset is not None else None # assert False print("Building BERT model") bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads, dropout=args.dropout) print("Creating BERT Trainer") # trainer = BERTTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader, # lr=args.lr, betas=( # args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay, # with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq, pad_index=vocab.pad_index) trainer = BERTTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader, lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay, with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq, pad_index=vocab.pad_index) # raise NotImplementedError print("Training Start") best_loss = None for epoch in range(args.epochs): # test_loss = trainer.test(epoch) train_loss = trainer.train(epoch) torch.cuda.empty_cache() # if test_data_loader is not None: test_loss = trainer.test(epoch) if best_loss is None or test_loss < best_loss: best_loss = test_loss trainer.save(epoch, args.output_path) torch.cuda.empty_cache()
        preds = output.argmax(dim=1)
        for j in range(len(preds)):
            total += 1
            if preds[j] == target[j]:
                total_correct += 1
    return total_correct / total


if __name__ == '__main__':
    mnli = BERTMNLI(TRAIN_DATA_DIR, bert_type=BERT_TYPE)
    match = BERTMNLI(MATCH_DATA_DIR, bert_type=BERT_TYPE)
    mismatch = BERTMNLI(MISMATCH_DATA_DIR, bert_type=BERT_TYPE)

    checkpoint = torch.load('storage/bert-base-dnli.pt')
    model = BERT(bert_type=BERT_TYPE)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)

    optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    criterion = nn.CrossEntropyLoss()

    best_acc = 0
    for epoch in range(1, NUM_EPOCHS + 1):
        train_loss = train(mnli, model, criterion, optimizer, device)
        match_acc = eval(match, model, device)
        mismatch_acc = eval(mismatch, model, device)
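        # Hedged sketch (an assumption, not taken from this source): the loop is cut
        # off above; a typical way to finish it is to checkpoint whenever the matched
        # accuracy improves. The output path below is hypothetical.
        if match_acc > best_acc:
            best_acc = match_acc
            torch.save({'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict()},
                       'storage/bert-base-mnli-best.pt')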
word_list = list(set(" ".join(sentences).split()))
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
# Build the vocabulary: real tokens get ids after the four special tokens.
for i, w in enumerate(word_list):
    word_dict[w] = i + 4
number_dict = {i: w for i, w in enumerate(word_dict)}
vocab_size = len(word_dict)

# Convert each sentence into its sequence of token ids.
token_list = list()
for sentence in sentences:
    arr = [word_dict[s] for s in sentence.split()]
    token_list.append(arr)

model = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

batch = make_batch()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(
    torch.LongTensor, zip(*batch))

for epoch in range(100):
    optimizer.zero_grad()
    logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens)  # for masked LM
    loss_lm = (loss_lm.float()).mean()
    loss_clsf = criterion(logits_clsf, isNext)  # for sentence classification
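    # Hedged completion (an assumption, not part of this source): the loop above is
    # cut off after the two losses; a standard BERT pre-training step sums them,
    # backpropagates, and updates the parameters.
    loss = loss_lm + loss_clsf
    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))
    loss.backward()
    optimizer.step()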
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    bert_model = BertModel.from_pretrained("bert-base-uncased")
    # tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1', do_lower_case=True)
    # bert_model = AlbertModel.from_pretrained("albert-base-v1")

    model = BERT(2, bert_model)
    model = model.to(device)

    train_dataloader, validation_dataloader, test_dataloader = get_baseline_dataloader(
        args.data_file, args.batch_size, tokenizer)

    optimizer = AdamW(model.parameters(), lr=args.lr)
    total_steps = len(train_dataloader) * args.epochs
    if new_version:
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(0.1 * total_steps),  # warmup_steps = 0 is the default in run_glue.py
            num_training_steps=total_steps)  # t_total = total_steps in the older API
    else:
def train(): parser = argparse.ArgumentParser() parser.add_argument("-c", "--dataset", required=True, type=str, help="dataset") parser.add_argument("-o", "--output_path", required=True, type=str, help="ex)output/bert.model") parser.add_argument("-t", "--test_path", required=True, type=str, help="ex)output/bert.model") parser.add_argument("-hs", "--hidden", type=int, default=256, help="hidden size of transformer model") parser.add_argument("-me", "--markdown_emb_size", type=int, default=256, help="hidden size of transformer model") parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers") parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads") parser.add_argument("-s", "--seq_len", type=int, default=64, help="maximum sequence len") parser.add_argument("-b", "--batch_size", type=int, default=64, help="number of batch_size") parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs") parser.add_argument("-w", "--num_workers", type=int, default=5, help="dataloader worker size") parser.add_argument("--duplicate", type=int, default=5, help="dataloader worker size") parser.add_argument("--model_path", type=str, help="ex)output/bert.model") parser.add_argument("--hinge_loss_start_point", type=int, default=20) parser.add_argument("--entropy_start_point", type=int, default=30) parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false") parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n") parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus") parser.add_argument("--cuda_devices", type=str, default='0', help="CUDA device ids") parser.add_argument("--max_graph_num", type=int, default=3000000, help="printing loss every n iter: setting n") parser.add_argument("--on_memory", type=bool, default=True, help="Loading on memory: true or false") parser.add_argument("--n_topics", type=int, default=50) parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam") parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam") parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value") parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam first beta value") parser.add_argument("--dropout", type=float, default=0.2, help="dropout value") parser.add_argument("--weak_supervise", action="store_true") parser.add_argument( "--neighbor", action="store_true", help="force topic distribution over neighbor nodes to be close") parser.add_argument("--min_occur", type=int, default=3, help="minimum of occurrence") parser.add_argument("--use_sub_token", action="store_true") parser.add_argument("--context", action="store_true", help="use information from neighbor cells") parser.add_argument("--markdown", action="store_true", help="use markdown") args = parser.parse_args() os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices print("Load Data", args.dataset) data_reader = DataReader(args.dataset, use_sub_token=args.use_sub_token, max_graph_num=args.max_graph_num) labeled_data_reader = DataReader(args.test_path, use_sub_token=args.use_sub_token) print("Loading Vocab") if args.markdown: vocab = UnitedVocab(data_reader.graphs, min_occur=args.min_occur, use_sub_token=args.use_sub_token) else: vocab = SNAPVocab(data_reader.graphs, min_occur=args.min_occur, use_sub_token=args.use_sub_token) print("Vocab 
Size: ", len(vocab)) print("Loading Train Dataset", args.dataset) train_dataset = SNAPDataset( data_reader.graphs[:int(len(data_reader) * 0.8)], vocab, seq_len=args.seq_len, on_memory=args.on_memory, n_neg=args.duplicate, use_sub_token=args.use_sub_token, n_topics=args.n_topics, markdown=args.markdown) print(len(train_dataset)) # print("Loading Test Dataset", args.test_dataset) print("Loading Dev Dataset", args.dataset) test_dataset = SNAPDataset(data_reader.graphs[int(len(data_reader) * 0.8):], vocab, seq_len=args.seq_len, on_memory=args.on_memory, n_neg=args.duplicate, use_sub_token=args.use_sub_token, n_topics=args.n_topics, markdown=args.markdown) # \ print(len(test_dataset)) labeled_dataset = SNAPDataset(labeled_data_reader.graphs, vocab, seq_len=args.seq_len, on_memory=args.on_memory, use_sub_token=args.use_sub_token, markdown=args.markdown) # if args.test_dataset is not None else None # pdb.set_trace() print("Creating Dataloader") train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=temp_collate) test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=temp_collate) # \ labeled_data_loader = DataLoader(labeled_dataset, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=temp_collate) # if test_dataset is not None else None # assert False print("Building BERT model") bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads, dropout=args.dropout) print("Creating BERT Trainer") trainer = TempTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader, lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay, with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq, pad_index=vocab.pad_index, model_path=args.model_path, weak_supervise=args.weak_supervise, context=args.context, markdown=args.markdown, hinge_loss_start_point=args.hinge_loss_start_point, entropy_start_point=args.entropy_start_point) # raise NotImplementedError print("Training Start") output_folder = args.output_path.split('.')[0] if not os.path.exists(output_folder): os.mkdir(output_folder) with open(os.path.join(output_folder, './setting.json'), 'w') as fout: json.dump(args.__dict__, fout, ensure_ascii=False, indent=2) # pdb.set_trace() best_loss = None for epoch in range(args.epochs): train_loss = trainer.train(epoch) # if test_data_loader is not None: test_loss = trainer.test(epoch) trainer.save(epoch, os.path.join(output_folder, args.output_path)) stages, stage_vecs = trainer.api(labeled_data_loader) correct = 0 zero_cells = 0 for g in labeled_dataset.graphs: if int(g["stage"]) == 0: zero_cells += 1 # print(zero_cells) for i, g in enumerate(labeled_dataset.graphs): if stages[i] == int(g["stage"]) and int(g["stage"]) != 0: correct += 1 else: pass accuracy = correct / (len(stages) - zero_cells) # print(accuracy) with open(os.path.join(output_folder, './results.txt'), 'a') as fout: json.dump({ "epoch": epoch, "accuracy": accuracy, "loss": test_loss }, fout) fout.write('\n') with open(os.path.join(output_folder, 'graphs_{}.txt'.format(epoch)), 'w') as fout: for i, g in enumerate(labeled_dataset.graphs): g["pred"] = stages[i] g["stage_vec"] = stage_vecs[i] fout.write(json.dumps(g)) fout.write('\n') torch.cuda.empty_cache()
    # neg_train_data_loader = DataLoader(neg_train_dataset, batch_size=args.batch_size * args.duplicate,
    #                                    num_workers=args.num_workers, collate_fn=my_collate)
    # test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size,
    #                               num_workers=args.num_workers, collate_fn=my_collate)
    # neg_test_data_loader = DataLoader(neg_test_dataset, batch_size=args.batch_size * args.duplicate,
    #                                   num_workers=args.num_workers, collate_fn=my_collate)
    # labeled_data_loader = DataLoader(labeled_dataset, batch_size=args.batch_size,
    #                                  num_workers=args.num_workers, collate_fn=my_collate)

    print("Building BERT model")
    bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers,
                attn_heads=args.attn_heads)

    print("Creating BERT Trainer")
    temp_data_loader = DataLoader(temp_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers, collate_fn=my_collate)
    trainer = ReconstructionBERTTrainer(bert, len(vocab), len(markdown_vocab),
                                        args.markdown_emb_size,
                       format='csv',
                       skip_header=True,
                       fields=rev_field)
novel = TabularDataset(path=data_novelty_csv_path,
                       format='csv',
                       skip_header=True,
                       fields=nov_field)

review_iter = Iterator(review, batch_size=1, device=device, sort=False,
                       sort_within_batch=False, repeat=False, shuffle=False)
novel_iter = Iterator(novel, batch_size=1, device=device, sort=False,
                      sort_within_batch=False, repeat=False, shuffle=False)

model = BERT(feature_len).to(device)

print("Computing deep features...")
review_features = []
for x in tqdm(review_iter):
    text = x.comment_text.type(torch.LongTensor)
    text = text.to(device)
    feature = model(text)
    review_features.append(feature.detach().cpu().numpy())
review_features = np.vstack(review_features)
print(review_features.shape)

novel_features = []
for x in tqdm(novel_iter):
    text = x.novel.type(torch.LongTensor)
def get_models():
    from model import BERT
    return {'BERT': BERT()}
def main(args):
    assert torch.cuda.is_available(), "need to use GPUs"

    use_cuda = torch.cuda.is_available()
    cuda_devices = list(map(int, args.cuda_devices.split(",")))
    is_multigpu = len(cuda_devices) > 1
    device = "cuda"

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    if is_multigpu:
        torch.cuda.manual_seed_all(args.seed)

    data = torch.load(args.data)
    dataset = BERTDataSet(data['word'], data['max_len'], data["dict"],
                          args.batch_size * args.steps)
    training_data = DataLoader(dataset, batch_size=args.batch_size,
                               num_workers=args.num_cpus)

    model = BERT(dataset.word_size, data["max_len"], args.n_stack_layers,
                 args.d_model, args.d_ff, args.n_head, args.dropout)
    print(f"BERT has {sum(x.numel() for x in model.parameters())} parameters in total")

    # DataParallel wraps the model (below), not the optimizer.
    optimizer = ScheduledOptim(
        torch.optim.Adam(model.get_trainable_parameters(), lr=args.lr,
                         betas=(0.9, 0.999), eps=1e-09, weight_decay=0.01),
        args.d_model, args.n_warmup_steps)

    w_criterion = WordCrossEntropy()
    w_criterion = w_criterion.to(device)
    s_criterion = torch.nn.CrossEntropyLoss()

    model = model.to(device)
    model = torch.nn.DataParallel(model, device_ids=cuda_devices)
    model.train()

    for step, datas in enumerate(training_data):
        inp, pos, sent_label, word_label, segment_label = list(
            map(lambda x: x.to(device), datas))
        sent_label = sent_label.view(-1)

        optimizer.zero_grad()
        word, sent = model(inp, pos, segment_label)
        w_loss, w_corrects, tgt_sum = w_criterion(word, word_label)
        s_loss = s_criterion(sent, sent_label)
        if is_multigpu:
            w_loss, s_loss = w_loss.mean(), s_loss.mean()
        loss = w_loss + s_loss
        loss.backward()
        optimizer.step()

        s_corrects = (torch.max(sent, 1)[1].data == sent_label.data).sum()
        print(
            f"[Step {step+1}/{args.steps}] [word_loss: {w_loss:.5f}, sent_loss: {s_loss:.5f}, "
            f"loss: {loss:.5f}, w_pre: {w_corrects/tgt_sum*100:.2f}% {w_corrects}/{tgt_sum}, "
            f"s_pre: {float(s_corrects)/args.batch_size*100:.2f}% {s_corrects}/{args.batch_size}]"
        )
        if tf is not None:
            add_summary_value("Word loss", w_loss, step)
            add_summary_value("Sent loss", s_loss, step)
            add_summary_value("Loss", loss, step)
            add_summary_value("Word predict", w_corrects / tgt_sum, step)
            add_summary_value("Sent predict", float(s_corrects) / args.batch_size, step)
            tf_summary_writer.flush()