def cnn_embedding(args, h, data_file):
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    train_loader = TripletString(h.xt, h.nt, h.train_knn, h.train_dist, K=args.k)

    model_file = "{}/model.torch".format(data_file)
    if os.path.isfile(model_file):
        model = torch.load(model_file)
    else:
        start_time = time.time()
        model = train_epoch(args, train_loader, device)
        if args.save_model:
            torch.save(model, model_file)
        train_time = time.time() - start_time
        print("# Training time: " + str(train_time))

    model.eval()
    xt = _batch_embed(args, model.embedding_net, h.xt, device)
    start_time = time.time()
    xb = _batch_embed(args, model.embedding_net, h.xb, device)
    embed_time = time.time() - start_time
    xq = _batch_embed(args, model.embedding_net, h.xq, device)
    print("# Embedding time: " + str(embed_time))

    if args.save_embed:
        if args.embed_dir != "":
            args.embed_dir = args.embed_dir + "/"
        os.makedirs("{}/{}".format(data_file, args.embed_dir), exist_ok=True)
        np.save("{}/{}embedding_xb".format(data_file, args.embed_dir), xb)
        np.save("{}/{}embedding_xt".format(data_file, args.embed_dir), xt)
        np.save("{}/{}embedding_xq".format(data_file, args.embed_dir), xq)
    if args.recall:
        test_recall(xb, xq, h.query_knn)
    return xq, xb, xt
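
# `_batch_embed` is defined elsewhere in this project. A minimal sketch of what
# such a batched embedding pass might look like, assuming the embedding net maps
# a batch of encoded strings to fixed-size vectors (the helper name and batch
# size here are assumptions, not the original implementation):
def _batch_embed_sketch(net, items, device, batch_size=512):
    import numpy as np
    import torch

    out = []
    net.eval()
    with torch.no_grad():
        for i in range(0, len(items), batch_size):
            batch = torch.as_tensor(items[i:i + batch_size]).to(device)
            out.append(net(batch).cpu().numpy())
    return np.concatenate(out, axis=0)
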
def main():
    transforms_args = [
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]
    train_dataset = CoCoDataset(
        args.coco_path, "training", target_size=args.target_size,
        transform=transforms.Compose(
            transforms_args +
            [RandomErasing(probability=args.p, sh=args.sh, r1=args.r1)]))
    test_dataset = CoCoDataset(
        args.coco_path, "validation_wo_occlusion",
        target_size=args.target_size,
        transform=transforms.Compose(transforms_args))

    train_batch_sampler = TrainBalancedBatchSampler(
        torch.from_numpy(np.array(train_dataset.all_targets())),
        K=args.K, P=args.P, n_batches=args.n_batches)
    test_batch_sampler = TestBalancedBatchSampler(
        torch.from_numpy(np.array(test_dataset.all_targets())),
        K=args.K, P=args.P, n_batches=args.n_batches)
    train_loader = DataLoader(train_dataset, batch_sampler=train_batch_sampler, **kwargs)
    test_loader = DataLoader(test_dataset, batch_sampler=test_batch_sampler, **kwargs)

    # init model
    model, optim_state_dict, init_epoch = load_model(
        args.backbone, args.snapshot,
        imagenet_weights=args.imagenet_weights, freeze=args.freeze)
    print("Resuming training from epoch", init_epoch)
    if cuda:
        model.cuda()

    # init optimizer
    if args.optim == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)
    elif args.optim == "SGD":
        optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=args.lr,
                              weight_decay=1e-4)
    else:
        raise ValueError("Optimizer %s is not supported" % args.optim)
    if optim_state_dict is not None:
        optimizer.load_state_dict(optim_state_dict)

    # define loss function
    if args.triplet_selector == "hard":
        selector = HardestNegativeTripletSelector(args.soft_margin)
    elif args.triplet_selector == "semi":
        selector = SemihardNegativeTripletSelector(args.soft_margin)
    elif args.triplet_selector == "random":
        selector = RandomNegativeTripletSelector(args.soft_margin)
    else:
        selector = AllTripletSelector()
    train_loss_fn = TripletLoss(selector, soft_margin=args.soft_margin)
    test_loss_fn = TripletLoss(AllTripletSelector(), soft_margin=args.soft_margin)

    # define learning rate scheduler
    lr_scheduler = LrScheduler(args.epoch_decay_start, args.n_epoch, args.lr)

    log_file = os.path.join(args.logger_dir,
                            '%s_%s.csv' % (args.backbone, args.triplet_selector))

    for epoch in range(init_epoch + 1, args.n_epoch):
        lr_scheduler.adjust_learning_rate(optimizer, epoch, args.optim)
        for param_group in optimizer.param_groups:
            print("LR:", param_group['lr'])

        train_loss = train_epoch(model, train_loader, train_loss_fn, optimizer, cuda)

        if epoch % args.eval_freq == 0:
            test_loss = test_epoch(model, test_loader, test_loss_fn, cuda)
            print('Epoch [%d/%d], Train loss: %.4f, Test loss: %.4f'
                  % (epoch, args.n_epoch, train_loss, test_loss))

            # append to the CSV log, writing the header only when the file is new
            write_header = not os.path.isfile(log_file)
            with open(log_file, mode='a', newline='') as csv_f:
                writer = csv.writer(csv_f)
                if write_header:
                    writer.writerow(["epoch", "train_loss", "test_loss"])
                writer.writerow([epoch, train_loss, test_loss])

        if epoch % args.save_freq == 0:
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'epoch': epoch
                },
                os.path.join(args.snapshot_path,
                             '%s_%s_%d.pth' % (args.backbone, args.triplet_selector, epoch)))
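
# `TrainBalancedBatchSampler` is imported from project code. A minimal sketch of
# the underlying P*K batching idea (each batch holds P classes with K samples
# each), assuming integer class targets; this is an illustration, not the
# project's sampler:
import numpy as np

class PKBatchSamplerSketch:
    def __init__(self, targets, P, K, n_batches):
        self.targets = np.asarray(targets)
        self.classes = np.unique(self.targets)
        self.P, self.K, self.n_batches = P, K, n_batches

    def __iter__(self):
        for _ in range(self.n_batches):
            batch = []
            # pick P distinct classes, then K samples from each
            for c in np.random.choice(self.classes, self.P, replace=False):
                idx = np.where(self.targets == c)[0]
                batch.extend(int(i) for i in
                             np.random.choice(idx, self.K, replace=len(idx) < self.K))
            yield batch

    def __len__(self):
        return self.n_batches
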
val_labels = torch.tensor(val_labels).to(torch.int64)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)

optimizer = AdamW(model.parameters(), lr=config.optimizer.kwargs.lr, eps=1e-8)
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([1, 1]).cuda())
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=total_steps)

max_f1 = -1
for epoch in range(EPOCHS):
    train_loss, train_acc, train_f1 = train_epoch(model, train_dataloader,
                                                  optimizer, criterion, scheduler)
    val_loss, val_acc, val_f1 = val_epoch(model, val_dataloader,
                                          optimizer, criterion, scheduler)
    print(f'Epoch: {epoch + 1:02}, Train Loss: {train_loss:.3f}, '
          f'Train Acc: {train_acc:.3f}, Train F1: {train_f1:.3f}, '
          f'Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.3f}, '
          f'Val. F1: {val_f1:.3f}')
    if val_f1 > max_f1:
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(saved_dir)
        tokenizer.save_pretrained(saved_dir)
        max_f1 = val_f1
        print('model saved!')
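
# `train_epoch` / `val_epoch` are defined elsewhere. A minimal sketch of a
# compatible `train_epoch`, assuming each batch is (input_ids, attention_mask,
# labels), the model returns an object with a `.logits` field, labels are
# one-hot over two classes, and F1 comes from scikit-learn (all of these are
# assumptions about the original code):
from sklearn.metrics import accuracy_score, f1_score

def train_epoch_sketch(model, dataloader, optimizer, criterion, scheduler):
    model.train()
    total_loss, preds, golds = 0.0, [], []
    for input_ids, attention_mask, labels in dataloader:
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = criterion(logits, labels.float())
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        preds.extend(logits.argmax(dim=-1).tolist())
        golds.extend(labels.argmax(dim=-1).tolist())
    return (total_loss / len(dataloader),
            accuracy_score(golds, preds),
            f1_score(golds, preds, average='macro'))
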
def run_training(model, train_dev_data_raw, optimizer, vocab, opt, device):
    ckpt_path = misc_utils.EXP_DIR + opt.exp_name + "/"
    if not os.path.exists(ckpt_path):
        os.mkdir(ckpt_path)
    elif os.listdir(ckpt_path) and not opt.debug:
        raise ValueError(
            "Output directory ({}) already exists and is not empty!".format(ckpt_path))

    with open(ckpt_path + "config.json", 'w') as f:
        json.dump(vars(opt), f)
    fout_log = open(ckpt_path + "training.log", 'w')
    tb_writer = SummaryWriter(os.path.join(ckpt_path, "tensorboard"))

    train_data = TASK_CONFIG[opt.task][1](set_type="train")
    train_data.load_data(raw_data=train_dev_data_raw["train"], opt=opt, vocab=vocab)
    train_data_sampler = DataSampler(dataset=train_data, sequential=False,
                                     opt=opt, device=device)

    dev_data = TASK_CONFIG[opt.task][1](set_type="dev")
    dev_data.load_data(raw_data=train_dev_data_raw["dev"], opt=opt, vocab=vocab)
    dev_data_sampler = DataSampler(dataset=dev_data, sequential=True,
                                   opt=opt, device=device)

    model.eval()
    with torch.no_grad():
        avg_losses, avg_val_ppl, cs_acc, st_acc = valid_epoch(
            model, dev_data_sampler, opt, device)
    logging.info("--------------- BEFORE TRAINING ---------------")
    logging.info("Validation Loss: {:.3f}\tValidation Perplexity: {:.3f}".format(
        avg_losses["total"], avg_val_ppl))
    if opt.task == "absgen":
        logging.info("Keyphrase selection accuracy: {:.2f}".format(cs_acc * 100))
        fout_log.write(
            "epoch: -1\ttrain_loss: --\tval_loss: {:.3f}\tval_ppl: {:.3f}"
            "\tkp_selection_acc: {:.4f}\n".format(avg_losses["total"],
                                                  avg_val_ppl, cs_acc))
    else:
        logging.info(
            "Keyphrase selection accuracy: {:.2f}\tSentence type accuracy: {:.2f}"
            .format(cs_acc * 100, st_acc * 100))
        fout_log.write(
            "epoch: -1\ttrain_loss: --\tval_loss: {:.3f}\tval_ppl: {:.3f}"
            "\tkp_selection_acc: {:.4f}\tstype_acc: {:.4f}\n".format(
                avg_losses["total"], avg_val_ppl, cs_acc, st_acc))
    fout_log.flush()

    for n_epoch in range(1, opt.num_train_epochs + 1):
        logging.info("--------------- STARTING EPOCH %d ---------------" % n_epoch)
        model.train()
        avg_train_losses = train_epoch(model, train_data_sampler, opt,
                                       optimizer, device)

        with torch.no_grad():
            model.eval()
            avg_losses, avg_val_ppl, cs_acc, st_acc = valid_epoch(
                model, dev_data_sampler, opt, device)

        ckpt_name = ckpt_path + "epoch_%d_train_%.4f_val_%.4f_ppl_%.4f.tar" % (
            n_epoch, avg_train_losses["total"], avg_losses["total"], avg_val_ppl)
        ckpt_dict = {
            "embedding": model.word_emb.state_dict(),
            "encoder": model.encoder.state_dict(),
            "word_decoder": model.wd_dec.state_dict(),
            "planning_decoder": model.sp_dec.state_dict(),
            "optimizer": optimizer.state_dict(),
            "epoch": n_epoch,
        }
        torch.save(ckpt_dict, ckpt_name)

        if opt.task == "absgen":
            fout_log.write(
                "epoch: {:3d}\ttrain_loss: {:.3f}\ttrain_kp_sel_loss: {:.3f}"
                "\tval_loss: {:.3f}\tval_ppl: {:.3f}\tkp_sel_acc: {:.4f}\n".format(
                    n_epoch, avg_train_losses["total"],
                    avg_train_losses["content_selection"],
                    avg_losses["total"], avg_val_ppl, cs_acc))
        else:
            fout_log.write(
                "epoch: {:3d}\ttrain_loss: {:.3f}\ttrain_sent_type_loss: {:.3f}"
                "\ttrain_kp_sel_loss: {:.3f}\tval_loss: {:.3f}\tval_ppl: {:.3f}"
                "\tkp_sel_acc: {:.4f}\tsent_type_acc: {:.4f}\n".format(
                    n_epoch, avg_train_losses["total"],
                    avg_train_losses["sentence_type"],
                    avg_train_losses["content_selection"],
                    avg_losses["total"], avg_val_ppl, cs_acc, st_acc))
        fout_log.flush()

        for k in avg_train_losses:
            tb_writer.add_scalars("%s_loss" % k, {
                "train": avg_train_losses[k],
                "valid": avg_losses[k]
            }, n_epoch)
        tb_writer.add_scalar("valid_perplexity", avg_val_ppl, n_epoch)
        tb_writer.add_scalar("learning_rate", optimizer.param_groups[0]['lr'], n_epoch)
        tb_writer.flush()

    fout_log.close()
    tb_writer.close()
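
# Validation perplexity as logged above is, conventionally, the exponential of
# the mean per-token negative log-likelihood. A one-line sketch of that
# relationship (how valid_epoch reduces over tokens is an assumption):
import math

def perplexity(mean_token_nll):
    """Perplexity = exp(mean negative log-likelihood per token)."""
    return math.exp(mean_token_nll)
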
logger = slog.Logger()
logger.json("__args__", args.__dict__)

def logbatch(trainer):
    entry = trainer.log[-1]
    entry["lr"] = trainer.last_lr
    logger.json("batch", entry, step=trainer.total)

def schedule(total):
    epoch = total // 1000000
    return args.learning_rate * (0.1 ** (epoch // args.learning_schedule))

model = eval(f"torchvision.models.{args.model}()").cuda()
trainer = trainer.Trainer(model, schedule=schedule)
trainer.after_batch = logbatch

loader = loaders.make_train_loader(**eval(f"dict({args.loaderargs})"))
val_loader = loaders.make_val_loader(**eval(f"dict({args.valloaderargs})"))

for epoch in range(args.epochs):
    trainer.train_epoch(loader)
    loss, err = trainer.errors(val_loader)
    print("test", trainer.total, loss, err)
    logger.add_scalar("val/loss", loss, trainer.total)
    logger.add_scalar("val/top1", err, trainer.total)
    logger.save("model", model, trainer.total)
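
# Instantiating the model via eval() works but executes an arbitrary string; a
# getattr-based lookup is a safer equivalent for torchvision model names
# (a sketch, assuming args.model names a constructor in torchvision.models):
import torchvision

def build_model(name):
    return getattr(torchvision.models, name)().cuda()
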
import argument_parser
import my_utils
import script
import time

opt = argument_parser.parser()
my_utils.plant_seeds(randomized_seed=opt.randomize)

import trainer

trainer = trainer.Trainer(opt)
trainer.build_dataset_train()
trainer.build_dataset_test()
trainer.build_network()
trainer.build_optimizer()
trainer.build_losses()
trainer.start_train_time = time.time()

for epoch in range(opt.nepoch):
    trainer.train_epoch()
    trainer.test_epoch()
    trainer.dump_stats()
    trainer.save_network()
    trainer.increment_epoch()

trainer.save_new_experiments_results()
script.main(opt, trainer.network)  # inference
opt.faust = "INTRA"
script.main(opt, trainer.network)  # inference
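
# `my_utils.plant_seeds` is project code; a typical seeding helper of this
# shape might look like the sketch below (how the seed is chosen when
# `randomized_seed` is set is an assumption):
import random
import numpy as np
import torch

def plant_seeds_sketch(randomized_seed=False, seed=0):
    if randomized_seed:
        seed = random.randint(0, 2**31 - 1)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
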
if __name__ == '__main__':
    config = define_argparser()

    loader = DataLoader(config.train,
                        config.valid,
                        batch_size=config.batch_size,
                        device=config.gpu_id,
                        max_length=config.max_length)
    model = LM(len(loader.text.vocab),
               word_vec_dim=config.word_vec_dim,
               hidden_size=config.hidden_size,
               n_layers=config.n_layers,
               dropout_p=config.dropout,
               max_length=config.max_length)

    # Zero the weight for PAD so the criterion never credits predicting it:
    # PAD is trivially easy to predict.
    loss_weight = torch.ones(len(loader.text.vocab))
    loss_weight[data_loader.PAD] = 0
    criterion = nn.NLLLoss(weight=loss_weight, reduction='sum')

    print(model)
    print(criterion)

    if config.gpu_id >= 0:
        model.cuda(config.gpu_id)
        criterion.cuda(config.gpu_id)

    trainer.train_epoch(model, criterion, loader.train_iter, loader.valid_iter, config)
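
# A small self-contained check of the zero-weight trick above: with
# weight[PAD] = 0, positions whose target is PAD contribute nothing to the
# loss (the PAD index of 0 and the toy sizes are illustrative):
import torch
import torch.nn as nn

PAD = 0
log_probs = torch.log_softmax(torch.randn(4, 5), dim=-1)  # 4 positions, 5-word vocab
targets = torch.tensor([PAD, 2, PAD, 3])

weight = torch.ones(5)
weight[PAD] = 0
loss = nn.NLLLoss(weight=weight, reduction='sum')(log_probs, targets)

# equals the summed NLL over only the non-PAD positions
mask = targets != PAD
manual = -log_probs[mask, targets[mask]].sum()
assert torch.allclose(loss, manual)
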
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LM(hr_dataset.n_word,
           embedding_dim=config.embedding_dim,
           hidden_dim=config.hidden_size,
           n_layers=config.n_layers,
           dropout_p=config.dropout,
           max_length=config.words_num,
           rnn_type='LSTM').to(device)

# Zero the weight for EOS so the criterion never credits predicting it:
# EOS is trivially easy to predict.
loss_weight = torch.ones(hr_dataset.n_word, device=device)
loss_weight[0] = 0
criterion = nn.NLLLoss(weight=loss_weight, reduction='sum')

print(model)
print(criterion)

trainer.train_epoch(model, criterion, train_loader, test_loader, config)

def cnn_embedding(args, h, data_file):
    """h [DataHandler]"""
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    train_loader = TripletString(h.xt, h.nt, h.train_knn, h.train_dist, K=args.k)

    # save and load the model via its state dict rather than pickling the
    # whole module
    model_file = "{}/model.torch".format(data_file)
    if os.path.isfile(model_file):
        model = _init_net(args, train_loader, device)
        model.load_state_dict(torch.load(model_file))
    else:
        start_time = time.time()
        model = train_epoch(args, train_loader, device)
        if args.save_model:
            torch.save(model.state_dict(), model_file)
        train_time = time.time() - start_time
        print("# Training time: " + str(train_time))

    model.eval()

    # check whether we use BERT here
    char_alphabet = h.alphabet if args.bert else None

    xt = _batch_embed(args, model.embedding_net, h.xt, device,
                      char_alphabet=char_alphabet)
    start_time = time.time()
    xb = _batch_embed(args, model.embedding_net, h.xb, device,
                      char_alphabet=char_alphabet)
    embed_time = time.time() - start_time
    xq = _batch_embed(args, model.embedding_net, h.xq, device,
                      char_alphabet=char_alphabet)
    print("# Embedding time: " + str(embed_time))

    if args.save_embed:
        if args.embed_dir != "":
            args.embed_dir = args.embed_dir + "/"
        os.makedirs("{}/{}".format(data_file, args.embed_dir), exist_ok=True)
        np.save("{}/{}embedding_xb".format(data_file, args.embed_dir), xb)
        np.save("{}/{}embedding_xt".format(data_file, args.embed_dir), xt)
        np.save("{}/{}embedding_xq".format(data_file, args.embed_dir), xq)
    if args.recall:
        test_recall(xb, xq, h.query_knn)
    return xq, xb, xt
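
# `test_recall` is project code; a brute-force sketch of what a recall check
# between embedded base and query sets might look like, assuming query_knn
# holds ground-truth nearest-neighbor ids in the original string space (the
# function name and k are illustrative):
import numpy as np

def recall_at_k_sketch(xb, xq, query_knn, k=10):
    # squared Euclidean distances between every query and every base item
    d = ((xq[:, None, :] - xb[None, :, :]) ** 2).sum(-1)
    topk = np.argsort(d, axis=1)[:, :k]
    hits = sum(len(set(topk[i]) & set(query_knn[i][:k])) for i in range(len(xq)))
    return hits / float(k * len(xq))
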
losses = []
trainer = Trainer(device=device,
                  model=Cancer_model,
                  train_loader=CancerDataLoader,
                  val_loader=CancerDataLoader_val,
                  optimizer=optim,
                  loss_fcn=loss_fcn)

big_train_loss_list = []
big_val_loss_list = []

# loading a model from a saved state dictionary:
# loaded_model = get_model(device=device)
# loaded_model.load_state_dict(torch.load(save_model_path))
# loaded_model.eval()

# whether to plot the losses
plot = True

for ep in tqdm(range(NUM_EPOCHS), desc='Epochs'):
    # train one epoch, then validate
    train_loss_list = trainer.train_epoch(save_model=True)
    val_loss_list = trainer.validate(sample_size=20)
    big_train_loss_list += train_loss_list
    big_val_loss_list += val_loss_list
    if plot:
        plot_losses(train_loss_list=big_train_loss_list,
                    val_loss_list=big_val_loss_list)
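
# `plot_losses` is defined elsewhere; a minimal matplotlib sketch of the same
# idea (the figure layout is an assumption):
import matplotlib.pyplot as plt

def plot_losses_sketch(train_loss_list, val_loss_list):
    plt.figure()
    plt.plot(train_loss_list, label='train')
    plt.plot(val_loss_list, label='val')
    plt.xlabel('step')
    plt.ylabel('loss')
    plt.legend()
    plt.show()
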
import argument_parser
import my_utils
import script
import time

opt = argument_parser.parser()
my_utils.plant_seeds(randomized_seed=opt.randomize)

import trainer

trainer = trainer.Trainer(opt)
trainer.build_dataset_train()
trainer.build_dataset_test()
trainer.build_network()
trainer.build_optimizer()
trainer.build_losses()
trainer.start_train_time = time.time()

for epoch in range(opt.nepoch):
    trainer.train_epoch(epoch)
    trainer.test_epoch()
    trainer.dump_stats()
    trainer.save_network()
    trainer.increment_epoch()

trainer.save_new_experiments_results()
script.main(opt, trainer.network)  # inference
opt.faust = "INTRA"
script.main(opt, trainer.network)  # inference