def create_trainer():
    """Assemble a ``Trainer`` around a freshly constructed ``Baseline`` model.

    The model is placed on the module-level ``device`` and paired with an
    AdamW optimizer (lr=1e-3) and a cross-entropy criterion. The trainer is
    configured with the module-level ``NUM_EPOCH``.

    Returns:
        The constructed ``Trainer`` instance.
    """
    net = Baseline(bert_vocab_num=24000, emb_dim=300, hidden_dim=256, output_dim=3)
    net = net.to(device)

    adamw = torch.optim.AdamW(net.parameters(), lr=1e-3)

    # A FocalLoss(num_classes=3) criterion is left disabled here:
    # loss_fn = FocalLoss(num_classes=3)
    loss_fn = torch.nn.CrossEntropyLoss()

    return Trainer(net, adamw, loss_fn, NUM_EPOCH, device)
def run(dataset_train, dataset_dev, dataset_test, model_type, word_embed_size,
        hidden_size, batch_size, use_cuda, n_epochs):
    """Train a model, report train/dev loss per epoch, then score the test set.

    Args:
        dataset_train: training dataset; its ``vocab`` attribute is handed to
            the model constructor.
        dataset_dev: dataset passed to ``train`` with ``Phase.DEV``.
        dataset_test: dataset passed to ``train`` with ``Phase.TEST``.
        model_type: only ``'base'`` is supported.
        word_embed_size: forwarded to ``Baseline``.
        hidden_size: forwarded to ``Baseline``.
        batch_size: forwarded to every ``train`` call.
        use_cuda: when truthy the model is moved with ``.cuda()``; also
            forwarded to ``Baseline`` and ``train``.
        n_epochs: number of train/dev epochs to run.

    Raises:
        NotImplementedError: if ``model_type`` is not ``'base'``.
    """
    if model_type != 'base':
        raise NotImplementedError

    model = Baseline(vocab=dataset_train.vocab,
                     word_embed_size=word_embed_size,
                     hidden_size=hidden_size,
                     use_cuda=use_cuda,
                     inference=False)
    if use_cuda:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=10**-3)

    print('start training')
    # Pre-bind ``epoch``: the test pass below reads it, and with
    # ``n_epochs == 0`` the loop would otherwise never define it (NameError).
    epoch = 0
    for epoch in range(n_epochs):
        # Only the loss is used from the train/dev passes; the remaining three
        # return values are discarded explicitly.
        train_loss, _, _, _ = train(dataset_train, model, optimizer,
                                    batch_size, epoch, Phase.TRAIN, use_cuda)
        dev_loss, _, _, _ = train(dataset_dev, model, optimizer,
                                  batch_size, epoch, Phase.DEV, use_cuda)
        status = '\t'.join([
            'epoch {}'.format(epoch + 1),
            'TRAIN Loss: {:.9f}'.format(train_loss),
            'DEV Loss: {:.9f}'.format(dev_loss),
        ])
        # '\r' with end='' rewrites the same console line every epoch.
        print('\r' + status, end='')

    # The test pass reuses the index of the last completed epoch.
    test_loss, tokens, preds, golds = train(dataset_test, model, optimizer,
                                            batch_size, epoch, Phase.TEST,
                                            use_cuda)
    print('====', 'TEST', '=====')
    print_scores(preds, golds)
    output_results(tokens, preds, golds)
#%% Load data and labels for the 'indian_pines' dataset.
data, label = load_data(data_path, label_path, 'indian_pines')

#%%
get_value_data(data, label)

#%% Read the CSV export: the last column goes to data_L, the rest to data_D.
DATA = pd.read_csv('datasets/Indian_pines.csv', header=None).values
data_D, data_L = DATA[:, :-1], DATA[:, -1]
# test_size=0.8 keeps 20% of the rows for training and 80% for evaluation.
data_train, data_test, label_train, label_test = train_test_split(
    data_D, data_L, test_size=0.8)

#%% Wrap both splits in loaders; only the training loader shuffles.
train_set = GetLoader(data_train, label_train)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_set = GetLoader(data_test, label_test)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)

#%% Pull a single batch from the training loader for inspection.
data_p, label_p = next(iter(train_loader))
# print(data_p[:-1])

#%% Model, optimizer and a class-weighted loss.
net = Baseline(INPUT_CHANNELS, CLASSES, dropout=False)
optimizer = optim.Adam(net.parameters(), lr=1e-4)
weight = torch.ones(CLASSES)
weight[0] = 0.  # class index 0 contributes nothing to the loss
w = weight.to(DEVICE)
criterion = nn.CrossEntropyLoss(weight=w)

#%% Train, then plot both returned curves.
train_loss, val_accuracy = train(net, optimizer, criterion, train_loader,
                                 val_loader, EPOCH, DEVICE)
for curve in (train_loss, val_accuracy):
    plot_curve(curve)
num_workers=0, shuffle=True) return train_loader if __name__ == "__main__": device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Assuming that we are on a CUDA machine, this should print a CUDA device: epochs = 100 best_val_loss = 999999 print(device) net = Baseline() net.to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=0.005, momentum=0.9) for epoch in range(epochs): with tqdm(total=len(load_dataset(train_path))) as epoch_pbar: epoch_pbar.set_description(f'Epoch {epoch}') running_loss = 0.0 running_val_loss = 0.0 for i, data in enumerate(load_dataset(train_path)): # get the inputs; data is a list of [inputs, labels] inputs = data[0].to(device) labels = data[1].to(device) outputs = net(inputs) loss = criterion(outputs, labels) running_loss += loss.item() # zero the parameter gradients optimizer.zero_grad() # forward + backward + optimize
# Publish the parser report to the visualiser, in a window keyed by fold.
report = parser.report(end='<br>')
vis.text(report, win='report f{}'.format(FG.cur_fold))

# The first configured device is the primary one.
torch.cuda.set_device(FG.devices[0])
device = torch.device(FG.devices[0])

net = Baseline(FG.ckpt_dir, len(FG.labels))
# net = Baseline3D(FG.ckpt_dir, len(FG.labels))

# With more than one device the network is wrapped in DataParallel; the
# underlying module is what gets printed in that case.
multi_gpu = len(FG.devices) > 1
if multi_gpu:
    net = torch.nn.DataParallel(net, device_ids=FG.devices)
print(net.module if multi_gpu else net)

optimizer = Adam(net.parameters(), lr=FG.lr, weight_decay=FG.l2_decay)
scheduler = ExponentialLR(optimizer, gamma=FG.lr_gamma)

trainloader, testloader = get_dataloader(
    k=FG.fold,
    cur_fold=FG.cur_fold,
    modality=FG.modality,
    axis=FG.axis,
    labels=FG.labels,
    batch_size=FG.batch_size,
)

trainer = create_supervised_trainer(
    net,
    optimizer,
    F.cross_entropy,
    device=device,
    non_blocking=True,
)
# The validation loader draws from train_dataset through valid_sampler.
valid_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=4,
    collate_fn=collate_fn,
    sampler=valid_sampler,
)
# The test loader yields one item at a time, in dataset order.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=1,
    num_workers=4,
    shuffle=False,
)

# Run-time options handed to train() and test().
config = dict(
    epochs=100,
    device=get_device(),
    sampling=True,
    temperature=1.0,
    max_sentence_length=18,
)

# Model hyper-parameters.
embedding_dim, hidden_dim = 256, 512
vocab_size = len(vocab)

model = Baseline(embedding_dim, hidden_dim, vocab_size, vanilla=False)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=5e-4)
model.cuda()

train(model, optimizer, criterion, train_loader, valid_loader, vocab, config)
test(model, criterion, test_loader, vocab, config)
def _build_model_and_optimizer(args, word_vectors, char_vectors):
    """Construct the model selected by ``args.model`` and its optimizer.

    Returns:
        A ``(model, optimizer)`` pair.

    Raises:
        ValueError: if ``args.model`` is not a recognised model name.
    """
    if args.model == 'baseline':
        model = Baseline(word_vectors=word_vectors,
                         hidden_size=args.hidden_size,
                         drop_prob=args.drop_prob)
    elif args.model == 'bidaf':
        model = BiDAF(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      char_emb_dim=args.char_emb_dim,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)
    elif args.model in ('qanet', 'qanet_out'):
        # Both names build the same QANet configuration.
        model = QANet(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      char_emb_dim=args.char_emb_dim,
                      hidden_size=args.hidden_size,
                      n_conv_emb_enc=args.n_conv_emb,
                      n_conv_mod_enc=args.n_conv_mod,
                      drop_prob_word=0.1,
                      drop_prob_char=0.05,
                      kernel_size_emb_enc_block=7,
                      kernel_size_mod_enc_block=7,
                      n_heads=args.n_heads)
    else:
        # Without this branch `model` stays unbound and the failure surfaces
        # later as an UnboundLocalError far from the real cause.
        raise ValueError(f'Unknown model type: {args.model!r}')

    if args.model in ('baseline', 'bidaf'):
        optimizer = optim.Adadelta(model.parameters(), args.lr,
                                   weight_decay=args.l2_wd)
    else:
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               betas=(args.beta_1, args.beta_2),
                               eps=args.epsilon,
                               weight_decay=args.l2_wd)
    return model, optimizer


def main(args):
    """Train the selected model on SQuAD with periodic dev-set evaluation.

    Sets up logging, seeds, embeddings, model, optimizer, EMA, checkpoint
    saver and data loaders from ``args``, then runs the training loop. Every
    ``args.eval_steps`` examples the EMA weights are evaluated on the dev set,
    checkpointed and logged to TensorBoard.

    Args:
        args: argument namespace. ``save_dir``, ``gpu_ids`` and ``batch_size``
            are overwritten in place.

    Raises:
        ValueError: if ``args.model`` is not a recognised model name.
    """
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    # The configured batch size is multiplied by the number of GPUs in use.
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model, optimizer = _build_model_and_optimizer(args, word_vectors,
                                                  char_vectors)

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get scheduler
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    # `step` counts examples, so dividing by the dataset size gives the number
    # of epochs already covered by a loaded checkpoint.
    epoch = step // len(train_dataset)
    # NOTE(review): the `!=` comparison is kept as-is; it never terminates when
    # num_epochs is negative or already exceeded — confirm that is intended.
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint. The EMA weights are
                    # swapped in for evaluation/saving and swapped back out
                    # afterwards.
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
def main():
    """Train and evaluate a ModelNet classifier configured by the global ``opt``.

    Seeds the RNGs, builds the train/test ``ModelNetDataset`` loaders, selects
    the model named by ``opt.model``, optionally restores weights from
    ``opt.path``, then alternates ``train`` and ``test`` for up to
    ``opt.nepoch`` epochs with early stopping on the test loss. Results are
    written through ``utils.Saver``.

    Raises:
        ValueError: if ``opt.model`` is not one of 'lstm', 'lstm_mlp', 'test'.
    """
    saver = utils.Saver(opt)

    # randomize seed
    opt.manualSeed = random.randint(1, 10000)  # fix seed
    random.seed(opt.manualSeed)
    torch.manual_seed(opt.manualSeed)
    torch.cuda.manual_seed_all(opt.manualSeed)

    # load data
    root = "data/modelnet40_ply_hdf5_2048/"  # "data/modelnet40_normal_resampled"
    use_cuda = torch.cuda.is_available()

    transforms_list = []
    random_permute = utils.Random_permute(opt.num_points, delta=opt.distance)

    # load transformations
    if opt.random_input:
        print("random_input")
        transforms_list.append(random_permute)

    # Load dataset / data loader (only the training set gets the transforms)
    train_dataset = data.ModelNetDataset(
        root,
        train=True,
        sort=opt.sort,
        transform=transforms.Compose(transforms_list),
        distance=opt.distance,
        normal=opt.normal)
    train_loader = DataLoader(train_dataset,
                              batch_size=opt.batchSize,
                              shuffle=True,
                              num_workers=opt.workers)

    test_dataset = data.ModelNetDataset(root,
                                        train=False,
                                        sort=opt.sort,
                                        distance=opt.distance,
                                        normal=opt.normal)
    test_loader = DataLoader(test_dataset,
                             batch_size=opt.batchSize,
                             shuffle=False,
                             num_workers=opt.workers)

    # define model; the input has 6 features when distance or normal is
    # enabled, 3 otherwise
    ndim = 6 if opt.distance or opt.normal else 3
    if opt.model == 'lstm':
        model = Baseline(input_dim=ndim, maxout=opt.elem_max)
    elif opt.model == 'lstm_mlp':
        model = LSTM_mlp(input_dim=ndim,
                         maxout=opt.elem_max,
                         mlp=[64, 128, 256, 512],
                         fc=[512, 256, 40])
    elif opt.model == 'test':
        model = Test(input_dim=ndim, maxout=opt.elem_max)
    else:
        # Without this branch `model` stays unbound and the failure surfaces
        # a few lines later as an UnboundLocalError.
        raise ValueError("Unknown model type: {!r}".format(opt.model))

    # load specified pre-trained model
    if opt.path != '':
        # The model is still on the CPU at this point, so map the checkpoint
        # there; this also lets a GPU-saved checkpoint load on a CPU-only host.
        model.load_state_dict(torch.load(opt.path, map_location='cpu'))

    # define optimizer and loss function
    optimizer = optim.Adam(model.parameters(),
                           lr=opt.learning_rate,
                           weight_decay=1e-5)
    criterion = nn.CrossEntropyLoss()

    # transfer model and criterion to cuda if available
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    early_stopping = utils.Early_stopping(opt.early_stopping, patience=15)

    saver.log_parameters(model.parameters())
    for epoch in range(opt.nepoch):
        adjust_learning_rate(optimizer, epoch, saver)
        train(model, optimizer, criterion, saver, train_loader, epoch)
        test_loss = test(model, criterion, saver, test_loader, epoch)

        # Early stopping is driven by the loss returned from the test pass.
        early_stopping.update(test_loss)
        if early_stopping.stop():
            break

    saver.save_result()
class Trainer(BaseTrainer):
    """Mixed-precision trainer for ``Baseline`` with triplet and center losses.

    The model is optimised with Adam and the center-loss parameters with a
    separate SGD optimizer; both are stepped through one ``GradScaler``.
    Attributes such as ``device``, ``use_gpu``, ``epochs``, ``start_epoch``,
    ``writer``, ``logger``, ``config``, ``checkpoint_dir``, ``map_location``,
    ``logs_dir`` and ``logs_dir_saved`` are not set here and are presumably
    provided by ``BaseTrainer`` — confirm against the base class.
    """

    def __init__(self, config):
        """Build data manager, model, losses, optimizers, scheduler and metrics.

        Args:
            config: mapping read here under the keys ``"data"``, ``"losses"``,
                ``"optimizer"``, ``"lr_scheduler"`` and ``"resume"``. A
                non-empty ``"resume"`` path triggers checkpoint restoration.
        """
        super().__init__(config)
        self.datamanager = DataManger(config["data"])
        # The class count is needed by the model and by both losses.
        num_classes = self.datamanager.datasource.get_num_classes("train")

        # model
        self.model = Baseline(num_classes=num_classes)

        # summary model
        summary(
            self.model,
            input_size=(3, 256, 128),
            batch_size=config["data"]["batch_size"],
            device="cpu",
        )

        # losses
        cfg_losses = config["losses"]
        self.criterion = Softmax_Triplet_loss(
            num_class=num_classes,
            margin=cfg_losses["margin"],
            epsilon=cfg_losses["epsilon"],
            use_gpu=self.use_gpu,
        )
        self.center_loss = CenterLoss(
            num_classes=num_classes,
            feature_dim=2048,
            use_gpu=self.use_gpu,
        )

        # optimizer
        cfg_optimizer = config["optimizer"]
        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=cfg_optimizer["lr"],
            weight_decay=cfg_optimizer["weight_decay"],
        )
        self.optimizer_centerloss = torch.optim.SGD(
            self.center_loss.parameters(), lr=0.5
        )

        # learning rate scheduler
        cfg_lr_scheduler = config["lr_scheduler"]
        self.lr_scheduler = WarmupMultiStepLR(
            self.optimizer,
            milestones=cfg_lr_scheduler["steps"],
            gamma=cfg_lr_scheduler["gamma"],
            warmup_factor=cfg_lr_scheduler["factor"],
            warmup_iters=cfg_lr_scheduler["iters"],
            warmup_method=cfg_lr_scheduler["method"],
        )

        # track metric
        self.train_metrics = MetricTracker("loss", "accuracy")
        self.valid_metrics = MetricTracker("loss", "accuracy")

        # save best accuracy for function _save_checkpoint
        self.best_accuracy = None

        # send model to device
        self.model.to(self.device)

        self.scaler = GradScaler()

        # resume model from last checkpoint
        if config["resume"] != "":
            self._resume_checkpoint(config["resume"])

    def train(self):
        """Run the train/validate loop from ``start_epoch`` to ``epochs``.

        After each epoch the averaged metrics go to TensorBoard, the
        validation result is logged, a checkpoint is written (flagged as best
        when validation accuracy improved) and the log directory is copied.
        """
        for epoch in range(self.start_epoch, self.epochs + 1):
            # Training metrics are read back from self.train_metrics below, so
            # the returned dict is not needed here.
            self._train_epoch(epoch)

            if self.lr_scheduler is not None:
                self.lr_scheduler.step()

            result = self._valid_epoch(epoch)

            # add scalars to tensorboard
            self.writer.add_scalars(
                "Loss",
                {
                    "Train": self.train_metrics.avg("loss"),
                    "Val": self.valid_metrics.avg("loss"),
                },
                global_step=epoch,
            )
            self.writer.add_scalars(
                "Accuracy",
                {
                    "Train": self.train_metrics.avg("accuracy"),
                    "Val": self.valid_metrics.avg("accuracy"),
                },
                global_step=epoch,
            )

            # logging result to console
            log = {"epoch": epoch}
            log.update(result)
            for key, value in log.items():
                self.logger.info(" {:15s}: {}".format(str(key), value))

            # save model; the first epoch always counts as an improvement
            val_accuracy = self.valid_metrics.avg("accuracy")
            improved = (
                self.best_accuracy is None or self.best_accuracy < val_accuracy
            )
            if improved:
                self.best_accuracy = val_accuracy
            self._save_checkpoint(epoch, save_best=improved)

            # save logs
            self._save_logs(epoch)

    def _train_epoch(self, epoch):
        """Train for one epoch and return the accumulated training metrics.

        Args:
            epoch: epoch number, used for the progress-bar caption.

        Returns:
            ``self.train_metrics.result()``.
        """
        self.model.train()
        self.train_metrics.reset()
        beta = self.config["losses"]["beta"]
        # Fetch the loader once and use it for both the bar length and the
        # iteration.
        loader = self.datamanager.get_dataloader("train")
        with tqdm(total=len(loader)) as epoch_pbar:
            epoch_pbar.set_description(f"Epoch {epoch}")
            for batch_idx, (data, labels, _) in enumerate(loader):
                # push data to device
                data, labels = data.to(self.device), labels.to(self.device)

                # zero gradient
                self.optimizer.zero_grad()
                self.optimizer_centerloss.zero_grad()

                with autocast():
                    # forward batch
                    score, feat = self.model(data)

                    # calculate loss and accuracy
                    loss = (
                        self.criterion(score, feat, labels)
                        + self.center_loss(feat, labels) * beta
                    )
                    _, preds = torch.max(score.data, dim=1)

                # backward parameters (gradients carry the loss scale)
                self.scaler.scale(loss).backward()

                # Remove the loss scale from the center-loss gradients before
                # touching them; otherwise the SGD step below would be taken
                # on gradients still multiplied by the scale factor.
                self.scaler.unscale_(self.optimizer_centerloss)

                # Undo the beta weighting so the centers are updated with the
                # gradient of the unweighted center loss.
                for param in self.center_loss.parameters():
                    if param.grad is not None:
                        param.grad.data *= 1.0 / beta

                # optimize; both optimizers go through the scaler so that a
                # step with inf/NaN gradients is skipped consistently
                self.scaler.step(self.optimizer)
                self.scaler.step(self.optimizer_centerloss)
                self.scaler.update()

                # update loss and accuracy in MetricTracker
                self.train_metrics.update("loss", loss.item())
                self.train_metrics.update(
                    "accuracy",
                    torch.sum(preds == labels.data).double().item()
                    / data.size(0),
                )

                # update process bar
                epoch_pbar.set_postfix(
                    {
                        "train_loss": self.train_metrics.avg("loss"),
                        "train_acc": self.train_metrics.avg("accuracy"),
                    }
                )
                epoch_pbar.update(1)
        return self.train_metrics.result()

    def _valid_epoch(self, epoch):
        """Evaluate on the "val" loader and return the validation metrics.

        Args:
            epoch: epoch number, used for the progress-bar caption.

        Returns:
            ``self.valid_metrics.result()``.
        """
        self.model.eval()
        self.valid_metrics.reset()
        beta = self.config["losses"]["beta"]
        loader = self.datamanager.get_dataloader("val")
        with torch.no_grad():
            with tqdm(total=len(loader)) as epoch_pbar:
                epoch_pbar.set_description(f"Epoch {epoch}")
                for batch_idx, (data, labels, _) in enumerate(loader):
                    # push data to device
                    data, labels = data.to(self.device), labels.to(self.device)

                    with autocast():
                        # forward batch
                        score, feat = self.model(data)

                        # calculate loss and accuracy
                        loss = (
                            self.criterion(score, feat, labels)
                            + self.center_loss(feat, labels) * beta
                        )
                        _, preds = torch.max(score.data, dim=1)

                    # update loss and accuracy in MetricTracker
                    self.valid_metrics.update("loss", loss.item())
                    self.valid_metrics.update(
                        "accuracy",
                        torch.sum(preds == labels.data).double().item()
                        / data.size(0),
                    )

                    # update process bar
                    epoch_pbar.set_postfix(
                        {
                            "val_loss": self.valid_metrics.avg("loss"),
                            "val_acc": self.valid_metrics.avg("accuracy"),
                        }
                    )
                    epoch_pbar.update(1)
        return self.valid_metrics.result()

    def _save_checkpoint(self, epoch, save_best=True):
        """Write ``model_last.pth`` and, if ``save_best``, ``model_best.pth``.

        Args:
            epoch: epoch number stored in the checkpoint.
            save_best: also write the state as the best checkpoint.
        """
        state = {
            "epoch": epoch,
            "state_dict": self.model.state_dict(),
            "center_loss": self.center_loss.state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "optimizer_centerloss": self.optimizer_centerloss.state_dict(),
            "lr_scheduler": self.lr_scheduler.state_dict(),
            "best_accuracy": self.best_accuracy,
            # Persist the loss-scale state so a resumed run does not restart
            # from the scaler's initial scale.
            "scaler": self.scaler.state_dict(),
        }
        filename = os.path.join(self.checkpoint_dir, "model_last.pth")
        self.logger.info("Saving last model: model_last.pth ...")
        torch.save(state, filename)
        if save_best:
            filename = os.path.join(self.checkpoint_dir, "model_best.pth")
            self.logger.info("Saving current best: model_best.pth ...")
            torch.save(state, filename)

    def _resume_checkpoint(self, resume_path):
        """Restore model, losses, optimizers, scheduler and counters.

        Args:
            resume_path: path to a checkpoint written by ``_save_checkpoint``.

        Raises:
            FileExistsError: if ``resume_path`` does not exist.
        """
        if not os.path.exists(resume_path):
            # NOTE(review): FileNotFoundError would describe this condition
            # accurately; the type is kept so existing callers that catch
            # FileExistsError keep working.
            raise FileExistsError("Resume path not exist!")
        self.logger.info("Loading checkpoint: {} ...".format(resume_path))
        checkpoint = torch.load(resume_path, map_location=self.map_location)
        # Training continues with the epoch after the saved one.
        self.start_epoch = checkpoint["epoch"] + 1
        self.model.load_state_dict(checkpoint["state_dict"])
        self.center_loss.load_state_dict(checkpoint["center_loss"])
        self.optimizer.load_state_dict(checkpoint["optimizer"])
        self.optimizer_centerloss.load_state_dict(checkpoint["optimizer_centerloss"])
        self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        self.best_accuracy = checkpoint["best_accuracy"]
        # Checkpoints written before the scaler was saved have no "scaler"
        # entry, and a disabled scaler saves an empty dict; skip both cases.
        scaler_state = checkpoint.get("scaler")
        if scaler_state:
            self.scaler.load_state_dict(scaler_state)
        self.logger.info(
            "Checkpoint loaded. Resume training from epoch {}".format(self.start_epoch)
        )

    def _save_logs(self, epoch):
        """Mirror ``logs_dir`` into ``logs_dir_saved``, replacing any old copy.

        Args:
            epoch: unused; kept so the call signature stays the same.
        """
        # copytree refuses to write into an existing directory, so the
        # previous copy is removed first.
        if os.path.isdir(self.logs_dir_saved):
            shutil.rmtree(self.logs_dir_saved)
        shutil.copytree(self.logs_dir, self.logs_dir_saved)