def validation(validation_data, model, global_step, t_vocab_size, val_writer, opt):
    model.eval()

    total_loss = 0.0
    total_cnt = 0
    for batch in validation_data:
        inputs, i_mask = None, None
        if opt.has_inputs:
            inputs = batch.src
            i_mask = utils.create_pad_mask(inputs, opt.src_pad_idx)

        targets = batch.trg
        t_mask = utils.create_pad_mask(targets, opt.trg_pad_idx)
        t_self_mask = utils.create_trg_self_mask(targets)

        with torch.no_grad():
            pred = model(inputs, targets, i_mask, t_self_mask, t_mask)

            pred = pred.view(-1, pred.size(-1))
            ans = targets.view(-1)
            # No label smoothing at validation time (smoothing factor 0).
            loss = utils.get_loss(pred, ans, t_vocab_size, 0, opt.trg_pad_idx)

        total_loss += loss.item() * len(batch)
        total_cnt += len(batch)

    val_loss = total_loss / total_cnt
    print("Validation Loss", val_loss)
    val_writer.add_scalar('loss', val_loss, global_step)
    return val_loss
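# validation() relies on utils.create_pad_mask and utils.create_trg_self_mask, which
# are not shown in this excerpt. The helpers below are a minimal sketch of what such
# mask builders typically look like (boolean masks, True at positions attention should
# ignore); they are an assumption, not the repository's actual implementation.
import torch


def create_pad_mask(t, pad_idx):
    # (batch, 1, seq_len): True where the token is padding.
    return (t == pad_idx).unsqueeze(1)


def create_trg_self_mask(t):
    # Causal mask for decoder self-attention: True above the diagonal,
    # i.e. future positions that must not be attended to.
    seq_len = t.size(1)
    future = torch.triu(torch.ones(seq_len, seq_len, device=t.device), diagonal=1)
    return future.bool().unsqueeze(0)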
def train(train_data, model, opt, global_step, optimizer, t_vocab_size, label_smoothing, writer):
    model.train()
    last_time = time.time()
    pbar = tqdm(total=len(train_data.dataset), ascii=True)
    for batch in train_data:
        inputs = None
        if opt.has_inputs:
            inputs = batch.src

        targets = batch.trg
        pred = model(inputs, targets)

        pred = pred.view(-1, pred.size(-1))
        ans = targets.view(-1)
        loss = utils.get_loss(pred, ans, t_vocab_size, label_smoothing, opt.trg_pad_idx)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if global_step % 100 == 0:
            summarize_train(writer, global_step, last_time, model, opt,
                            inputs, targets, optimizer, loss, pred, ans)
            last_time = time.time()

        pbar.set_description('[Loss: {:.4f}]'.format(loss.item()))
        global_step += 1
        pbar.update(targets.size(0))

    pbar.close()
    train_data.reload_examples()
    return global_step
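# Both loops above call utils.get_loss(pred, ans, vocab_size, smoothing, pad_idx),
# which is not shown here. The sketch below is an assumption of what such a helper
# typically does: label-smoothed cross-entropy that ignores padding tokens. It is not
# the repository's implementation.
import torch
import torch.nn.functional as F


def get_loss(pred, ans, vocab_size, label_smoothing, pad_idx):
    # pred: (batch * seq_len, vocab_size) raw logits; ans: (batch * seq_len,) token ids
    if label_smoothing > 0:
        log_probs = F.log_softmax(pred, dim=-1)
        # Spread `label_smoothing` mass uniformly over the non-target classes.
        smooth_target = torch.full_like(log_probs, label_smoothing / (vocab_size - 1))
        smooth_target.scatter_(1, ans.unsqueeze(1), 1.0 - label_smoothing)
        loss = -(smooth_target * log_probs).sum(dim=-1)
        non_pad = ans.ne(pad_idx)
        return loss.masked_select(non_pad).mean()
    return F.cross_entropy(pred, ans, ignore_index=pad_idx)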
def run(cfg):
    '''Load save path'''
    cfg.log_string('Data save path: %s' % (cfg.save_path))
    checkpoint = CheckpointIO(cfg)

    '''Load device'''
    cfg.log_string('Loading device settings.')
    device = load_device(cfg)

    '''Load data'''
    cfg.log_string('Loading dataset.')
    train_loader = get_dataloader(cfg.config, mode='train')
    test_loader = get_dataloader(cfg.config, mode='test')

    '''Load net'''
    cfg.log_string('Loading model.')
    net = get_model(cfg.config, device=device)
    if isinstance(net, list):
        checkpoint.register_modules(voxnet=net[0])
        checkpoint.register_modules(refnet=net[1])
    else:
        checkpoint.register_modules(voxnet=net)

    '''Load loss function'''
    cfg.log_string('Loading loss function.')
    loss_func = get_loss(cfg.config, device)

    '''Load optimizer'''
    cfg.log_string('Loading optimizer.')
    optimizer = get_optimizer(config=cfg.config, net=net)
    if isinstance(net, list):
        checkpoint.register_modules(voxopt=optimizer[0])
        checkpoint.register_modules(refopt=optimizer[1])
    else:
        checkpoint.register_modules(voxopt=optimizer)

    '''Load scheduler'''
    cfg.log_string('Loading optimizer scheduler.')
    scheduler = load_scheduler(config=cfg.config, optimizer=optimizer)
    if isinstance(net, list):
        checkpoint.register_modules(voxsch=scheduler[0])
        checkpoint.register_modules(refsch=scheduler[1])
    else:
        checkpoint.register_modules(voxsch=scheduler)

    '''Load trainer'''
    cfg.log_string('Loading trainer.')
    trainer = get_trainer(cfg.config)

    '''Start to train'''
    cfg.log_string('Start to train.')
    # cfg.log_string('Total number of parameters in {0:s}: {1:d}.'.format(cfg.config['method'], sum(p.numel() for p in net.parameters())))
    trainer(cfg, net, loss_func, optimizer, scheduler,
            train_loader=train_loader, test_loader=test_loader,
            device=device, checkpoint=checkpoint)

    cfg.log_string('Training finished.')
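# run() repeats the same isinstance(net, list) branch when registering the network,
# optimizer and scheduler with the checkpoint. The helper below is a sketch of how that
# pattern could be collapsed; the ('vox', 'ref') prefixes mirror the keyword names used
# above, and the helper itself is an assumption, not part of the original codebase.
def register_pair(checkpoint, obj, suffix, prefixes=('vox', 'ref')):
    # e.g. register_pair(checkpoint, net, 'net') registers voxnet / refnet.
    if isinstance(obj, list):
        for prefix, item in zip(prefixes, obj):
            checkpoint.register_modules(**{prefix + suffix: item})
    else:
        checkpoint.register_modules(**{prefixes[0] + suffix: obj})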
def train(model, dataloader, device, optimizer_name, loss_name, lr):
    optimizer_object = get_optimizer(optimizer_name)
    optimizer = optimizer_object(model.parameters(), lr=lr)
    loss_fn = get_loss(loss_name)

    model.train()

    running_loss = 0.0
    running_corrects = 0

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # One-hot encode the targets (10 classes).
        bs = len(targets)
        classes = torch.zeros((bs, 10))
        for i in range(bs):
            classes[i][targets[i]] = 1
        classes = classes.to(device)

        outputs = model(inputs)
        loss = loss_fn()(outputs, classes)  # LeCun et al. used maximum log-likelihood

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, preds = torch.max(outputs, 1)

        # Statistics: weight each batch's mean loss by its size so the epoch
        # average is a per-sample mean over the whole dataset.
        running_loss += loss.item() * bs
        running_corrects += torch.sum(preds == targets)

    loss = running_loss / len(dataloader.dataset)
    acc = running_corrects.item() / len(dataloader.dataset)
    print('Training results: Loss: {:.4f} Acc: {:.4f}'.format(loss, acc))

    return acc
def train(model, dataloader, device, optimizer_name, loss_name, lr, verbose):
    optimizer_object = get_optimizer(optimizer_name)
    optimizer = optimizer_object(model.parameters(), lr=lr)
    loss_fn = get_loss(loss_name)

    model.train()

    running_loss = 0.0
    running_corrects = 0

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # One-hot encode the targets (10 classes).
        bs = len(targets)
        classes = torch.zeros((bs, 10))
        for i in range(bs):
            classes[i][targets[i]] = 1
        classes = classes.to(device)

        outputs = model(inputs)
        loss = loss_fn()(outputs, classes)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, preds = torch.max(outputs, 1)

        # Weight each batch's mean loss by its size so the epoch average
        # is a per-sample mean over the whole dataset.
        running_loss += loss.item() * bs
        running_corrects += torch.sum(preds == targets)

    loss = running_loss / len(dataloader.dataset)
    acc = running_corrects.item() / len(dataloader.dataset)
    if verbose:
        print(f'Training results: Loss: {loss:.4f} Acc: {acc:.4f}')

    return acc
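# Both train() variants above build the one-hot target matrix with an explicit Python
# loop. torch.nn.functional.one_hot produces the same tensor in one vectorised call;
# the snippet below is a self-contained illustration (assuming 10 classes, as above).
import torch
import torch.nn.functional as F

targets = torch.tensor([3, 0, 7])                     # example batch of labels
classes = F.one_hot(targets, num_classes=10).float()  # same result as the manual loop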
def train(args):
    # Get hardware device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Check if weights and biases integration is enabled.
    if args.wandb == 1:
        import wandb
        wandb.init(entity='surajpai', project='FacialEmotionRecognition', config=vars(args))

    # Get the dataset with "Training" usage.
    dataset = FER2013Dataset(args.data_path, "Training")

    # Randomly split the dataset into train and validation based on the specified train_split argument.
    train_dataset, validation_dataset = torch.utils.data.random_split(
        dataset,
        [int(len(dataset) * args.train_split),
         len(dataset) - int(len(dataset) * args.train_split)])

    logging.info('Samples in the training set: {}\nSamples in the validation set: {}\n\n'
                 .format(len(train_dataset), len(validation_dataset)))

    # Get class weights as the inverse of class frequencies in the dataset.
    dataset_summary = dataset.get_summary_statistics()
    class_weights = 1 / dataset_summary["class_occurences"]
    class_weights = torch.Tensor(class_weights / np.sum(class_weights)).to(device)

    # Train and validation loaders, batched as specified and randomly shuffled.
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, pin_memory=True)
    val_loader = DataLoader(validation_dataset, batch_size=args.batch_size, shuffle=True, pin_memory=True)

    # Model initialization
    model = torch.nn.DataParallel(Model(args.model_config)).to(device)

    # Set torch optimizer
    optimizer = torch.optim.Adam(model.parameters())

    # Get the training loss from the utils get_loss function.
    criterion = get_loss(args, class_weights)

    # Track the best (lowest) validation loss seen so far for checkpointing.
    bestLoss = float('inf')

    # Create metric logger object
    metrics = Metrics(upload=args.wandb)

    # Define augmentation transforms, if --augment is enabled.
    if args.augment == 1:
        transform = transforms.RandomChoice([
            transforms.RandomHorizontalFlip(p=0.75),
            transforms.RandomAffine(15, translate=(0.1, 0.1), scale=(1.2, 1.2), shear=15),
            transforms.ColorJitter()
        ])

    # Iterate over the total number of epochs set by the epochs argument.
    for n_epoch in range(args.epochs):

        # Reset running metrics at the beginning of each epoch.
        metrics.reset()

        logging.info('Starting Epoch: {}/{}\n'.format(n_epoch, args.epochs))

        '''
        TRAINING
        '''
        # Model in train mode for batch-norm and dropout related ops.
        model.train()

        # Iterate over each batch in the train loader.
        for idx, batch in enumerate(tqdm(train_loader)):

            # Reset gradients
            optimizer.zero_grad()

            # Apply augmentation transforms, if --augment is enabled.
            if args.augment == 1 and n_epoch % 2 == 0:
                batch = apply_transforms(batch, transform)

            # Move the batch to the device, needed explicitly if a GPU is present.
            image, target = batch["image"].to(device), batch["emotion"].to(device)

            # Run a forward pass over images from the batch.
            out = model(image)

            # Calculate loss based on the criterion set.
            loss = criterion(out, target)

            # Backward pass from the final loss.
            loss.backward()

            # Update the optimizer.
            optimizer.step()

            # Update metrics for this batch.
            metrics.update_train({
                "loss": loss.item(),
                "predicted": out,
                "ground_truth": target
            })

        '''
        VALIDATION
        '''
        logging.info('Validating on the validation split ...\n\n')

        # Model in eval mode.
        model.eval()

        # Disable gradient tracking for validation.
        with torch.no_grad():

            # Iterate over each batch in the val loader.
            for idx, batch in enumerate(val_loader):

                # Move the batch to the device, needed explicitly if a GPU is present.
                image, target = batch["image"].to(device), batch["emotion"].to(device)

                # Forward pass
                out = model(image)

                # Calculate loss based on the criterion set.
                loss = criterion(out, target)

                # Metrics and sample predictions updated for the validation batch.
                metrics.update_val({
                    "loss": loss.item(),
                    "predicted": out,
                    "ground_truth": target,
                    "image": image,
                    "class_mapping": dataset.get_class_mapping()
                })

        # Display metrics at the end of each epoch.
        metrics.display()

        # Weight checkpointing: save the best model by validation loss.
        save_path = "./saved_models/{}.pth.tar".format(
            args.model_config.split('/')[-1].split('.')[0])
        bestLoss = min(bestLoss, metrics.metric_dict["loss@val"])
        is_best = (bestLoss == metrics.metric_dict["loss@val"])
        save_checkpoint(
            {
                'epoch': n_epoch,
                'state_dict': model.state_dict(),
                'bestLoss': bestLoss,
                'optimizer': optimizer.state_dict(),
            }, is_best, save_path)

    # After training is completed, if weights and biases is enabled, visualize filters and upload the final model.
    if args.wandb == 1:
        visualize_filters(model.modules())
        wandb.save(save_path)

    # Get report from the metrics logger.
    train_report, val_report = metrics.get_report()

    # Save the reports to CSV files. Note that str.rstrip strips a set of characters,
    # not a suffix, so the extension is removed with replace instead.
    train_report.to_csv("{}_trainreport.csv".format(save_path.replace(".pth.tar", "")))
    val_report.to_csv("{}_valreport.csv".format(save_path.replace(".pth.tar", "")))
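# save_checkpoint() is called above but not shown. The sketch below is an assumption of
# the common pattern such a helper follows: serialise the state dict on every epoch and
# keep a separate copy whenever the validation loss improves.
import shutil
import torch


def save_checkpoint(state, is_best, save_path):
    torch.save(state, save_path)
    if is_best:
        # Preserve a copy of the best-performing weights alongside the latest checkpoint.
        shutil.copyfile(save_path, save_path.replace('.pth.tar', '_best.pth.tar'))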