# Imports needed by the functions below; `args`, `myutils`, and
# `logging_process` are assumed to be defined at module level elsewhere.
import copy
import json
import os
import sys

import numpy as np
import torch
from tqdm import tqdm


def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
               scheduler, net_name, num_epochs):
    """ Train and evaluate a net. """
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'lr{fold}.log')
    logging_lr = myutils.setup_logger(fname)

    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    net = net.to(device)
    #best_net_wts = copy.deepcopy(net.state_dict())
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
        try:
            # Load last settings from .tar file
            last_checkpoint = torch.load(last_path)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            # The last epoch was saved, so training resumes at the next one
            epoch = last_checkpoint['epoch'] + 1
            logging_process.info(
                f'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, '
                f'resuming training from epoch: {epoch}')

            # Load best settings from .tar file
            best_checkpoint = torch.load(best_path)
            #best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']

        except FileNotFoundError as err:
            # This happens with several folds: if training was interrupted
            # on fold 1, the best checkpoint for fold 2 does not exist yet.
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # TRAINING LOOP
    for epoch in range(epoch, num_epochs + 1):
        print(f'Epoch {epoch}/{num_epochs}')
        logging_train.info(f'Epoch {epoch}/{num_epochs}')

        # Each epoch has a training phase and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # Set net to training mode
                # Track the learning rate for plotting
                mylr_value = optimizer.param_groups[0]['lr']
                logging_lr.info(f'Epoch {epoch}\tlr: {mylr_value}')
            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for index, inputs, labels in tqdm(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward; track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    _, targets = torch.max(labels, 1)
                    _, preds = torch.max(outputs, 1)
                    #if net_name.startswith('vgg16_ft_no_soft'):
                    #    outputs = torch.reshape(outputs, (-1,))  # reshape added for binary
                    #    loss = criterion(outputs, targets.float())  # float added for binary
                    #else:
                    loss = criterion(outputs, targets)

                    # Backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Batch statistics
                running_loss += loss.detach().item() * inputs.size(0)  # Batch loss
                running_corrects += torch.sum(preds == targets.data)  # Batch corrects

            # efficientnetb: step the scheduler once per training epoch
            if net_name.startswith('efficientnetb'):
                if phase == 'train':
                    scheduler.step()

            # inceptionv: step the scheduler every second training epoch
            if net_name.startswith('inceptionv'):
                if phase == 'train':
                    if (epoch % 2) == 0:
                        scheduler.step()

            # Epoch statistics
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            logging_train.info('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'val':
                # Save last settings to .tar file
                torch.save(
                    {
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss
                    }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    #best_net_wts = net.state_dict()

                    # Save best settings to .tar file
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': net.state_dict(),  #best_net_wts
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': epoch_loss,
                            'acc': best_acc
                        }, best_path)

                    # Save best settings to .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        f.write(json.dumps(best_metrics))

                # vgg: plateau scheduler driven by validation accuracy
                if net_name.startswith('vgg'):
                    scheduler.step(epoch_acc)
                # resnet: plateau scheduler driven by validation loss
                if net_name.startswith('resnet'):
                    scheduler.step(epoch_loss)

    print('Best val Acc: {:.4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:.4f}'.format(
        args.model_dir, fold, best_acc))
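# A minimal sketch of the scheduler wiring implied by the step() calls above
# (an assumption for illustration, not the repo's actual setup): vgg steps a
# ReduceLROnPlateau on validation accuracy, resnet on validation loss, and
# efficientnetb/inceptionv use a fixed StepLR schedule. The step_size and
# gamma values are placeholders.
from torch.optim import lr_scheduler

def make_scheduler(net_name, optimizer):
    if net_name.startswith('vgg'):
        # scheduler.step(epoch_acc): higher is better, so mode='max'
        return lr_scheduler.ReduceLROnPlateau(optimizer, mode='max')
    if net_name.startswith('resnet'):
        # scheduler.step(epoch_loss): lower is better, so mode='min'
        return lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')
    # efficientnetb / inceptionv: fixed-schedule decay
    return lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

# Hypothetical usage:
# optimizer = torch.optim.SGD(net.parameters(), lr=1e-3, momentum=0.9)
# scheduler = make_scheduler(net_name, optimizer)
# train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
#            scheduler, net_name, num_epochs)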
def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
               scheduler, net_name, num_epochs):
    """ Train and evaluate a net with two heads (bins and cats). """
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'lr{fold}.log')
    logging_lr = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'bins{fold}.log')
    logging_bins = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'cats{fold}.log')
    logging_cats = myutils.setup_logger(fname)

    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    net = net.to(device)
    #best_net_wts = copy.deepcopy(net.state_dict())
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
        try:
            # Load last settings from .tar file
            last_checkpoint = torch.load(last_path)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            # The last epoch was saved, so training resumes at the next one
            epoch = last_checkpoint['epoch'] + 1
            logging_process.info(
                f'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, '
                f'resuming training from epoch: {epoch}')

            # Load best settings from .tar file
            best_checkpoint = torch.load(best_path)
            #best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']

        except FileNotFoundError as err:
            # This happens with several folds: if training was interrupted
            # on fold 1, the best checkpoint for fold 2 does not exist yet.
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # TRAINING LOOP
    for epoch in range(epoch, num_epochs + 1):
        print(f'Epoch {epoch}/{num_epochs}')

        # To track values in each epoch
        tloss, tacc, vloss, vacc = '', '', '', ''
        tloss0, tacc0, vloss0, vacc0 = '', '', '', ''
        tloss1, tacc1, vloss1, vacc1 = '', '', '', ''

        # Each epoch has a training phase and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # Set net to training mode
                # Track learning rate for plot
                mylr_value = optimizer.param_groups[0]['lr']
                logging_lr.info(f'Epoch {epoch}\tlr: {mylr_value}')
            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss0 = 0.0
            running_loss1 = 0.0
            running_corrects0 = 0
            running_corrects1 = 0

            # Iterate over data
            for index, inputs, bins_labels, cats_labels in tqdm(
                    dataloaders[phase]):
                inputs = inputs.to(device)
                bins_labels = bins_labels.to(device)
                cats_labels = cats_labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward; track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs_bins, outputs_cats = net(inputs)
                    #outputs_bins = torch.reshape(outputs_bins, (-1,))  # reshape added for binary
                    outputs_bins = outputs_bins.to(device)
                    outputs_cats = outputs_cats.to(device)
                    #loss0 = criterion[0](outputs_bins, bins_labels.float())  # float added for binary
                    loss0 = criterion[0](outputs_bins, bins_labels)
                    loss1 = criterion[1](outputs_cats, cats_labels)
                    # Weight each task by its share of the 307 output classes
                    loss0 = loss0 * (2 / 307)
                    loss1 = loss1 * (305 / 307)
                    #loss0 = loss0 * (2/306)
                    #loss1 = loss1 * (304/306)

                    # Backward + optimize only if in training phase
                    if phase == 'train':
                        loss = (loss0 + loss1) / 2
                        loss.backward()
                        optimizer.step()

                # Batch statistics
                running_loss0 += loss0.detach().item() * inputs.size(0)
                running_loss1 += loss1.detach().item() * inputs.size(0)
                #running_corrects0 += torch.sum(torch.round(outputs_bins) == bins_labels.data)
                running_corrects0 += torch.sum(
                    torch.max(outputs_bins, 1)[1] == bins_labels.data)
                running_corrects1 += torch.sum(
                    torch.max(outputs_cats, 1)[1] == cats_labels.data)

            # efficientnetb
            #if net_name.startswith('efficientnetb'):
            #    if phase == 'train':
            #        scheduler.step()

            # inceptionv
            #if net_name.startswith('inceptionv'):
            #    if phase == 'train':
            #        if (epoch % 2) == 0:
            #            scheduler.step()

            # Epoch statistics
            epoch_loss0 = running_loss0 / dataset_sizes[phase]
            epoch_loss1 = running_loss1 / dataset_sizes[phase]
            epoch_loss = epoch_loss0 + epoch_loss1
            epoch_acc0 = (running_corrects0.double() /
                          dataset_sizes[phase]) * (2 / 307)
            epoch_acc1 = (running_corrects1.double() /
                          dataset_sizes[phase]) * (305 / 307)
            epoch_acc = (epoch_acc0 + epoch_acc1) / 2

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            print('{} bin_loss: {:.4f} bin_acc: {:.4f}'.format(
                phase, epoch_loss0, epoch_acc0))
            print('{} cat_loss: {:.4f} cat_acc: {:.4f}'.format(
                phase, epoch_loss1, epoch_acc1))

            if phase == 'train':
                tloss = epoch_loss
                tloss0 = epoch_loss0
                tloss1 = epoch_loss1
                tacc = epoch_acc
                tacc0 = epoch_acc0
                tacc1 = epoch_acc1

            if phase == 'val':
                vloss = epoch_loss
                vloss0 = epoch_loss0
                vloss1 = epoch_loss1
                vacc = epoch_acc
                vacc0 = epoch_acc0
                vacc1 = epoch_acc1

                logging_train.info(
                    'Epoch: {}\ttloss: {:.4f}\ttacc: {:.4f}\tvloss: {:.4f}\tvacc: {:.4f}'
                    .format(epoch, tloss, tacc, vloss, vacc))
                logging_bins.info(
                    'Epoch: {}\ttloss: {:.4f}\ttacc: {:.4f}\tvloss: {:.4f}\tvacc: {:.4f}'
                    .format(epoch, tloss0, tacc0, vloss0, vacc0))
                logging_cats.info(
                    'Epoch: {}\ttloss: {:.4f}\ttacc: {:.4f}\tvloss: {:.4f}\tvacc: {:.4f}'
                    .format(epoch, tloss1, tacc1, vloss1, vacc1))

                # Save last settings to .tar file
                torch.save(
                    {
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss
                    }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    #best_net_wts = net.state_dict()

                    # Save best settings to .tar file
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': net.state_dict(),  #best_net_wts
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': epoch_loss,
                            'acc': best_acc
                        }, best_path)

                    # Save best settings to .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        f.write(json.dumps(best_metrics))

                # vgg
                if net_name.startswith('vgg'):
                    scheduler.step(epoch_acc)
                # resnet
                #if net_name.startswith('resnet'):
                #    scheduler.step(epoch_loss)

    print('Best val Acc: {:.4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:.4f}'.format(
        args.model_dir, fold, best_acc))
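# A minimal sketch of the two-head net the loop above expects: a shared
# backbone with a 2-way 'bins' head and a 305-way 'cats' head (the 2/307 and
# 305/307 weights suggest 307 outputs split this way). The backbone choice,
# names, and sizes are assumptions for illustration, not the repo's model.
import torch.nn as nn
import torchvision.models as models

class TwoHeadNet(nn.Module):
    def __init__(self, n_bins=2, n_cats=305):
        super().__init__()
        backbone = models.resnet18(weights=None)
        n_feats = backbone.fc.in_features
        backbone.fc = nn.Identity()  # expose the shared features
        self.backbone = backbone
        self.head_bins = nn.Linear(n_feats, n_bins)
        self.head_cats = nn.Linear(n_feats, n_cats)

    def forward(self, x):
        feats = self.backbone(x)
        return self.head_bins(feats), self.head_cats(feats)

# Hypothetical usage with the paired criteria indexed as criterion[0]/[1]:
# net = TwoHeadNet()
# criterion = (nn.CrossEntropyLoss(), nn.CrossEntropyLoss())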
args.data_dir), "Could not find the dataset at {}".format( args.data_dir) assert os.path.isdir( args.model_dir), "Could not find the model at {}".format( args.model_dir) assert os.path.isdir( args.net_dir), "Could not find the network at {}".format(args.net_dir) # Initialize main log folder logs_dir_path = os.path.join(os.getcwd(), 'Logs') if not os.path.exists(logs_dir_path): os.mkdir(logs_dir_path) # Initialize main log file log_file = os.path.join(logs_dir_path, 'process.log') logging_process = myutils.setup_logger(log_file, date=True) # Save commandline settings to log script_activated = ' '.join(sys.argv) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") logging_process.info(f'Script: {script_activated}, device: {device}') # Get the experiment parameters params_file = os.path.join(args.model_dir, 'params.json') assert os.path.isfile( params_file), "No json configuration file found at {}".format( params_file) params = myutils.Params(params_file) # FOLD LOOP dfs = {}
def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
               scheduler, num_epochs):
    """ Train and evaluate a net. """
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)

    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    net = net.to(device)
    best_net_wts = copy.deepcopy(net.state_dict())
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
        try:
            # Load last settings from .tar file
            last_checkpoint = torch.load(last_path)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            # The last epoch was saved, so training resumes at the next one
            epoch = last_checkpoint['epoch'] + 1
            logging_process.info(
                f'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, '
                f'resuming training from epoch: {epoch}')

            # Load best settings from .tar file
            best_checkpoint = torch.load(best_path)
            best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']

        except FileNotFoundError as err:
            # This happens with several folds: if training was interrupted
            # on fold 1, the best checkpoint for fold 2 does not exist yet.
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # Initialize early stop settings
    best_val_loss, epochs_no_improve, patience = np.inf, 0, 5

    # TRAINING LOOP
    for epoch in range(epoch, num_epochs + 1):
        # Early stop
        if epochs_no_improve == patience:
            print('Early stop')
            logging_process.info(
                f'Model: {args.model_dir}\tFold: {fold}\tEarly stop: {epoch}')
            break

        print(f'Epoch {epoch}/{num_epochs}')
        logging_train.info(f'Epoch {epoch}/{num_epochs}')

        # Each epoch has a training phase and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # Set net to training mode
            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward; track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    probs, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # Backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Batch statistics
                running_loss += loss.item() * inputs.size(0)  # Batch loss
                running_corrects += torch.sum(preds == labels.data)  # Batch corrects

            if phase == 'train':
                scheduler.step()

            # Epoch statistics
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            logging_train.info('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'val':
                # Best loss tracking for early stop
                if epoch_loss < best_val_loss:
                    best_val_loss = epoch_loss
                    epochs_no_improve = 0
                else:
                    epochs_no_improve += 1

                # Save last settings to .tar file
                torch.save(
                    {
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss
                    }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_net_wts = net.state_dict()

                    # Save best settings to .tar file
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': best_net_wts,
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': epoch_loss,
                            'acc': best_acc
                        }, best_path)

                    # Save best settings to .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        f.write(json.dumps(best_metrics))

    print('Best val Acc: {:.4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:.4f}'.format(
        args.model_dir, fold, best_acc))
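# The patience logic above, factored into a tiny reusable helper; a sketch for
# illustration only (the class name and interface are assumptions, not part of
# this repo).
class EarlyStopper:
    """Signal a stop after `patience` epochs without val-loss improvement."""

    def __init__(self, patience=5):
        self.patience = patience
        self.best_val_loss = float('inf')
        self.epochs_no_improve = 0

    def step(self, val_loss):
        """Record one epoch's validation loss; return True to stop."""
        if val_loss < self.best_val_loss:
            self.best_val_loss = val_loss
            self.epochs_no_improve = 0
        else:
            self.epochs_no_improve += 1
        return self.epochs_no_improve >= self.patience

# Hypothetical usage inside the epoch loop:
# stopper = EarlyStopper(patience=5)
# ...
# if phase == 'val' and stopper.step(epoch_loss):
#     break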