def load_data(data_dir):
    """ Function that takes a folder, finds all .jpg files inside
    the folder, and creates a dataframe. """
    # Reproducibility
    myutils.myseed(seed=42)

    # Get the image paths
    filenames = myutils.run_fast_scandir(data_dir, [".jpg"])
    df = pd.DataFrame(data=filenames, columns=['filenames'])

    # Get the label from the nth folder starting from the parent:
    # fname = '/scratch/s181423_data/data_bin/label/image.jpg'
    outlevel = 4
    df['label'] = df['filenames'].apply(lambda x: x.split('/')[outlevel])

    # Get the id from the basename
    df['id'] = df['filenames'].apply(lambda x: os.path.basename(x))

    # Get the label as one-hot encoded columns
    df = df.set_index(['id', 'filenames'])
    df['label'] = df['label'].astype('category')
    df = pd.get_dummies(df, prefix='', prefix_sep='')
    df = df.reset_index()

    # Save the data as a .csv file
    df.to_csv(f'{data_dir}.csv', index=False)
    logging_data_process.info(f'Saved: {data_dir}.csv')
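# A toy sanity check for the one-hot step in load_data above. Hedged:
# 'nails' and 'skin' are made-up labels, not classes from the real dataset,
# and recent pandas versions emit boolean rather than 0/1 dummy columns.
def _demo_one_hot():
    toy = pd.DataFrame({'label': ['skin', 'nails', 'skin']})
    toy['label'] = toy['label'].astype('category')
    # Expands the single 'label' column into one indicator column per
    # category, exactly as load_data does before writing the .csv
    print(pd.get_dummies(toy, prefix='', prefix_sep=''))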
def get_loaders(dfs, size=100, batch_size=1, num_workers=1):
    """ Function that takes a dictionary of dataframes and returns
    two dictionaries of pytorch dataloaders and dataset_sizes. """
    # Reproducibility
    myutils.myseed(seed=42)

    # Custom pytorch dataset for this data
    class Derm(Dataset):
        """ Read a pandas dataframe with image paths and labels. """
        def __init__(self, df, transform=None):
            self.df = df
            self.transform = transform

        def __len__(self):
            return len(self.df)

        def __getitem__(self, index):
            try:
                # Load the image data and get the label
                X = Image.open(self.df['image_path'][index]).convert('RGB')
                y = torch.tensor(int(self.df['label_code'][index]))
                # Sanity check
                print(f"{self.df['image_path'][index]}\t{self.df['label_code'][index]}")
            except IOError as err:
                # Re-raise instead of silently passing: a bare `pass`
                # would leave X and y undefined and crash below
                raise IOError(f"Could not read {self.df['image_path'][index]}") from err
            if self.transform:
                X = self.transform(X)
            return X, y

    # ImageNet statistics
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    # Transforms
    data_transforms = {
        'train': transforms.Compose([transforms.Resize(size),
                                     transforms.CenterCrop((size, size)),
                                     transforms.RandomHorizontalFlip(),
                                     transforms.ToTensor(),
                                     transforms.Normalize(mean, std)]),
        'val': transforms.Compose([transforms.Resize(size),
                                   transforms.CenterCrop((size, size)),
                                   transforms.ToTensor(),
                                   transforms.Normalize(mean, std)])}

    # Sets
    image_datasets = {x: Derm(dfs[x], transform=data_transforms[x])
                      for x in dfs.keys()}
    # Sizes
    dataset_sizes = {x: len(image_datasets[x]) for x in dfs.keys()}
    # Loaders: use keyword arguments; passing num_workers positionally
    # would be read by DataLoader as its third parameter, shuffle
    dataloaders = {x: DataLoader(image_datasets[x], batch_size=batch_size,
                                 num_workers=num_workers)
                   for x in dfs.keys()}

    return dataloaders, dataset_sizes
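# A minimal usage sketch for get_loaders, assuming .csv files that contain
# the 'image_path' and 'label_code' columns Derm reads; the paths below
# are placeholders, not files from this repo.
def _demo_get_loaders():
    dfs = {'train': pd.read_csv('data/train.csv'),
           'val': pd.read_csv('data/val.csv')}
    dataloaders, dataset_sizes = get_loaders(dfs, size=224, batch_size=32,
                                             num_workers=4)
    X, y = next(iter(dataloaders['train']))
    # With size=224 and batch_size=32 this should print
    # torch.Size([32, 3, 224, 224]) torch.Size([32])
    print(X.shape, y.shape, dataset_sizes)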
def get_all_weights(data_dir, folds):
    """ Function that creates a dictionary with the class weights
    for each training fold. """
    # Reproducibility
    myutils.myseed(seed=42)

    weights_dict = {}
    for fold in range(1, folds + 1):
        path = os.path.join(data_dir, f'train{fold}.csv')
        train = pd.read_csv(path)
        weights = myutils.get_weights(train)
        weights_dict[f'weights{fold}'] = weights

    # Save weights_dict to a .json file
    with open('dicts/weights_dict.json', 'w') as f:
        f.write(json.dumps(weights_dict))
    logging_data_process.info('Saved: dicts/weights_dict.json')
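# myutils.get_weights is defined elsewhere and not shown here. As an
# assumption of what it might compute, this is a common inverse-frequency
# sketch that weights each class by n_samples / (n_classes * class_count);
# rarer classes get larger weights, e.g. for nn.CrossEntropyLoss(weight=...).
def _demo_class_weights(train):
    counts = train['label'].value_counts().sort_index()
    weights = len(train) / (len(counts) * counts)
    return weights.tolist()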
def load_data(data_dir):
    """ Function that takes a folder, finds all .jpg files inside
    the folder, and creates a dataframe. """
    # Reproducibility
    myutils.myseed(seed=42)

    # Get the image paths
    filenames = myutils.run_fast_scandir(data_dir, [".jpg"])
    df = pd.DataFrame(data=filenames, columns=['filenames'])

    # Get the label from the nth folder starting from the parent:
    # fname = '/scratch/s181423/data/label/image.jpg'
    outlevel = 4
    df['label'] = df['filenames'].apply(lambda x: x.split('/')[outlevel])

    # Resample the minority classes so they appear in both training and testing
    counts_df = pd.DataFrame(df['label'].value_counts())
    labels_with_one_example = list(counts_df[counts_df['label'] < 2].index)
    duplicates_df = df[df['label'].isin(labels_with_one_example)]
    # Quintuplicate these rows for stratification: 20% of 5 is 1, so an
    # 80/20 split gets at least one copy on each side
    df_copy = duplicates_df
    df = pd.concat([df, df_copy, df_copy, df_copy, df_copy])

    # Get the id from the basename
    df['id'] = df['filenames'].apply(lambda x: os.path.basename(x))

    # Encode the label as integer codes and keep the mapping
    df = df.set_index(['id', 'filenames'])
    df['label'] = df['label'].astype('category')
    mapping = dict(enumerate(df['label'].cat.categories))
    df['label'] = pd.Categorical(df['label']).codes
    #df = pd.get_dummies(df, prefix='', prefix_sep='')
    df = df.reset_index()

    # Save the data as a .csv file
    df.to_csv(f'{data_dir}.csv', index=False)
    logging_data_process.info(f'Saved: {data_dir}.csv')

    # Save the mappings as a .json file
    with open('dicts/mapping.json', 'w') as f:
        f.write(json.dumps(mapping))
    logging_data_process.info('Saved: dicts/mapping.json')
def load_data(data_dir):
    """ Function that takes a folder, finds all .jpg files inside
    the folder, and creates a dataframe. """
    # Reproducibility
    myutils.myseed(seed=42)

    # Get the image paths
    filenames = myutils.run_fast_scandir(data_dir, [".jpg"])
    df1 = pd.DataFrame(data=filenames, columns=['image_path'])
    df1['image_id'] = df1['image_path'].apply(
        lambda x: os.path.splitext(os.path.basename(x))[0])
    df1 = df1.set_index('image_id')

    # Get the labels
    fname = os.path.join(data_dir, 'labels.csv')
    df2 = pd.read_csv(fname)
    df2 = df2.set_index('image')

    # Do not move this function from here: it closes over df2's columns
    def get_disease(row):
        for c in df2.columns:
            if row[c] == 1:
                return c

    # Collapse the one-hot label columns into a single 'label' column
    df2 = df2.apply(get_disease, axis=1).to_frame(name='label')
    df = pd.merge(df1, df2, left_index=True, right_index=True)
    df['label'] = df['label'].astype('category')
    mapping = dict(enumerate(df['label'].cat.categories))
    df['label_code'] = pd.Categorical(df['label']).codes

    # Save the data as a .csv file
    df.to_csv(f'{data_dir}.csv', index=False)
    logging_data_process.info(f'Saved: {data_dir}.csv')

    # Save the mapping as a .json file
    with open(f'{data_dir}.json', 'w') as f:
        f.write(json.dumps(mapping))
    logging_data_process.info(f'Saved: {data_dir}.json')
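# get_disease scans each row of the one-hot label frame for the column
# holding a 1. An equivalent vectorized alternative (a sketch, not what
# this file uses) is idxmax, which returns the name of the first column
# holding the row maximum:
def _demo_reverse_one_hot(df2):
    return df2.idxmax(axis=1).to_frame(name='label')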
def eval(file, dataloaders, dataset_sizes, net):
    """ Evaluate a net. """
    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the network and restore settings from the .tar file
    net = net.to(device)
    fname = f'{args.restore_file}.tar'
    restore_path = os.path.join(args.model_dir, fname)
    checkpoint = torch.load(restore_path)
    net.load_state_dict(checkpoint['net_state_dict'])
    net.eval()

    # Validation phase
    phase = 'val'
    with torch.no_grad():
        #indexes, predictions, probabilities, all_probabilities, in_labels, in_targets = [],[],[],[],[],[]
        indexes, predictions, probabilities, all_probabilities, in_labels = [], [], [], [], []
        for index, inputs, labels in tqdm(dataloaders[phase]):
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = net(inputs)
            #_, targets = torch.max(labels, 1)
            probs, preds = torch.max(outputs, 1)
            indexes.extend(index.cpu().detach().numpy())
            all_probabilities.extend(outputs.cpu().detach().numpy())
            probabilities.extend(probs.cpu().detach().numpy())
            predictions.extend(preds.cpu().detach().numpy())
            in_labels.extend(labels.cpu().detach().numpy())
            #in_targets.extend(targets.cpu().detach().numpy())

    #return indexes, probabilities, predictions, all_probabilities, in_labels, in_targets
    return indexes, probabilities, predictions, all_probabilities, in_labels
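# Note that eval's 'probabilities' are raw scores: torch.max runs over the
# unnormalized network outputs. If the net's final layer is not already a
# softmax, a sketch of converting the returned all_probabilities
# (per-sample logit rows) into probabilities that sum to 1:
def _demo_softmax(all_probabilities):
    logits = torch.tensor(np.array(all_probabilities))
    probs = torch.softmax(logits, dim=1)  # each row now sums to 1
    return probs.max(dim=1)  # (max probability, predicted class) per sample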
def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
               scheduler, net_name, num_epochs):
    """ Train and evaluate a net. """
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'lr{fold}.log')
    logging_lr = myutils.setup_logger(fname)

    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    net = net.to(device)
    #best_net_wts = copy.deepcopy(net.state_dict())
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
        try:
            # Load last settings from the .tar file
            last_checkpoint = torch.load(last_path)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            # Since the last epoch was saved, start with the next one
            epoch = last_checkpoint['epoch'] + 1
            logging_process.info(
                f'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, '
                f'resuming training from epoch: {epoch}')
            # Load best settings from the .tar file
            best_checkpoint = torch.load(best_path)
            #best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']
        except FileNotFoundError as err:
            # This happens with cross-validation folds: if training was
            # interrupted on fold 1, the best checkpoint for fold 2 does
            # not exist yet, so fall back to fresh initial settings.
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # TRAINING LOOP
    for epoch in range(epoch, num_epochs + 1):
        print(f'Epoch {epoch}/{num_epochs}')
        logging_train.info(f'Epoch {epoch}/{num_epochs}')

        # Each epoch has a training phase and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # Set net to training mode
                mylr_value = optimizer.param_groups[0]['lr']
                logging_lr.info(f'Epoch {epoch}\tlr: {mylr_value}')
            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for index, inputs, labels in tqdm(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward; track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    _, targets = torch.max(labels, 1)
                    _, preds = torch.max(outputs, 1)
                    #if net_name.startswith('vgg16_ft_no_soft'):
                    #    outputs = torch.reshape(outputs, (-1,))  # reshape added for binary
                    #    loss = criterion(outputs, targets.float())  # float added for binary
                    #else:
                    loss = criterion(outputs, targets)

                    # Backward + optimize only in the training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Batch statistics
                running_loss += loss.detach().item() * inputs.size(0)  # batch loss
                running_corrects += torch.sum(preds == targets.data)  # batch accuracy

            # efficientnetb: step the scheduler every training epoch
            if net_name.startswith('efficientnetb'):
                if phase == 'train':
                    scheduler.step()
            # inceptionv: step the scheduler every second training epoch
            if net_name.startswith('inceptionv'):
                if phase == 'train':
                    if (epoch % 2) == 0:
                        scheduler.step()

            # Epoch statistics
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            logging_train.info('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'val':
                # Save last settings to the .tar file
                torch.save(
                    {
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss
                    }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    #best_net_wts = net.state_dict()
                    # Save best settings to the .tar file
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': net.state_dict(),  #best_net_wts
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': epoch_loss,
                            'acc': best_acc
                        }, best_path)
                    # Save best settings to a .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        f.write(json.dumps(best_metrics))

                # vgg: plateau scheduler on validation accuracy
                if net_name.startswith('vgg'):
                    scheduler.step(epoch_acc)
                # resnet: plateau scheduler on validation loss
                if net_name.startswith('resnet'):
                    scheduler.step(epoch_loss)

    print('Best val Acc: {:4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:4f}'.format(
        args.model_dir, fold, best_acc))
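# train_eval above calls scheduler.step() with no argument for
# efficientnetb/inceptionv and with a metric for vgg/resnet, so the
# scheduler passed in must match net_name. A sketch of compatible
# constructions (assumed pairings, not taken from this repo):
def _demo_make_scheduler(optimizer, net_name):
    from torch.optim import lr_scheduler
    if net_name.startswith(('efficientnetb', 'inceptionv')):
        # step() with no metric: decay the lr on a fixed schedule
        return lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
    if net_name.startswith('vgg'):
        # step(epoch_acc): reduce the lr when val accuracy stops rising
        return lr_scheduler.ReduceLROnPlateau(optimizer, mode='max')
    if net_name.startswith('resnet'):
        # step(epoch_loss): reduce the lr when val loss stops falling
        return lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')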
def data_split(data_dir, folds):
    """ Function that takes a data_dir and a number of folds, and splits
    the images in data_dir into training (80%) and testing (20%) data.
    For fit.py the training data is further split into training and
    validation sets. If cross validation is needed, the training data is
    also split into train and validation folds. """
    # Reproducibility
    myutils.myseed(seed=42)
    seed = 42

    # Load the data with image paths and labels
    df = pd.read_csv(f'{data_dir}.csv')
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    logging_data_process.info(f"all data size:{len(df)}")

    # Test
    train_val, test = train_test_split(df, test_size=0.2, random_state=seed,
                                       shuffle=True)
    train_val, test = train_val.reset_index(drop=True), test.reset_index(drop=True)
    test.to_csv(os.path.join(data_dir, 'test.csv'), index=False)
    logging_data_process.info(f"test size:{len(test)}")
    logging_data_process.info(f"train_val size:{len(train_val)}")
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'test.csv')}")

    # Train and validation
    #X = train_val[['id','filenames']]
    #y = train_val.iloc[:,2:].apply(lambda x: np.argmax(x), axis=1)  # argmax is necessary for stratification
    # Categories did not allow stratify here because some classes have just 1 example
    train, val = train_test_split(train_val, test_size=0.2, random_state=seed,
                                  shuffle=True)
    train, val = train.reset_index(drop=True), val.reset_index(drop=True)
    train.to_csv(os.path.join(data_dir, 'train.csv'), index=False)
    val.to_csv(os.path.join(data_dir, 'val.csv'), index=False)
    logging_data_process.info(f"train size:{len(train)}")
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'train.csv')}")
    logging_data_process.info(f"val size:{len(val)}")
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'val.csv')}")

    # Cross validation folds
    if folds > 1:
        logging_data_process.info(f'Folds: {folds}')
        X = train_val[['id', 'filenames']]
        # argmax over the one-hot columns is necessary for stratification
        y = train_val.iloc[:, 2:].apply(lambda x: np.argmax(x), axis=1)
        skf = StratifiedKFold(n_splits=folds, random_state=seed, shuffle=True)
        fold = 0
        for train_idx, val_idx in skf.split(X, y):
            fold += 1
            train_idx, val_idx = list(train_idx), list(val_idx)
            train, val = train_val.iloc[train_idx, :], train_val.iloc[val_idx, :]
            train, val = train.reset_index(drop=True), val.reset_index(drop=True)
            train.to_csv(os.path.join(data_dir, f'train{fold}.csv'), index=False)
            val.to_csv(os.path.join(data_dir, f'val{fold}.csv'), index=False)
            logging_data_process.info(f"train{fold} size:{len(train)}")
            logging_data_process.info(
                f"Saved: {os.path.join(data_dir, f'train{fold}.csv')}")
            logging_data_process.info(f"val{fold} size:{len(val)}")
            logging_data_process.info(
                f"Saved: {os.path.join(data_dir, f'val{fold}.csv')}")
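# data_split derives stratification labels from the one-hot columns with a
# row-wise argmax. A toy check of that step with made-up classes 'a'/'b',
# equivalent to the .apply(lambda x: np.argmax(x), axis=1) used above:
def _demo_stratify_labels():
    toy = pd.DataFrame({'id': [1, 2], 'filenames': ['x.jpg', 'y.jpg'],
                        'a': [1, 0], 'b': [0, 1]})
    return toy.iloc[:, 2:].values.argmax(axis=1)  # -> array([0, 1])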
def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
               scheduler, net_name, num_epochs):
    """ Train and evaluate a net. """
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'lr{fold}.log')
    logging_lr = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'bins{fold}.log')
    logging_bins = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'cats{fold}.log')
    logging_cats = myutils.setup_logger(fname)

    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    net = net.to(device)
    #best_net_wts = copy.deepcopy(net.state_dict())
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
        try:
            # Load last settings from the .tar file
            last_checkpoint = torch.load(last_path)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            # Since the last epoch was saved, start with the next one
            epoch = last_checkpoint['epoch'] + 1
            logging_process.info(
                f'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, '
                f'resuming training from epoch: {epoch}')
            # Load best settings from the .tar file
            best_checkpoint = torch.load(best_path)
            #best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']
        except FileNotFoundError as err:
            # This happens with cross-validation folds: if training was
            # interrupted on fold 1, the best checkpoint for fold 2 does
            # not exist yet, so fall back to fresh initial settings.
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # TRAINING LOOP
    for epoch in range(epoch, num_epochs + 1):
        print(f'Epoch {epoch}/{num_epochs}')

        # To track values in each epoch
        tloss, tacc, vloss, vacc = '', '', '', ''
        tloss0, tacc0, vloss0, vacc0 = '', '', '', ''
        tloss1, tacc1, vloss1, vacc1 = '', '', '', ''

        # Each epoch has a training phase and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # Set net to training mode
                # Track the learning rate for the plot
                mylr_value = optimizer.param_groups[0]['lr']
                logging_lr.info(f'Epoch {epoch}\tlr: {mylr_value}')
            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss0 = 0.0
            running_loss1 = 0.0
            running_corrects0 = 0
            running_corrects1 = 0

            # Iterate over data
            for index, inputs, bins_labels, cats_labels in tqdm(
                    dataloaders[phase]):
                inputs = inputs.to(device)
                bins_labels = bins_labels.to(device)
                cats_labels = cats_labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward; track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs_bins, outputs_cats = net(inputs)
                    #outputs_bins = torch.reshape(outputs_bins, (-1,))  # reshape added for binary
                    outputs_bins = outputs_bins.to(device)
                    outputs_cats = outputs_cats.to(device)
                    #loss0 = criterion[0](outputs_bins, bins_labels.float())  # float added for binary
                    loss0 = criterion[0](outputs_bins, bins_labels)
                    loss1 = criterion[1](outputs_cats, cats_labels)
                    # Weight each loss by its share of the 307 classes
                    # (2 binary + 305 categories)
                    loss0 = loss0 * (2 / 307)
                    loss1 = loss1 * (305 / 307)
                    #loss0 = loss0 * (2/306)
                    #loss1 = loss1 * (304/306)

                    # Backward + optimize only in the training phase
                    if phase == 'train':
                        loss = (loss0 + loss1) / 2
                        loss.backward()
                        optimizer.step()

                # Batch statistics
                running_loss0 += loss0.detach().item() * inputs.size(0)
                running_loss1 += loss1.detach().item() * inputs.size(0)
                #running_corrects0 += torch.sum(torch.round(outputs_bins) == bins_labels.data)
                running_corrects0 += torch.sum(
                    torch.max(outputs_bins, 1)[1] == bins_labels.data)
                running_corrects1 += torch.sum(
                    torch.max(outputs_cats, 1)[1] == cats_labels.data)

            # efficientnetb
            #if net_name.startswith('efficientnetb'):
            #    if phase == 'train':
            #        scheduler.step()
            # inceptionv
            #if net_name.startswith('inceptionv'):
            #    if phase == 'train':
            #        if (epoch % 2) == 0:
            #            scheduler.step()

            # Epoch statistics
            epoch_loss0 = running_loss0 / dataset_sizes[phase]
            epoch_loss1 = running_loss1 / dataset_sizes[phase]
            epoch_loss = epoch_loss0 + epoch_loss1
            epoch_acc0 = (running_corrects0.double() /
                          dataset_sizes[phase]) * (2 / 307)
            epoch_acc1 = (running_corrects1.double() /
                          dataset_sizes[phase]) * (305 / 307)
            epoch_acc = (epoch_acc0 + epoch_acc1) / 2

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            print('{} bin_loss: {:.4f} bin_acc: {:.4f}'.format(
                phase, epoch_loss0, epoch_acc0))
            print('{} cat_loss: {:.4f} cat_acc: {:.4f}'.format(
                phase, epoch_loss1, epoch_acc1))

            if phase == 'train':
                tloss = epoch_loss
                tloss0 = epoch_loss0
                tloss1 = epoch_loss1
                tacc = epoch_acc
                tacc0 = epoch_acc0
                tacc1 = epoch_acc1

            if phase == 'val':
                vloss = epoch_loss
                vloss0 = epoch_loss0
                vloss1 = epoch_loss1
                vacc = epoch_acc
                vacc0 = epoch_acc0
                vacc1 = epoch_acc1

                logging_train.info(
                    'Epoch: {}\ttloss: {:.4f}\ttacc: {:.4f}\tvloss: {:.4f}\tvacc: {:.4f}'
                    .format(epoch, tloss, tacc, vloss, vacc))
                logging_bins.info(
                    'Epoch: {}\ttloss: {:.4f}\ttacc: {:.4f}\tvloss: {:.4f}\tvacc: {:.4f}'
                    .format(epoch, tloss0, tacc0, vloss0, vacc0))
                logging_cats.info(
                    'Epoch: {}\ttloss: {:.4f}\ttacc: {:.4f}\tvloss: {:.4f}\tvacc: {:.4f}'
                    .format(epoch, tloss1, tacc1, vloss1, vacc1))

                # Save last settings to the .tar file
                torch.save(
                    {
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss
                    }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    #best_net_wts = net.state_dict()
                    # Save best settings to the .tar file
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': net.state_dict(),  #best_net_wts
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': epoch_loss,
                            'acc': best_acc
                        }, best_path)
                    # Save best settings to a .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        f.write(json.dumps(best_metrics))

                # vgg
                if net_name.startswith('vgg'):
                    scheduler.step(epoch_acc)
                # resnet
                #if net_name.startswith('resnet'):
                #    scheduler.step(epoch_loss)

    print('Best val Acc: {:4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:4f}'.format(
        args.model_dir, fold, best_acc))
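# The multi-task train_eval above expects net(inputs) to return two
# outputs, one per head. A minimal sketch of such a net; the architecture
# is assumed, not the repo's, and the 2/305 head sizes are only inferred
# from the 2/307 and 305/307 loss weights above.
from torch import nn

class _DemoTwoHeadNet(nn.Module):
    def __init__(self, backbone, n_features, n_bins=2, n_cats=305):
        super().__init__()
        self.backbone = backbone  # any feature extractor returning n_features
        self.head_bins = nn.Linear(n_features, n_bins)  # skin / not skin
        self.head_cats = nn.Linear(n_features, n_cats)  # disease categories

    def forward(self, x):
        features = self.backbone(x)
        return self.head_bins(features), self.head_cats(features)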
def get_loaders(dfs, mean, std, size, batch_size, num_workers):
    """ Function that takes a dictionary of dataframes and returns
    two dictionaries of pytorch dataloaders and dataset_sizes. """
    # Reproducibility
    myutils.myseed(seed=42)

    # Custom pytorch dataset for this data
    class Derm(Dataset):
        """ Read a pandas dataframe with image paths and labels. """
        def __init__(self, df, transform=None):
            self.df = df
            self.transform = transform

        def __len__(self):
            return len(self.df)

        def __getitem__(self, index):
            try:
                # Load the image data and get the one-hot label row
                X = Image.open(self.df['filenames'][index]).convert('RGB')
                y = torch.tensor(self.df.iloc[index, 2:])
            except IOError as err:
                # Re-raise instead of silently passing: a bare `pass`
                # would leave X and y undefined and crash below
                raise IOError(f"Could not read {self.df['filenames'][index]}") from err
            if self.transform:
                X = self.transform(X)
            # Sanity check
            #print('id:', self.df['id'][index], 'label', y)
            return index, X, y

    # Transforms (identical for all phases: no augmentation here)
    data_transforms = {
        'train': transforms.Compose([transforms.Resize(size),
                                     transforms.CenterCrop((size, size)),
                                     transforms.ToTensor(),
                                     transforms.Normalize(mean, std)]),
        'val': transforms.Compose([transforms.Resize(size),
                                   transforms.CenterCrop((size, size)),
                                   transforms.ToTensor(),
                                   transforms.Normalize(mean, std)]),
        'test': transforms.Compose([transforms.Resize(size),
                                    transforms.CenterCrop((size, size)),
                                    transforms.ToTensor(),
                                    transforms.Normalize(mean, std)]),
        'unknown': transforms.Compose([transforms.Resize(size),
                                       transforms.CenterCrop((size, size)),
                                       transforms.ToTensor(),
                                       transforms.Normalize(mean, std)])}

    # Sets
    image_datasets = {x: Derm(dfs[x], transform=data_transforms[x])
                      for x in dfs.keys()}
    # Sizes
    dataset_sizes = {x: len(image_datasets[x]) for x in dfs.keys()}
    # Loaders: use keyword arguments; passing num_workers positionally
    # would be read by DataLoader as its third parameter, shuffle
    dataloaders = {x: DataLoader(image_datasets[x], batch_size=batch_size,
                                 num_workers=num_workers, pin_memory=False)
                   for x in dfs.keys()}

    return dataloaders, dataset_sizes
def data_split(data_dir, folds):
    """ Function that takes a data_dir and a number of folds, and splits
    the images in data_dir into training (80%) and testing (20%) data.
    For fit.py the training data is further split into training and
    validation sets. If cross validation is needed, the training data is
    also split into train and validation folds. """
    # Reproducibility
    myutils.myseed(seed=42)
    seed = 42

    # Load the data with image paths and labels
    df = pd.read_csv(f'{data_dir}.csv')
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Test
    train_val, test = train_test_split(df, test_size=0.2, random_state=seed,
                                       shuffle=True)
    train_val, test = train_val.reset_index(drop=True), test.reset_index(drop=True)
    test.to_csv(os.path.join(data_dir, 'test.csv'), index=False)
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'test.csv')}")

    # Train and validation
    train, val = train_test_split(train_val, test_size=0.2, random_state=seed,
                                  shuffle=True)
    # Reset train, not train_val: reusing train_val here would silently
    # write the whole 80% split into train.csv
    train, val = train.reset_index(drop=True), val.reset_index(drop=True)
    train.to_csv(os.path.join(data_dir, 'train.csv'), index=False)
    val.to_csv(os.path.join(data_dir, 'val.csv'), index=False)
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'train.csv')}")
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'val.csv')}")

    # Cross validation folds
    if folds > 1:
        logging_data_process.info(f'Folds: {folds}')
        X = train_val[['image_path']]
        y = train_val[['label_code']]
        skf = StratifiedKFold(n_splits=folds, random_state=seed, shuffle=True)
        fold = 0
        for train_idx, val_idx in skf.split(X, y):
            fold += 1
            train_idx, val_idx = list(train_idx), list(val_idx)
            train, val = train_val.iloc[train_idx, :], train_val.iloc[val_idx, :]
            train, val = train.reset_index(drop=True), val.reset_index(drop=True)
            train.to_csv(os.path.join(data_dir, f'train{fold}.csv'), index=False)
            val.to_csv(os.path.join(data_dir, f'val{fold}.csv'), index=False)
            logging_data_process.info(
                f"Saved: {os.path.join(data_dir, f'train{fold}.csv')}")
            logging_data_process.info(
                f"Saved: {os.path.join(data_dir, f'val{fold}.csv')}")
def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
               scheduler, num_epochs):
    """ Train and evaluate a net. """
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)

    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    net = net.to(device)
    best_net_wts = copy.deepcopy(net.state_dict())
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
        try:
            # Load last settings from the .tar file
            last_checkpoint = torch.load(last_path)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            # Since the last epoch was saved, start with the next one
            epoch = last_checkpoint['epoch'] + 1
            logging_process.info(
                f'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, '
                f'resuming training from epoch: {epoch}')
            # Load best settings from the .tar file
            best_checkpoint = torch.load(best_path)
            best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']
        except FileNotFoundError as err:
            # This happens with cross-validation folds: if training was
            # interrupted on fold 1, the best checkpoint for fold 2 does
            # not exist yet, so fall back to fresh initial settings.
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # Initialize early stop settings
    best_val_loss, epochs_no_improve, patience = np.inf, 0, 5

    # TRAINING LOOP
    for epoch in range(epoch, num_epochs + 1):
        # Early stop
        if epochs_no_improve == patience:
            print('Early stop')
            logging_process.info(
                f'Model: {args.model_dir}\tFold:{fold}\tEarly stop: {epoch}')
            break

        print(f'Epoch {epoch}/{num_epochs}')
        logging_train.info(f'Epoch {epoch}/{num_epochs}')

        # Each epoch has a training phase and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # Set net to training mode
            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward; track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    probs, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # Backward + optimize only in the training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Batch statistics
                running_loss += loss.item() * inputs.size(0)  # batch loss
                running_corrects += torch.sum(preds == labels.data)  # batch accuracy

            if phase == 'train':
                scheduler.step()

            # Epoch statistics
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            logging_train.info('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'val':
                # Best loss tracking for early stop
                if epoch_loss < best_val_loss:
                    best_val_loss = epoch_loss
                    epochs_no_improve = 0
                else:
                    epochs_no_improve += 1

                # Save last settings to the .tar file
                torch.save(
                    {
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss
                    }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_net_wts = net.state_dict()
                    # Save best settings to the .tar file
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': best_net_wts,
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': epoch_loss,
                            'acc': best_acc
                        }, best_path)
                    # Save best settings to a .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        f.write(json.dumps(best_metrics))

    print('Best val Acc: {:4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:4f}'.format(
        args.model_dir, fold, best_acc))
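# A sketch of reloading the best checkpoint that train_eval saves; the
# model_dir layout and key names are taken from the torch.save calls above.
def _demo_restore_best(net, model_dir, fold=1):
    checkpoint = torch.load(os.path.join(model_dir, f'best{fold}.tar'))
    net.load_state_dict(checkpoint['net_state_dict'])
    net.eval()  # switch to inference mode before evaluating
    return net, checkpoint['acc']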
def load_data(data_dir):
    """ Function that takes a folder, finds all .jpg files inside
    the folder, and creates a dataframe. """
    # Reproducibility
    myutils.myseed(seed=42)

    # Get the image paths
    filenames = myutils.run_fast_scandir(data_dir, [".jpg"])

    # BINARY
    dfa = pd.DataFrame(data=filenames, columns=['filenames'])
    # Get the binary label from the nth folder starting from the parent:
    # fname = '/scratch/s181423_data/data_bin/label/image.jpg'
    outlevel = 4
    dfa['label0'] = dfa['filenames'].apply(lambda x: x.split('/')[outlevel])
    # Get the id from the basename
    dfa['id'] = dfa['filenames'].apply(lambda x: os.path.basename(x))
    dfa = dfa.set_index(['id', 'filenames'])
    #dfa['label0'] = dfa['label0'].astype('category')

    # Create a subset of skin images to get the disease categories
    # only for these pictures
    df_skin_only = dfa[dfa['label0'] == 'skin']

    # CATEGORIES
    dfb = pd.DataFrame(data=filenames, columns=['filenames'])
    # Get the category label from the nth folder starting from the parent:
    # fname = '/scratch/s181423_data/data_bin/label/image.jpg'
    outlevel = 5
    dfb['label1'] = dfb['filenames'].apply(lambda x: x.split('/')[outlevel])
    # Get the id from the basename
    dfb['id'] = dfb['filenames'].apply(lambda x: os.path.basename(x))
    dfb = dfb.set_index(['id', 'filenames'])
    #dfb['label1'] = dfb['label1'].astype('category')

    # Get disease categories only for the skin images
    df_diseases = pd.concat([df_skin_only, dfb], axis=1, sort=False,
                            join='inner').drop(['label0'], axis=1)

    # Join binary and categories labels, encode them as integer codes,
    # and keep both mappings
    df = pd.concat([dfa, df_diseases], axis=1, sort=False)
    df = df.fillna('AAA')
    df['label0'] = df['label0'].astype('category')
    df['label1'] = df['label1'].astype('category')
    mapping = {}
    mapping_binary = dict(enumerate(df['label0'].cat.categories))
    mapping_categories = dict(enumerate(df['label1'].cat.categories))
    df['label0'] = pd.Categorical(df['label0']).codes
    df['label1'] = pd.Categorical(df['label1']).codes
    mapping['mapping_binary'] = mapping_binary
    mapping['mapping_categories'] = mapping_categories
    #df = pd.get_dummies(df, prefix='', prefix_sep='')
    df = df.reset_index()

    # Resample the minority classes so they appear in both training and testing
    counts_df = pd.DataFrame(df['label1'].value_counts())
    labels_with_one_example = list(counts_df[counts_df['label1'] < 2].index)
    duplicates_df = df[df['label1'].isin(labels_with_one_example)]
    # Quintuplicate these rows for stratification: 20% of 5 is 1, so an
    # 80/20 split gets at least one copy on each side
    df_copy = duplicates_df
    df = pd.concat([df, df_copy, df_copy, df_copy, df_copy])

    # Save the data as a .csv file
    df.to_csv(f'{data_dir}.csv', index=False)
    logging_data_process.info(f'Saved: {data_dir}.csv')

    # Save the mappings as a .json file
    with open('dicts/mapping.json', 'w') as f:
        f.write(json.dumps(mapping))
    logging_data_process.info('Saved: dicts/mapping.json')
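# The fillna('AAA') above gives non-skin images a placeholder disease
# label; because 'AAA' sorts before the real category names, it reliably
# becomes category code 0. A toy check with made-up labels:
def _demo_placeholder_code():
    s = pd.Series(['eczema', None, 'acne']).fillna('AAA').astype('category')
    print(dict(enumerate(s.cat.categories)))  # {0: 'AAA', 1: 'acne', 2: 'eczema'}
    print(pd.Categorical(s).codes)            # [2, 0, 1]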
def data_split(data_dir, folds):
    """ Function that takes a data_dir and a number of folds, and splits
    the images in data_dir into training (80%) and testing (20%) data.
    For fit.py the training data is further split into training and
    validation sets. If cross validation is needed, the training data is
    also split into train and validation folds. """
    def rep_sample(df, col, n, *args, **kwargs):
        """ Resample df (with replacement) to n rows spread as evenly
        as possible across the classes in col. """
        nu = df[col].nunique()
        mpb = n // nu
        mku = n - mpb * nu
        fills = np.zeros(nu)
        fills[:mku] = 1
        sample_sizes = (np.ones(nu) * mpb + fills).astype(int)
        gb = df.groupby(col)
        sample = lambda sub_df, i: sub_df.sample(
            sample_sizes[i], *args, **kwargs, replace=True)
        subs = [sample(sub_df, i) for i, (_, sub_df) in enumerate(gb)]
        return pd.concat(subs)

    # Reproducibility
    myutils.myseed(seed=42)
    seed = 42

    # Load the data with image paths and labels
    df = pd.read_csv(f'{data_dir}.csv')
    #df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    logging_data_process.info(f"all data size:{len(df)}")

    # Test
    y = list(df['label1'])
    train_val, test = train_test_split(df, test_size=0.2, random_state=seed,
                                       shuffle=True, stratify=y)
    train_val, test = train_val.reset_index(drop=True), test.reset_index(drop=True)
    # Remove samples used in training from test.csv (duplicated minority
    # rows may land on both sides of the split)
    ids = list(train_val.id)
    test = test[~test.id.isin(ids)]
    print(f'test:{len(test.label1.value_counts())}')
    test.to_csv(os.path.join(data_dir, 'test.csv'), index=False)
    logging_data_process.info(f"test size:{len(test)}")
    logging_data_process.info(f"train_val size:{len(train_val)}")
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'test.csv')}")

    if folds == 1:
        logging_data_process.info(f'Folds: {folds}')
        fold = 1
        # Train and validation
        y = list(train_val['label1'])
        #X = train_val[['id','filenames']]
        #y = train_val.iloc[:,2:].apply(lambda x: np.argmax(x), axis=1)  # argmax is necessary for stratification
        # Categories did not allow stratify here because some classes have just 1 example
        train, val = train_test_split(train_val, test_size=0.2,
                                      random_state=seed, shuffle=True,
                                      stratify=y)
        train, val = train.reset_index(drop=True), val.reset_index(drop=True)
        # Oversample the training set to 1000 rows per category
        size = len(train['label1'].unique()) * 1000
        print(f'size: {size}')
        train = rep_sample(train, 'label1', size)
        train = sklearn.utils.shuffle(train)
        print(f'train{fold}:{len(train.label1.value_counts())}')
        print(f'val{fold}:{len(val.label1.value_counts())}')
        train.to_csv(os.path.join(data_dir, f'train{fold}.csv'), index=False)
        val.to_csv(os.path.join(data_dir, f'val{fold}.csv'), index=False)
        logging_data_process.info(f"train{fold} size:{len(train)}")
        logging_data_process.info(
            f"Saved: {os.path.join(data_dir, f'train{fold}.csv')}")
        logging_data_process.info(f"val{fold} size:{len(val)}")
        logging_data_process.info(
            f"Saved: {os.path.join(data_dir, f'val{fold}.csv')}")

    # Cross validation folds
    if folds > 1:
        print('WARNING: Karen has not implemented this yet!')
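# A toy check of rep_sample: it oversamples with replacement so the n
# returned rows are spread as evenly as possible across the classes in
# col. Assumes rep_sample is lifted to module scope (above it is local to
# data_split); the labels are made up.
def _demo_rep_sample():
    toy = pd.DataFrame({'label1': ['a'] * 8 + ['b'] * 2})
    balanced = rep_sample(toy, 'label1', 10, random_state=42)
    # value_counts() is now exactly {'a': 5, 'b': 5}; the two 'b' rows
    # are repeated to reach their quota
    print(balanced['label1'].value_counts())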
def data_split(data_dir, folds):
    """ Function that takes a data_dir and a number of folds, and splits
    the images in data_dir into training (80%) and testing (20%) data.
    If cross validation is needed, the training data is also split into
    train and validation folds. """
    def rep_sample(df, col, n, *args, **kwargs):
        """ Resample df (with replacement) to n rows spread as evenly
        as possible across the classes in col. """
        nu = df[col].nunique()
        mpb = n // nu
        mku = n - mpb * nu
        fills = np.zeros(nu)
        fills[:mku] = 1
        sample_sizes = (np.ones(nu) * mpb + fills).astype(int)
        gb = df.groupby(col)
        sample = lambda sub_df, i: sub_df.sample(
            sample_sizes[i], *args, **kwargs, replace=True)
        subs = [sample(sub_df, i) for i, (_, sub_df) in enumerate(gb)]
        return pd.concat(subs)

    # Reproducibility
    myutils.myseed(seed=42)
    seed = 42

    # Load the data with image paths and labels
    df = pd.read_csv(f'{data_dir}.csv')
    logging_data_process.info(f"all data size:{len(df)}")

    # Test
    y = list(df['label'])
    train_val, test = train_test_split(df, test_size=0.2, random_state=seed,
                                       shuffle=True, stratify=y)
    train_val, test = train_val.reset_index(drop=True), test.reset_index(drop=True)
    # Remove samples used in training from test.csv (duplicated minority
    # rows may land on both sides of the split)
    ids = list(train_val.id)
    test = test[~test.id.isin(ids)]
    print(f'test:{len(test.label.value_counts())}')
    test.to_csv(os.path.join(data_dir, 'test.csv'), index=False)
    logging_data_process.info(f"test size:{len(test)}")
    logging_data_process.info(f"train_val size:{len(train_val)}")
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'test.csv')}")

    # Single split
    if folds == 1:
        logging_data_process.info(f'Folds: {folds}')
        X = train_val[['id', 'filenames']]
        y = list(train_val['label'])
        n_splits = 2  # Put 2 and break when fold 1 finishes
        #skf = StratifiedKFold(n_splits, random_state=seed, shuffle=True)
        skf = StratifiedShuffleSplit(n_splits, random_state=seed, test_size=0.2)
        fold = 0
        for train_idx, val_idx in skf.split(X, y):
            fold += 1
            train_idx, val_idx = list(train_idx), list(val_idx)
            train, val = train_val.iloc[train_idx, :], train_val.iloc[val_idx, :]
            train, val = train.reset_index(drop=True), val.reset_index(drop=True)
            # To overfit to the first balanced batch
            #size = 304
            #train = rep_sample(train, 'label', size)
            #train = sklearn.utils.shuffle(train)
            print(f'train{fold}:{len(train.label.value_counts())}')
            train.to_csv(os.path.join(data_dir, f'train{fold}.csv'), index=False)
            val.to_csv(os.path.join(data_dir, f'val{fold}.csv'), index=False)
            logging_data_process.info(f"train{fold} size:{len(train)}")
            logging_data_process.info(
                f"Saved: {os.path.join(data_dir, f'train{fold}.csv')}")
            logging_data_process.info(f"val{fold} size:{len(val)}")
            logging_data_process.info(
                f"Saved: {os.path.join(data_dir, f'val{fold}.csv')}")
            break

    # Cross validation folds
    if folds > 1:
        logging_data_process.info(f'Folds: {folds}')
        X = train_val[['id', 'filenames']]
        y = list(train_val['label'])
        #skf = StratifiedKFold(n_splits=folds, random_state=seed, shuffle=True)
        skf = StratifiedShuffleSplit(n_splits=folds, random_state=seed,
                                     test_size=0.2)
        fold = 0
        for train_idx, val_idx in skf.split(X, y):
            fold += 1
            train_idx, val_idx = list(train_idx), list(val_idx)
            train, val = train_val.iloc[train_idx, :], train_val.iloc[val_idx, :]
            train, val = train.reset_index(drop=True), val.reset_index(drop=True)
            # To overfit to the first balanced batch
            #size = 304
            #train = rep_sample(train, 'label', size)
            #train = sklearn.utils.shuffle(train)
            print(f'train{fold}:{len(train.label.value_counts())}')
            train.to_csv(os.path.join(data_dir, f'train{fold}.csv'), index=False)
            val.to_csv(os.path.join(data_dir, f'val{fold}.csv'), index=False)
            logging_data_process.info(f"train{fold} size:{len(train)}")
            logging_data_process.info(
                f"Saved: {os.path.join(data_dir, f'train{fold}.csv')}")
            logging_data_process.info(f"val{fold} size:{len(val)}")
            logging_data_process.info(
                f"Saved: {os.path.join(data_dir, f'val{fold}.csv')}")