def __init__(self, input_dir, patch_info_file, patch_size, annotations, input_dimension, class_label_map=None, identify=None, img_transform=None, anno_transform=None):
    """Build a patch-level detection dataset backed by zarr slide arrays.

    Parameters
    ----------
    input_dir : str
        Directory containing one ``{slide}.zarr`` array per slide ID.
    patch_info_file : str
        SQL database consumed by ``load_sql_df`` to enumerate patches.
    patch_size : int
        Patch edge length used to select rows from the patch-info DB.
    annotations : pandas.DataFrame
        Brambox-style annotation frame; mutated in place (columns
        ``ignore``, ``class_id`` added; ``width``/``height`` swapped).
    input_dimension :
        Forwarded to the parent dataset's ``__init__``.
    class_label_map : list, optional
        Ordered class labels; positions become ``class_id``. If ``None``,
        derived by sorting the unique labels (may be non-deterministic).
    identify, img_transform, anno_transform : optional
        ``identify`` is currently unused; the transforms are stored for
        later use on images and annotations respectively.
    """
    # Brambox (`bb`) is an optional import resolved at module level.
    if bb is None:
        raise ImportError(
            'Brambox needs to be installed to use this dataset')
    super().__init__(input_dimension)
    # NOTE: this keeps a reference, not a copy — the caller's DataFrame
    # is modified by everything below.
    self.annos = annotations
    self.annos['ignore'] = 0
    self.annos['class_label'] = self.annos['class_label'].astype(int)  #-1
    # Debug output left in by the original author.
    print(self.annos['class_label'].unique())
    #print(self.annos.shape)
    # `image` is assumed to be a pandas Categorical — TODO confirm with caller.
    self.keys = self.annos.image.cat.categories  # stores unique patches
    #print(self.keys)
    self.img_tf = img_transform
    self.anno_tf = anno_transform
    self.patch_info = load_sql_df(patch_info_file, patch_size)
    IDs = self.patch_info['ID'].unique()
    # Lazily-loaded dask arrays, one per slide.
    self.slides = {
        slide: da.from_zarr(join(input_dir, '{}.zarr'.format(slide)))
        for slide in IDs
    }
    # Splits a composite key of the form "slide/..." into its components.
    self.id = lambda k: k.split('/')
    # experiment
    #self.annos['x_top_left'], self.annos['y_top_left']=self.annos['y_top_left'], self.annos['x_top_left']
    # Swap width/height columns (RHS Series are captured before assignment,
    # so this is a true swap) — presumably to fix an axis-order mismatch
    # between the annotation source and the slide arrays; verify upstream.
    self.annos['width'], self.annos['height'] = self.annos[
        'height'], self.annos['width']
    # Add class_ids
    if class_label_map is None:
        log.warning(
            f'No class_label_map given, generating it by sorting unique class labels from data alphabetically, which is not always deterministic behaviour'
        )
        class_label_map = list(np.sort(self.annos.class_label.unique()))
    # Map each class label to its position in class_label_map.
    self.annos['class_id'] = self.annos.class_label.map(
        dict((l, i) for i, l in enumerate(class_label_map)))
patch_size = args.patch_size p_sample = args.p_sample np.random.seed(42) annotation_file = 'annotations_bbox_{}.pkl'.format(patch_size) reference_mask = args.reference_mask if not os.path.exists('widths.pkl'): m = np.load(reference_mask) bbox_df = get_boxes(m) official_widths = dict( bbox_df.groupby('class_label')['width'].mean() + 2 * bbox_df.groupby('class_label')['width'].std()) pickle.dump(official_widths, open('widths.pkl', 'wb')) else: official_widths = pickle.load(open('widths.pkl', 'rb')) patch_info = load_sql_df(patch_info_file, patch_size) IDs = patch_info['ID'].unique() #slides = {slide:da.from_zarr(join(input_dir,'{}.zarr'.format(slide))) for slide in IDs} masks = { mask: npy2da(join(input_dir, '{}_mask.npy'.format(mask))) for mask in IDs } if p_sample < 1.: patch_info = patch_info.sample(frac=p_sample) if not os.path.exists(annotation_file): bbox_df = bb.util.new('annotation').drop( columns=['difficult', 'ignore', 'lost', 'occluded', 'truncated'])[[ 'image', 'class_label', 'x_top_left', 'y_top_left', 'width', 'height'
def train_model(inputs_dir='inputs_training',
                learning_rate=1e-4,
                n_epochs=300,
                crop_size=224,
                resize=256,
                mean=None,
                std=None,
                num_classes=2,
                architecture='resnet50',
                batch_size=32,
                predict=False,
                model_save_loc='saved_model.pkl',
                pretrained_save_loc='pretrained_model.pkl',
                predictions_save_path='predictions.pkl',
                predict_set='test',
                verbose=False,
                class_balance=True,
                extract_embeddings="",
                extract_embeddings_df="",
                embedding_out_dir="./",
                gpu_id=-1,
                checkpoints_dir="checkpoints",
                tensor_dataset=False,
                pickle_dataset=False,
                label_map=None,
                semantic_segmentation=False,
                save_metric="loss",
                custom_dataset=None,
                save_predictions=True,
                pretrained=False,
                save_after_n_batch=0,
                include_test_set=False,
                use_npy_rotate=False,
                sample_frac=1.,
                sample_every=0,
                num_workers=0,
                npy_rotate_sets_pkl=""):
    """Train a classification/segmentation model, or run prediction /
    embedding extraction with a previously saved model.

    The dataset flavor is selected by flags, in priority order:
    ``custom_dataset`` (predict only) > ``tensor_dataset`` (``{split}_data.pth``)
    > ``pickle_dataset`` (``{split}_data.pkl``) > ``use_npy_rotate`` (NPY
    stacks) > an ``ImageFolder`` tree under ``inputs_dir``.

    Parameters (selected)
    ---------------------
    mean, std : list of float, optional
        Normalization statistics; default to ``[0.5]*3`` / ``[0.1]*3``.
        (Previously mutable list defaults — resolved via ``None`` sentinels,
        behavior unchanged for callers.)
    predict : bool
        If True, skip fitting, load ``model_save_loc`` if present, and run
        inference (or embedding extraction when ``extract_embeddings`` is set).
    extract_embeddings : str
        Non-empty activates embedding extraction; requires ``predict=True``.
    gpu_id : int
        CUDA device index; negative means CPU / default device.

    Returns
    -------
    The fitted ``torch.nn.Module`` when training; the result of
    ``dataset.embed(...)`` when extracting embeddings; otherwise a dict with
    ``'pred'`` and ``'true'`` arrays (also saved to ``predictions_save_path``
    when ``save_predictions`` is set).
    """
    # Resolve mutable defaults (was mean=[0.5]*3, std=[0.1]*3,
    # label_map=dict(); mutable defaults are shared across calls).
    if mean is None:
        mean = [0.5, 0.5, 0.5]
    if std is None:
        std = [0.1, 0.1, 0.1]
    if label_map is None:
        label_map = dict()

    assert save_metric in ['loss', 'f1']

    if use_npy_rotate:
        # Rotating NPY stacks supersede the tensor/pickle dataset options.
        tensor_dataset, pickle_dataset = False, False
    else:
        sample_every = 0
    if predict:
        include_test_set = True  # inference needs the held-out split
        assert not use_npy_rotate
    if extract_embeddings:
        assert predict, "Must be in prediction mode to extract embeddings"
    if tensor_dataset:
        assert not pickle_dataset, "Cannot have pickle and tensor classes activated"
    if semantic_segmentation and custom_dataset is None:
        assert tensor_dataset, "For now, can only perform semantic segmentation with TensorDataset"

    if gpu_id >= 0:
        torch.cuda.set_device(gpu_id)

    # Pick the transform factory matching the dataset flavor.
    transformers = generate_transformers if not tensor_dataset else generate_kornia_transforms
    if semantic_segmentation:
        transformers = generate_kornia_segmentation_transforms
    transformers = transformers(
        image_size=crop_size, resize=resize, mean=mean, std=std)

    # Assemble datasets keyed by split name ('custom' bypasses everything).
    if custom_dataset is not None:
        assert predict
        datasets = dict(custom=custom_dataset)
        predict_set = 'custom'
    else:
        splits = ['train', 'val'] + (['test'] if include_test_set else [])
        if tensor_dataset:
            datasets = {x: torch.load(os.path.join(inputs_dir, f"{x}_data.pth"))
                        for x in splits
                        if os.path.exists(os.path.join(inputs_dir, f"{x}_data.pth"))}
            # Classification targets must be 1-D; flatten stray trailing dims.
            for k in datasets:
                if len(datasets[k].tensors[1].shape) > 1 and not semantic_segmentation:
                    datasets[k] = TensorDataset(datasets[k].tensors[0],
                                                datasets[k].tensors[1].flatten())
        elif pickle_dataset:
            datasets = {x: PickleDataset(os.path.join(inputs_dir, f"{x}_data.pkl"),
                                         transformers[x], label_map)
                        for x in splits
                        if os.path.exists(os.path.join(inputs_dir, f"{x}_data.pkl"))}
        elif use_npy_rotate:
            datasets = {x: NPYRotatingStack(os.path.join(inputs_dir, x),
                                            transformers[x],
                                            (sample_frac if x == 'train' else 1.),
                                            sample_every, label_map,
                                            npy_rotate_sets_pkl, x)
                        for x in splits}
        else:
            datasets = {x: Datasets.ImageFolder(os.path.join(inputs_dir, x),
                                                transformers[x])
                        for x in splits}
    if verbose:
        print(datasets)

    dataloaders = {x: DataLoader(datasets[x],
                                 batch_size=batch_size,
                                 num_workers=num_workers,
                                 shuffle=(x == 'train' and not predict),
                                 worker_init_fn=worker_init_fn)
                   for x in datasets}

    # Auxiliary features are only wired in when the (train or custom) dataset
    # exposes them and we are not doing segmentation.
    reference_dataset = datasets.get('train', datasets.get('custom', None))
    n_aux_features = (None if semantic_segmentation
                      or "n_aux_features" not in dir(reference_dataset)
                      else reference_dataset.n_aux_features)
    model = generate_model(architecture, num_classes,
                           semantic_segmentation=semantic_segmentation,
                           pretrained=pretrained,
                           n_aux_features=n_aux_features)
    if verbose:
        print(model)
    if torch.cuda.is_available():
        model = model.cuda()

    optimizer_opts = dict(name='adam', lr=learning_rate, weight_decay=1e-4)
    scheduler_opts = dict(scheduler='warm_restarts', lr_scheduler_decay=0.5,
                          T_max=10, eta_min=5e-8, T_mult=2)
    trainer = ModelTrainer(model, n_epochs,
                           None if predict else dataloaders['val'],
                           optimizer_opts, scheduler_opts,
                           loss_fn='dice' if (semantic_segmentation and not class_balance) else 'ce',
                           checkpoints_dir=checkpoints_dir,
                           tensor_dataset=tensor_dataset,
                           transforms=transformers,
                           semantic_segmentation=semantic_segmentation,
                           save_metric=save_metric,
                           save_after_n_batch=save_after_n_batch)
    # Warm-start from a pretrained checkpoint if one exists.
    if os.path.exists(pretrained_save_loc):
        trainer.model.load_state_dict(
            torch.load(pretrained_save_loc,
                       map_location=f"cuda:{gpu_id}" if gpu_id >= 0 else "cpu"))

    if not predict:
        if class_balance:
            trainer.add_class_balance_loss(
                datasets['train'].targets if not tensor_dataset
                else datasets['train'].tensors[1].numpy().flatten())
        trainer, min_val_loss_f1, best_epoch = trainer.fit(
            dataloaders['train'], verbose=verbose)
        torch.save(trainer.model.state_dict(), model_save_loc)
        return trainer.model

    # --- prediction / embedding path ---
    if os.path.exists(model_save_loc):
        trainer.model.load_state_dict(
            torch.load(model_save_loc,
                       map_location=f"cuda:{gpu_id}" if gpu_id >= 0 else "cpu"))
    if extract_embeddings:
        assert not semantic_segmentation, "Semantic Segmentation not implemented for whole slide segmentation"
        # Strip the classifier head; embeddings come from the feature extractor.
        trainer.model = nn.Sequential(trainer.model.features, Reshape())
        if predict_set == 'custom':
            dataset = datasets['custom']
            assert 'embed' in dir(dataset), "Embedding method required for dataset with model input, batch size and embedding output directory as arguments."
        else:
            assert len(extract_embeddings_df) > 0 and os.path.exists(extract_embeddings_df), "Must load data from SQL database or pickle if not using custom dataset"
            if extract_embeddings_df.endswith(".db"):
                from pathflowai.utils import load_sql_df
                patch_info = load_sql_df(extract_embeddings_df, resize)
            elif extract_embeddings_df.endswith(".pkl"):
                patch_info = pd.read_pickle(extract_embeddings_df)
                assert patch_info['patch_size'].iloc[0] == resize, "Patch size pickle does not match."
            else:
                raise NotImplementedError
            dataset = NPYDataset(patch_info, extract_embeddings,
                                 transformers["test"], tensor_dataset)
        return dataset.embed(trainer.model, batch_size, embedding_out_dir)

    Y = dict()
    Y['pred'], Y['true'] = trainer.predict(dataloaders[predict_set])
    if save_predictions:
        torch.save(Y, predictions_save_path)
    return Y
def train_model(inputs_dir='inputs_training',
                learning_rate=1e-4,
                n_epochs=300,
                crop_size=224,
                resize=256,
                mean=None,
                std=None,
                num_classes=2,
                architecture='resnet50',
                batch_size=32,
                predict=False,
                model_save_loc='saved_model.pkl',
                predictions_save_path='predictions.pkl',
                predict_set='test',
                verbose=False,
                class_balance=True,
                extract_embeddings="",
                extract_embeddings_df="",
                embedding_out_dir="./",
                gpu_id=0,
                checkpoints_dir="checkpoints",
                tensor_dataset=False):
    """Train a classifier, or run prediction / embedding extraction.

    Parameters (selected)
    ---------------------
    mean, std : list of float, optional
        Normalization statistics; default to ``[0.5]*3`` / ``[0.1]*3``.
        (Previously mutable list defaults — resolved via ``None`` sentinels,
        behavior unchanged for callers.)
    predict : bool
        If True, load ``model_save_loc`` and run inference instead of fitting.
    extract_embeddings : str
        Non-empty (together with ``extract_embeddings_df``) switches the
        predict path to embedding extraction.

    Returns
    -------
    The fitted ``torch.nn.Module`` when training; ``None`` after embedding
    extraction; otherwise a dict with ``'pred'``/``'true'`` (also saved to
    ``predictions_save_path``).
    """
    # Resolve mutable defaults (was mean=[0.5]*3, std=[0.1]*3).
    if mean is None:
        mean = [0.5, 0.5, 0.5]
    if std is None:
        std = [0.1, 0.1, 0.1]

    if extract_embeddings:
        assert predict, "Must be in prediction mode to extract embeddings"
    # BUGFIX: set_device was called unconditionally and crashed on CPU-only hosts.
    if torch.cuda.is_available():
        torch.cuda.set_device(gpu_id)

    transformers = generate_transformers if not tensor_dataset else generate_kornia_transforms
    transformers = transformers(image_size=crop_size, resize=resize,
                                mean=mean, std=std)

    # BUGFIX: `datasets` was previously undefined (NameError) whenever
    # extract_embeddings was set; an empty dict yields empty dataloaders.
    datasets = {}
    if not extract_embeddings:
        if tensor_dataset:
            datasets = {x: torch.load(os.path.join(inputs_dir, f"{x}_data.pth"))
                        for x in ['train', 'val']}
        else:
            datasets = {x: Datasets.ImageFolder(os.path.join(inputs_dir, x),
                                                transformers[x])
                        for x in ['train', 'val', 'test']}
    dataloaders = {x: DataLoader(datasets[x], batch_size=batch_size,
                                 shuffle=(x == 'train'))
                   for x in datasets}

    model = generate_model(architecture, num_classes)
    if torch.cuda.is_available():
        model = model.cuda()
    optimizer_opts = dict(name='adam', lr=learning_rate, weight_decay=1e-4)
    scheduler_opts = dict(scheduler='warm_restarts', lr_scheduler_decay=0.5,
                          T_max=10, eta_min=5e-8, T_mult=2)
    trainer = ModelTrainer(model, n_epochs,
                           None if predict else dataloaders['val'],
                           optimizer_opts, scheduler_opts,
                           loss_fn='ce',
                           checkpoints_dir=checkpoints_dir,
                           tensor_dataset=tensor_dataset,
                           transforms=transformers)

    if not predict:
        if class_balance:
            trainer.add_class_balance_loss(
                datasets['train'].targets if not tensor_dataset
                else datasets['train'].tensors[1].numpy())
        trainer, min_val_loss, best_epoch = trainer.fit(dataloaders['train'],
                                                        verbose=verbose)
        torch.save(trainer.model.state_dict(), model_save_loc)
        return trainer.model

    # --- prediction / embedding path ---
    assert not tensor_dataset, "Only ImageFolder and NPYDatasets allowed"
    trainer.model.load_state_dict(torch.load(model_save_loc))
    if extract_embeddings and extract_embeddings_df:
        # Strip the classifier head; embeddings come from the feature extractor.
        trainer.model = nn.Sequential(trainer.model.features, Reshape())
        patch_info = load_sql_df(extract_embeddings_df, resize)
        dataset = NPYDataset(patch_info, extract_embeddings, transformers["test"])
        dataset.embed(trainer.model, batch_size, embedding_out_dir)
        # BUGFIX: was exit(), which terminated the whole interpreter from
        # inside library code; returning leaves the caller in control.
        return
    Y = dict()
    Y['pred'], Y['true'] = trainer.predict(dataloaders[predict_set])
    # Y['true'] = datasets[predict_set].targets
    torch.save(Y, predictions_save_path)
    return Y