Esempio n. 1
0
    def __init__(self,
                 input_dir,
                 patch_info_file,
                 patch_size,
                 annotations,
                 input_dimension,
                 class_label_map=None,
                 identify=None,
                 img_transform=None,
                 anno_transform=None):
        """Initialize a patch detection dataset backed by zarr slide arrays.

        Requires brambox (``bb``) to be importable; annotations are a
        brambox-style dataframe with a categorical ``image`` column.
        """
        if bb is None:
            raise ImportError(
                'Brambox needs to be installed to use this dataset')
        super().__init__(input_dimension)

        self.annos = annotations
        self.annos['ignore'] = 0
        self.annos['class_label'] = self.annos['class_label'].astype(int)  #-1
        print(self.annos['class_label'].unique())
        # Unique patch identifiers come from the categorical image column.
        self.keys = self.annos.image.cat.categories
        self.img_tf = img_transform
        self.anno_tf = anno_transform
        self.patch_info = load_sql_df(patch_info_file, patch_size)
        # One lazily-loaded dask array per whole-slide image.
        self.slides = {}
        for slide_id in self.patch_info['ID'].unique():
            self.slides[slide_id] = da.from_zarr(
                join(input_dir, '{}.zarr'.format(slide_id)))
        self.id = lambda k: k.split('/')
        # Width/height appear transposed in the source annotations; the RHS
        # tuple is fully evaluated before assignment, so this swaps columns.
        self.annos['width'], self.annos['height'] = self.annos[
            'height'], self.annos['width']
        # Map class labels to contiguous integer ids.
        if class_label_map is None:
            log.warning(
                f'No class_label_map given, generating it by sorting unique class labels from data alphabetically, which is not always deterministic behaviour'
            )
            class_label_map = list(np.sort(self.annos.class_label.unique()))
        label_to_id = {lbl: idx for idx, lbl in enumerate(class_label_map)}
        self.annos['class_id'] = self.annos.class_label.map(label_to_id)
Esempio n. 2
0
    # Read CLI options; `args` is parsed upstream (not visible in this view).
    patch_size = args.patch_size
    p_sample = args.p_sample
    np.random.seed(42)  # fixed seed so the optional subsampling is reproducible
    annotation_file = 'annotations_bbox_{}.pkl'.format(patch_size)
    reference_mask = args.reference_mask
    # Cache per-class reference widths (mean + 2*std of box widths measured on
    # the reference mask); reuse the pickle on later runs to skip recomputation.
    if not os.path.exists('widths.pkl'):
        m = np.load(reference_mask)
        bbox_df = get_boxes(m)
        official_widths = dict(
            bbox_df.groupby('class_label')['width'].mean() +
            2 * bbox_df.groupby('class_label')['width'].std())
        pickle.dump(official_widths, open('widths.pkl', 'wb'))
    else:
        official_widths = pickle.load(open('widths.pkl', 'rb'))

    patch_info = load_sql_df(patch_info_file, patch_size)
    IDs = patch_info['ID'].unique()
    #slides = {slide:da.from_zarr(join(input_dir,'{}.zarr'.format(slide))) for slide in IDs}
    # One lazily-loaded dask mask array per slide ID.
    masks = {
        mask: npy2da(join(input_dir, '{}_mask.npy'.format(mask)))
        for mask in IDs
    }

    # Optionally subsample patches for faster experimentation.
    if p_sample < 1.:
        patch_info = patch_info.sample(frac=p_sample)

    # Build the bounding-box annotation dataframe unless already cached.
    # NOTE(review): the statement below is truncated in the visible source
    # (the column-selection list is never closed) — the remainder lies outside
    # this view.
    if not os.path.exists(annotation_file):
        bbox_df = bb.util.new('annotation').drop(
            columns=['difficult', 'ignore', 'lost', 'occluded', 'truncated'])[[
                'image', 'class_label', 'x_top_left', 'y_top_left', 'width',
                'height'
Esempio n. 3
0
def train_model(inputs_dir='inputs_training',
                learning_rate=1e-4,
                n_epochs=300,
                crop_size=224,
                resize=256,
                mean=None,
                std=None,
                num_classes=2,
                architecture='resnet50',
                batch_size=32,
                predict=False,
                model_save_loc='saved_model.pkl',
                pretrained_save_loc='pretrained_model.pkl',
                predictions_save_path='predictions.pkl',
                predict_set='test',
                verbose=False,
                class_balance=True,
                extract_embeddings="",
                extract_embeddings_df="",
                embedding_out_dir="./",
                gpu_id=-1,
                checkpoints_dir="checkpoints",
                tensor_dataset=False,
                pickle_dataset=False,
                label_map=None,
                semantic_segmentation=False,
                save_metric="loss",
                custom_dataset=None,
                save_predictions=True,
                pretrained=False,
                save_after_n_batch=0,
                include_test_set=False,
                use_npy_rotate=False,
                sample_frac=1.,
                sample_every=0,
                num_workers=0,
                npy_rotate_sets_pkl=""
                ):
    """Train a model, run prediction, or extract embeddings.

    In training mode (``predict=False``) fits the model on the ``train`` split,
    saves its state dict to ``model_save_loc`` and returns the trained model.
    In prediction mode (``predict=True``) loads weights from ``model_save_loc``
    if present and either returns a dict with ``pred``/``true`` arrays for
    ``predict_set`` (optionally saved to ``predictions_save_path``), or — when
    ``extract_embeddings`` is set — embeds patches and returns the result of
    ``dataset.embed``.

    Parameters (key ones):
        inputs_dir: directory holding per-split data (ImageFolder dirs,
            ``{split}_data.pth`` tensors, or ``{split}_data.pkl`` pickles).
        mean, std: per-channel normalization stats; default to
            [0.5, 0.5, 0.5] and [0.1, 0.1, 0.1] when None.
        label_map: optional class-name -> id mapping for pickle/NPY datasets;
            defaults to an empty dict when None.
        gpu_id: CUDA device index; negative leaves the current device alone.
        custom_dataset: prediction-only dataset overriding ``inputs_dir``.
    """
    # Avoid mutable default arguments (shared across calls): materialize the
    # effective defaults here instead of in the signature.
    if mean is None: mean = [0.5, 0.5, 0.5]
    if std is None: std = [0.1, 0.1, 0.1]
    if label_map is None: label_map = dict()
    assert save_metric in ['loss','f1']
    # NPY-rotating datasets are mutually exclusive with tensor/pickle datasets;
    # sample_every only applies to the rotating case.
    if use_npy_rotate: tensor_dataset,pickle_dataset=False,False
    else: sample_every=0
    if predict: include_test_set=True
    if predict: assert not use_npy_rotate
    if extract_embeddings: assert predict, "Must be in prediction mode to extract embeddings"
    if tensor_dataset: assert not pickle_dataset, "Cannot have pickle and tensor classes activated"
    if semantic_segmentation and custom_dataset is None: assert tensor_dataset==True, "For now, can only perform semantic segmentation with TensorDataset"
    if gpu_id>=0: torch.cuda.set_device(gpu_id)
    # Kornia transforms operate on tensors; classic torchvision-style
    # transformers are used for image-folder inputs.
    transformers=generate_transformers if not tensor_dataset else generate_kornia_transforms
    if semantic_segmentation: transformers=generate_kornia_segmentation_transforms
    transformers = transformers(
        image_size=crop_size, resize=resize, mean=mean, std=std)
    if custom_dataset is not None:
        assert predict
        datasets={}
        datasets['custom']=custom_dataset
        predict_set='custom'
    else:
        if tensor_dataset:
            datasets = {x: torch.load(os.path.join(inputs_dir,f"{x}_data.pth")) for x in (['train','val']+(['test'] if include_test_set else [])) if os.path.exists(os.path.join(inputs_dir,f"{x}_data.pth"))}
            # Flatten multi-dim classification targets; segmentation keeps masks.
            for k in datasets:
                if len(datasets[k].tensors[1].shape)>1 and not semantic_segmentation: datasets[k]=TensorDataset(datasets[k].tensors[0],datasets[k].tensors[1].flatten())
        elif pickle_dataset:
            datasets = {x: PickleDataset(os.path.join(inputs_dir,f"{x}_data.pkl"),transformers[x],label_map) for x in (['train','val']+(['test'] if include_test_set else [])) if os.path.exists(os.path.join(inputs_dir,f"{x}_data.pkl"))}
        elif use_npy_rotate:
            # Only the training split is subsampled; val/test use all data.
            datasets = {x: NPYRotatingStack(os.path.join(inputs_dir,x),transformers[x],(sample_frac if x=='train' else 1.),sample_every,label_map,npy_rotate_sets_pkl,x) for x in (['train','val']+(['test'] if include_test_set else []))}
        else:
            datasets = {x: Datasets.ImageFolder(os.path.join(
                inputs_dir, x), transformers[x]) for x in (['train','val']+(['test'] if include_test_set else []))}

    if verbose: print(datasets)

    dataloaders = {x: DataLoader(
        datasets[x], batch_size=batch_size, num_workers=num_workers, shuffle=(x == 'train' and not predict), worker_init_fn=worker_init_fn) for x in datasets}

    model = generate_model(architecture,
                           num_classes,
                           semantic_segmentation=semantic_segmentation,
                           pretrained=pretrained,
                           n_aux_features=None if semantic_segmentation or "n_aux_features" not in dir(datasets.get('train',datasets.get('custom',None))) else datasets.get('train',datasets.get('custom',None)).n_aux_features)

    if verbose: print(model)

    if torch.cuda.is_available():
        model = model.cuda()

    optimizer_opts = dict(name='adam',
                          lr=learning_rate,
                          weight_decay=1e-4)

    scheduler_opts = dict(scheduler='warm_restarts',
                          lr_scheduler_decay=0.5,
                          T_max=10,
                          eta_min=5e-8,
                          T_mult=2)

    trainer = ModelTrainer(model,
                           n_epochs,
                           None if predict else dataloaders['val'],
                           optimizer_opts,
                           scheduler_opts,
                           loss_fn='dice' if (semantic_segmentation and not class_balance) else 'ce',
                           checkpoints_dir=checkpoints_dir,
                           tensor_dataset=tensor_dataset,
                           transforms=transformers,
                           semantic_segmentation=semantic_segmentation,
                           save_metric=save_metric,
                           save_after_n_batch=save_after_n_batch)

    # Warm-start from pretrained weights when available.
    if os.path.exists(pretrained_save_loc):
        trainer.model.load_state_dict(torch.load(pretrained_save_loc,map_location=f"cuda:{gpu_id}" if gpu_id>=0 else "cpu"))

    if not predict:

        if class_balance:
            trainer.add_class_balance_loss(datasets['train'].targets if not tensor_dataset else datasets['train'].tensors[1].numpy().flatten())

        trainer, min_val_loss_f1, best_epoch=trainer.fit(dataloaders['train'],verbose=verbose)

        torch.save(trainer.model.state_dict(), model_save_loc)

        return trainer.model

    else:
        # assert not tensor_dataset, "Only ImageFolder and NPYDatasets allowed"

        if os.path.exists(model_save_loc):
            trainer.model.load_state_dict(torch.load(model_save_loc,map_location=f"cuda:{gpu_id}" if gpu_id>=0 else "cpu"))

        if extract_embeddings:
            assert not semantic_segmentation, "Semantic Segmentation not implemented for whole slide segmentation"
            # Drop the classification head; keep the feature extractor only.
            trainer.model=nn.Sequential(trainer.model.features,Reshape())#,trainer.model.output
            if predict_set=='custom':
                dataset=datasets['custom']
                assert 'embed' in dir(dataset), "Embedding method required for dataset with model input, batch size and embedding output directory as arguments."
            else:
                assert len(extract_embeddings_df)>0 and os.path.exists(extract_embeddings_df), "Must load data from SQL database or pickle if not using custom dataset"
                if extract_embeddings_df.endswith(".db"):
                    from pathflowai.utils import load_sql_df
                    patch_info=load_sql_df(extract_embeddings_df,resize)
                elif extract_embeddings_df.endswith(".pkl"):
                    patch_info=pd.read_pickle(extract_embeddings_df)
                    assert patch_info['patch_size'].iloc[0]==resize, "Patch size pickle does not match."
                else:
                    raise NotImplementedError
                dataset=NPYDataset(patch_info,extract_embeddings,transformers["test"],tensor_dataset)
            return dataset.embed(trainer.model,batch_size,embedding_out_dir)
            # return "Output Embeddings"
        else:
            Y = dict()

            Y['pred'],Y['true'] = trainer.predict(dataloaders[predict_set])

            # Y['model'] = trainer.model

            # Y['true'] = datasets[predict_set].targets

            if save_predictions: torch.save(Y, predictions_save_path)

            return Y
Esempio n. 4
0
def train_model(inputs_dir='inputs_training',
                learning_rate=1e-4,
                n_epochs=300,
                crop_size=224,
                resize=256,
                mean=None,
                std=None,
                num_classes=2,
                architecture='resnet50',
                batch_size=32,
                predict=False,
                model_save_loc='saved_model.pkl',
                predictions_save_path='predictions.pkl',
                predict_set='test',
                verbose=False,
                class_balance=True,
                extract_embeddings="",
                extract_embeddings_df="",
                embedding_out_dir="./",
                gpu_id=0,
                checkpoints_dir="checkpoints",
                tensor_dataset=False):
    """Train a classifier, run prediction, or extract patch embeddings.

    In training mode (``predict=False``) fits on the ``train`` split, saving
    the state dict to ``model_save_loc``. In prediction mode loads weights
    from ``model_save_loc`` and saves ``pred``/``true`` for ``predict_set``
    to ``predictions_save_path``; when both ``extract_embeddings`` and
    ``extract_embeddings_df`` are set, writes embeddings instead.

    Parameters (key ones):
        mean, std: per-channel normalization stats; default to
            [0.5, 0.5, 0.5] and [0.1, 0.1, 0.1] when None.
        tensor_dataset: load ``{split}_data.pth`` TensorDatasets instead of
            ImageFolder directories (training only; prediction asserts it off).
    """
    # Avoid mutable default arguments (shared across calls): materialize the
    # effective defaults here instead of in the signature.
    if mean is None:
        mean = [0.5, 0.5, 0.5]
    if std is None:
        std = [0.1, 0.1, 0.1]
    if extract_embeddings:
        assert predict, "Must be in prediction mode to extract embeddings"
    torch.cuda.set_device(gpu_id)
    transformers = generate_transformers if not tensor_dataset else generate_kornia_transforms
    transformers = transformers(image_size=crop_size,
                                resize=resize,
                                mean=mean,
                                std=std)
    # NOTE(review): when extract_embeddings is truthy but extract_embeddings_df
    # is empty, datasets/dataloaders are never built, yet trainer.predict below
    # references dataloaders — that path raises NameError. Confirm callers
    # always pass both together.
    if not extract_embeddings:
        if tensor_dataset:
            datasets = {
                x: torch.load(os.path.join(inputs_dir, f"{x}_data.pth"))
                for x in ['train', 'val']
            }
        else:
            datasets = {
                x: Datasets.ImageFolder(os.path.join(inputs_dir, x),
                                        transformers[x])
                for x in ['train', 'val', 'test']
            }

        dataloaders = {
            x: DataLoader(datasets[x],
                          batch_size=batch_size,
                          shuffle=(x == 'train'))
            for x in datasets
        }

    model = generate_model(architecture, num_classes)

    if torch.cuda.is_available():
        model = model.cuda()

    optimizer_opts = dict(name='adam', lr=learning_rate, weight_decay=1e-4)

    scheduler_opts = dict(scheduler='warm_restarts',
                          lr_scheduler_decay=0.5,
                          T_max=10,
                          eta_min=5e-8,
                          T_mult=2)

    trainer = ModelTrainer(model,
                           n_epochs,
                           None if predict else dataloaders['val'],
                           optimizer_opts,
                           scheduler_opts,
                           loss_fn='ce',
                           checkpoints_dir=checkpoints_dir,
                           tensor_dataset=tensor_dataset,
                           transforms=transformers)

    if not predict:

        if class_balance:
            trainer.add_class_balance_loss(
                datasets['train'].targets if not tensor_dataset else
                datasets['train'].tensors[1].numpy())

        trainer, min_val_loss, best_epoch = trainer.fit(dataloaders['train'],
                                                        verbose=verbose)

        torch.save(trainer.model.state_dict(), model_save_loc)

    else:
        assert not tensor_dataset, "Only ImageFolder and NPYDatasets allowed"

        trainer.model.load_state_dict(torch.load(model_save_loc))

        if extract_embeddings and extract_embeddings_df:
            # Drop the classification head; keep the feature extractor only.
            trainer.model = nn.Sequential(trainer.model.features, Reshape())
            patch_info = load_sql_df(extract_embeddings_df, resize)
            dataset = NPYDataset(patch_info, extract_embeddings,
                                 transformers["test"])
            dataset.embed(trainer.model, batch_size, embedding_out_dir)
            # NOTE(review): exit() terminates the whole interpreter from
            # inside a library function; a plain return would be safer for
            # programmatic callers. Preserved as-is to keep behavior.
            exit()

        Y = dict()

        Y['pred'], Y['true'] = trainer.predict(dataloaders[predict_set])

        # Y['true'] = datasets[predict_set].targets

        torch.save(Y, predictions_save_path)