Example no. 1
def gen_train_patches(input_folder: Path, output_folder: Path,
                      num_train_per_class: int, num_workers: int,
                      patch_size: int, purple_threshold: int,
                      purple_scale_size: int, image_ext: str,
                      type_histopath: bool) -> None:
    """
    Generates all patches for subfolders in the training set.

    Args:
        input_folder: Folder containing the subfolders containing WSI.
        output_folder: Folder to save the patches to.
        num_train_per_class: The desired number of training patches per class.
        num_workers: Number of workers to use for IO.
        patch_size: Size of the patches extracted from the WSI.
        purple_threshold: Number of purple points for region to be considered purple.
        purple_scale_size: Scalar to use for reducing image to check for purple.
        image_ext: Image extension for saving patches.
        type_histopath: Only look for purple histopathology images and filter whitespace.
    """
    # Find the subfolders and how much the patches in each should overlap.
    subfolders = get_subfolder_paths(folder=input_folder)
    print(f"{subfolders} subfolders found from {input_folder}")
    subfolder_to_overlap_factor = get_subfolder_to_overlap(
        subfolders=subfolders, desired_crops_per_class=num_train_per_class)

    # Produce the patches.
    for input_subfolder in subfolders:
        produce_patches(input_folder=input_subfolder,
                        output_folder=output_folder.joinpath(
                            input_subfolder.name),
                        inverse_overlap_factor=subfolder_to_overlap_factor[
                            input_subfolder],
                        by_folder=False,
                        num_workers=num_workers,
                        patch_size=patch_size,
                        purple_threshold=purple_threshold,
                        purple_scale_size=purple_scale_size,
                        image_ext=image_ext,
                        type_histopath=type_histopath)

    print("\nfinished all folders\n")
Example no. 2
def balance_classes(training_folder: Path) -> None:
    """
    Balances the class distribution so that training isn't skewed.

    Args:
        training_folder: Folder containing the subfolders to be balanced.
    """
    subfolders = get_subfolder_paths(folder=training_folder)
    subfolder_to_images = {
        subfolder: get_image_paths(folder=subfolder)
        for subfolder in subfolders
    }

    # Find the class with the most images.
    biggest_size = max(
        len(image_paths) for image_paths in subfolder_to_images.values())

    for subfolder in subfolder_to_images:
        duplicate_until_n(image_paths=subfolder_to_images[subfolder],
                          n=biggest_size)

    print(f"balanced all training classes to have {biggest_size} images\n")
Example no. 3
def get_predictions(patches_eval_folder: Path, output_folder: Path,
                    checkpoints_folder: Path, auto_select: bool,
                    eval_model: Path, device: torch.device, classes: List[str],
                    num_classes: int, path_mean: List[float],
                    path_std: List[float], num_layers: int, pretrain: bool,
                    batch_size: int, num_workers: int) -> None:
    """
    Main function for running the model on all of the generated patches.

    Args:
        patches_eval_folder: Folder containing patches to evaluate on.
        output_folder: Folder to save the model results to.
        checkpoints_folder: Directory containing the saved model checkpoints.
        auto_select: Automatically select the model with the highest validation accuracy.
        eval_model: Path to the model with the highest validation accuracy.
        device: Device to use for running model.
        classes: Names of the classes in the dataset.
        num_classes: Number of classes in the dataset.
        path_mean: Means of the WSIs for each dimension.
        path_std: Standard deviations of the WSIs for each dimension.
        num_layers: Number of layers to use in the ResNet model from [18, 34, 50, 101, 152].
        pretrain: Use pretrained ResNet weights.
        batch_size: Mini-batch size to use when running the model.
        num_workers: Number of workers to use for IO.
    """
    # Initialize the model.
    model_path = get_best_model(
        checkpoints_folder=checkpoints_folder) if auto_select else eval_model

    model = create_model(num_classes=num_classes,
                         num_layers=num_layers,
                         pretrain=pretrain)
    ckpt = torch.load(f=model_path)
    model.load_state_dict(state_dict=ckpt["model_state_dict"])
    model = model.to(device=device)

    model.eval()
    print(f"model loaded from {model_path}")

    # For outputting the predictions.
    class_num_to_class = {i: classes[i] for i in range(num_classes)}

    start = time.time()
    # Load the data for each folder.
    image_folders = get_subfolder_paths(folder=patches_eval_folder)

    # Where we want to write out the predictions.
    # Confirm the output directory exists.
    output_folder.mkdir(parents=True, exist_ok=True)

    # For each WSI.
    for image_folder in image_folders:

        # Temporary fix: upstream should not create folders with no crops, so skip empty directories here.
        try:
            # Load the image dataset.
            dataloader = torch.utils.data.DataLoader(
                dataset=datasets.ImageFolder(
                    root=str(image_folder),
                    transform=transforms.Compose(transforms=[
                        transforms.Resize((224, 224)),
                        transforms.ToTensor(),
                        transforms.Normalize(mean=path_mean, std=path_std)
                    ])),
                batch_size=batch_size,
                shuffle=False,
                num_workers=num_workers)
        except RuntimeError:
            print(
                "WARNING: One of the image directories is empty. Skipping this directory."
            )
            continue

        num_test_image_windows = len(dataloader.dataset)

        # Load the image names so we know the coordinates of the patches we are predicting.
        image_folder = image_folder.joinpath(image_folder.name)
        window_names = get_image_paths(folder=image_folder)

        print(f"testing on {num_test_image_windows} crops from {image_folder}")

        test_label_to_class = {0:"0", 1:"180", 2:"270", 3:"90"}

        with output_folder.joinpath(f"{image_folder.name}.csv").open(
                mode="w") as writer:

            writer.write("image_name,ground_truth,prediction,confidence\n")

            # Loop through all of the patches.
            for batch_num, (test_inputs, test_labels) in enumerate(dataloader):
                batch_window_names = window_names[batch_num * batch_size:
                                                  (batch_num + 1) * batch_size]

                confidences, test_preds = torch.max(nn.Softmax(dim=1)(model(
                    test_inputs.to(device=device))),
                                                    dim=1)
                for i in range(test_preds.shape[0]):
                    # Find coordinates and predicted class.
                    image_name = batch_window_names[i].name
                    # xy = batch_window_names[i].name.split(".")[0].split(";")

                    writer.write(
                        f"{','.join([image_name, image_folder.name, f'{class_num_to_class[test_preds[i].data.item()]}', f'{confidences[i].data.item():.5f}'])}\n"
                    )

    print(f"time for {patches_eval_folder}: {time.time() - start:.2f} seconds")
Example no. 4
def split(all_wsi, train_folder, val_folder, test_folder, val_split,
          test_split, keep_orig_copy, labels_train, labels_val, labels_test):

    head = 'cp' if keep_orig_copy else 'mv'  # based on whether we want to move or keep the files

    # create folders
    for folder in [train_folder, val_folder, test_folder]:
        subfolders = [join(folder, _class) for _class in config.classes]
        for subfolder in subfolders:
            confirm_output_folder(subfolder)

    train_img_to_label = {}
    val_img_to_label = {}
    test_img_to_label = {}

    def move_set(folder, image_files, ops):
        """
        Return:
            a dictionary where
                key is (str)image_file_name and
                value is (str)image_class
        """
        def remove_topdir(filepath):
            """filepath should be a relative path
            ex) a/b/c.jpg -> b/c.jpg
            """
            first_delimiter_idx = filepath.find('/')
            return filepath[first_delimiter_idx + 1:]

        img_to_label = {}
        for image_file in image_files:
            output_path = join(folder, remove_topdir(image_file))
            os.system(f'{ops} {image_file} {output_path}')
            img_name = basename(image_file)
            img_class = basename(dirname(image_file))
            img_to_label[img_name] = img_class
        return img_to_label

    # sort the images and move/copy them appropriately
    subfolder_paths = get_subfolder_paths(all_wsi)
    for subfolder in subfolder_paths:

        image_paths = get_image_paths(subfolder)
        assert len(image_paths) > val_split + test_split
        # make sure we have enough slides in each class

        # assign training, test, and val images
        test_idx = len(image_paths) - test_split
        val_idx = test_idx - val_split
        train_images = image_paths[:val_idx]
        val_images = image_paths[val_idx:test_idx]
        test_images = image_paths[test_idx:]
        print('class {}:'.format(basename(subfolder)),
              '#train={}'.format(len(train_images)),
              '#val={} '.format(len(val_images)),
              '#test={}'.format(len(test_images)))

        # move train
        tmp_train_img_to_label = move_set(folder=train_folder,
                                          image_files=train_images,
                                          ops=head)
        train_img_to_label.update(tmp_train_img_to_label)

        # move val
        tmp_val_img_to_label = move_set(folder=val_folder,
                                        image_files=val_images,
                                        ops=head)
        val_img_to_label.update(tmp_val_img_to_label)

        # move test
        tmp_test_img_to_label = move_set(folder=test_folder,
                                         image_files=test_images,
                                         ops=head)
        test_img_to_label.update(tmp_test_img_to_label)

    # for making the csv files
    def write_to_csv(dest_filename, image_label_dict):
        with open(dest_filename, 'w') as writer:
            writer.write('img,gt\n')
            for img in sorted(image_label_dict.keys()):
                writer.write(img + ',' + image_label_dict[img] + '\n')

    write_to_csv(dest_filename=labels_train,
                 image_label_dict=train_img_to_label)
    write_to_csv(dest_filename=labels_val,
                 image_label_dict=val_img_to_label)
    write_to_csv(dest_filename=labels_test,
                 image_label_dict=test_img_to_label)
Example no. 5
def split(keep_orig_copy: bool, wsi_train: Path, wsi_val: Path, wsi_test: Path,
          classes: List[str], all_wsi: Path, val_wsi_per_class: int,
          test_wsi_per_class: int, labels_train: Path, labels_test: Path,
          labels_val: Path) -> None:
    """
    Main function for splitting data. Note that we want the
    validation and test sets to be balanced.

    Args:
        keep_orig_copy: Whether to move or copy the WSI when splitting into training, validation, and test sets.
        wsi_train: Location to be created to store WSI for training.
        wsi_val: Location to be created to store WSI for validation.
        wsi_test: Location to be created to store WSI for testing.
        classes: Names of the classes in the dataset.
        all_wsi: Location of the WSI organized in subfolders by class.
        val_wsi_per_class: Number of WSI per class to use in the validation set.
        test_wsi_per_class: Number of WSI per class to use in the test set.
        labels_train: Location to store the CSV file labels for training.
        labels_test: Location to store the CSV file labels for testing.
        labels_val: Location to store the CSV file labels for validation.
    """
    # Based on whether we want to move or keep the files.
    head = shutil.copyfile if keep_orig_copy else shutil.move

    # Create folders.
    for f in (wsi_train, wsi_val, wsi_test):
        subfolders = [f.joinpath(_class) for _class in classes]

        for subfolder in subfolders:
            # Confirm the output directory exists.
            subfolder.mkdir(parents=True, exist_ok=True)

    train_img_to_label = {}
    val_img_to_label = {}
    test_img_to_label = {}

    def move_set(folder: Path, image_files: List[Path],
                 ops: Callable) -> Dict[Path, str]:
        """
        Moves the sets to the desired output directories.

        Args:
            folder: Folder to move images to.
            image_files: Image files to move.
            ops: Whether to move or copy the files.

        Return:
            A dictionary mapping image filenames to classes.
        """
        def remove_topdir(filepath: Path) -> Path:
            """
            Remove the top directory since the filepath needs to be
            a relative path (i.e., a/b/c.jpg -> b/c.jpg).

            Args:
                filepath: Path to remove top directory from.

            Returns:
                Path with top directory removed.
            """
            return Path(*filepath.parts[1:])

        img_to_label = {}
        for image_file in image_files:
            # Copy or move the files.
            ops(src=image_file,
                dst=folder.joinpath(remove_topdir(filepath=image_file)))

            img_to_label[Path(image_file.name)] = image_file.parent.name

        return img_to_label

    # Sort the images and move/copy them appropriately.
    subfolder_paths = get_subfolder_paths(folder=all_wsi)
    for subfolder in subfolder_paths:
        image_paths = get_image_paths(folder=subfolder)

        # Make sure we have enough slides in each class.
        assert len(image_paths) > val_wsi_per_class + test_wsi_per_class, \
            "Not enough slides in each class."

        # Assign training, test, and validation images.
        test_idx = len(image_paths) - test_wsi_per_class
        val_idx = test_idx - val_wsi_per_class
        train_images = image_paths[:val_idx]
        val_images = image_paths[val_idx:test_idx]
        test_images = image_paths[test_idx:]
        print(f"class {Path(subfolder).name} "
              f"#train={len(train_images)} "
              f"#val={len(val_images)} "
              f"#test={len(test_images)}")

        # Move the training images.
        train_img_to_label.update(
            move_set(folder=wsi_train, image_files=train_images, ops=head))

        # Move the validation images.
        val_img_to_label.update(
            move_set(folder=wsi_val, image_files=val_images, ops=head))

        # Move the testing images.
        test_img_to_label.update(
            move_set(folder=wsi_test, image_files=test_images, ops=head))

    def write_to_csv(dest_filename: Path,
                     image_label_dict: Dict[Path, str]) -> None:
        """
        Write the image names and corresponding labels to a CSV file.

        Args:
            dest_filename: Destination filename for the CSV file.
            image_label_dict: Dictionary mapping filenames to labels.
        """
        with dest_filename.open(mode="w") as writer:
            writer.write("img,gt\n")
            for img in sorted(image_label_dict.keys()):
                writer.write(f"{img},{image_label_dict[img]}\n")

    write_to_csv(dest_filename=labels_train,
                 image_label_dict=train_img_to_label)
    write_to_csv(dest_filename=labels_val, image_label_dict=val_img_to_label)
    write_to_csv(dest_filename=labels_test, image_label_dict=test_img_to_label)
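The helpers get_subfolder_paths and get_image_paths are used throughout these examples but not shown; minimal sketches of what they might look like follow. The real implementations may differ, for instance in which image extensions they accept.

from pathlib import Path
from typing import List

def get_subfolder_paths(folder: Path) -> List[Path]:
    """Hypothetical sketch: return the sorted immediate subdirectories of a folder."""
    return sorted(p for p in folder.iterdir() if p.is_dir())

def get_image_paths(folder: Path) -> List[Path]:
    """Hypothetical sketch: return the sorted image files directly inside a folder."""
    extensions = {".jpg", ".jpeg", ".png", ".tif", ".tiff"}
    return sorted(p for p in folder.iterdir()
                  if p.is_file() and p.suffix.lower() in extensions)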