Example #1
0
    def __init__(self, *args, **kwargs):
        """
        Initializes the instance and resets the train, validation and test
        output folders (each one is deleted and then re-created)
        """
        self.image_list = None
        self.split_files = [
            settings.TRAIN_SPLIT_FILENAME,
            settings.VALID_SPLIT_FILENAME,
            settings.TEST_SPLIT_FILENAME,
        ]
        self.folder_names = [
            settings.TRAIN_FOLDER_NAME,
            settings.VALID_FOLDER_NAME,
            settings.TEST_FOLDER_NAME,
        ]

        for folder_name in self.folder_names:
            output_path = os.path.join(settings.OUTPUT_FOLDER, folder_name)
            clean_create_folder(output_path)
Example #2
0
    def __process_splits(self):
        """
        Processes the provided dataset splits and saves in CSV format to be consumed by the
        application

        Reads '<sub_dataset>.csv' from settings.TRAIN_PHOTOS_DATASET for each
        sub-dataset and dumps its rows (as JSON) into the matching output
        filename under settings.OUTPUT_FOLDER.
        """
        clean_create_folder(settings.OUTPUT_FOLDER)

        print("Formatting sub-datasets")
        for filename, sub_dataset in tqdm(
                list(zip(self.csv_output_filenames, self.SUB_DATASETS))):
            # NOTE: np.str was deprecated in NumPy 1.20 and removed in 1.24;
            # the builtin str is the equivalent dtype and works on all versions
            data = np.genfromtxt(os.path.join(settings.TRAIN_PHOTOS_DATASET,
                                              '{}.csv'.format(sub_dataset)),
                                 delimiter=',',
                                 dtype=str)
            formatted_csv_path = os.path.join(settings.OUTPUT_FOLDER, filename)
            with open(formatted_csv_path, 'w') as file_:
                # Workaround to save the numpy array without serialization errors
                json.dump(data.tolist(), file_)
Example #3
0
    def __create_json_files(self):
        """ Saves train, validation and test datasets into JSON files """
        print("Saving train/validation/test dataset into JSON files...")

        file_paths = [
            os.path.join(settings.OUTPUT_FOLDER, filename) for filename in [
                settings.TRAIN_SPLIT_FILENAME, settings.VALID_SPLIT_FILENAME,
                settings.TEST_SPLIT_FILENAME
            ]
        ]

        clean_create_folder(settings.OUTPUT_FOLDER)

        # Wrap the zip in list(...) so tqdm knows the total and can render a
        # proper progress bar (zip objects have no __len__)
        for file_path, data in tqdm(
                list(zip(file_paths,
                         (self.train_xy, self.val_xy, self.test_xy)))):
            with open(file_path, 'w') as file_:
                # Workaround to save numpy array without errors
                json.dump(data.tolist(), file_)
Example #4
0
    def __process(self):
        """
        Creates transformed images and saves them in a directory at the same level of the
        dataset directory

        The output directory is named '<dataset_basename>_<scale>' and mirrors
        the per-class sub-folder layout of settings.TRAIN_PHOTOS_DATASET; the
        ground-truth CSV is copied alongside the new images.
        """
        # Output folder sits next to the dataset folder, suffixed with the scale
        scaled_path = os.path.join(
            Path(settings.TRAIN_PHOTOS_DATASET).parent,
            '{}_{}'.format(os.path.basename(settings.TRAIN_PHOTOS_DATASET),
                           self.scale))
        remove_folder(scaled_path)

        # creating new images
        for folder in os.listdir(settings.TRAIN_PHOTOS_DATASET):
            current_folder = os.path.join(settings.TRAIN_PHOTOS_DATASET,
                                          folder)

            # skip stray files at the dataset root; only sub-folders hold images
            if Path(current_folder).is_dir():
                new_folder = os.path.join(scaled_path, folder)
                clean_create_folder(new_folder)
                print('Creating new images from directory: {}'.format(folder))

                # only files with the configured image extension are processed
                for image_name in tqdm(
                        list(
                            filter(lambda x: x.endswith(self.image_extension),
                                   os.listdir(current_folder)))):
                    image = plt.imread(os.path.join(current_folder,
                                                    image_name))
                    rescaled_img = self.transform(image, self.scale,
                                                  **self.transform_kwargs)

                    # Normalize to the 0-255 range and cast to uint8 so PIL can
                    # save it.
                    # NOTE(review): an all-zero rescaled image would divide by
                    # zero here -- assumed not to occur; confirm with the
                    # transform's output range.
                    pil_img = Image.fromarray(
                        (rescaled_img * 255 / np.max(rescaled_img)).astype(
                            np.uint8))
                    pil_img.save(os.path.join(new_folder, image_name))

        # copying the ground-truth CSV file next to the new images
        shutil.copyfile(
            settings.TRAIN_PHOTOS_GROUND_TRUTH,
            os.path.join(scaled_path,
                         os.path.basename(settings.TRAIN_PHOTOS_GROUND_TRUTH)))
Example #5
0
    def create_datasets_for_LC_KSVD(self, filename):
        """
        Formats every sub-dataset's codes and saves each one as a JSON file
        in self.codes_folder, naming them '<name>_<sub_dataset>.<extension>'.

        Args:
            filename (str): filename with .json extension

        Usage:
            model.create_datasets_for_LC_KSVD('my_dataset.json')
        """
        clean_create_folder(self.codes_folder)
        base_name, ext = get_filename_and_extension(
            clean_json_filename(filename))

        print("Formatting and saving sub-datasets codes for LC-KSVD")
        for sub_dataset in self.sub_datasets:
            print("Processing image's batches from sub-dataset: {}".format(
                sub_dataset))
            payload = {'codes': [], 'labels': []}
            self.process_data(sub_dataset, payload)
            self.format_for_LC_KSVD(payload)

            output_path = os.path.join(
                self.codes_folder,
                '{}_{}.{}'.format(base_name, sub_dataset, ext))
            with open(output_path, 'w') as file_:
                json.dump(payload, file_)
Example #6
0
    def __process(self):
        """
        Process PatchCamelyon dataset and creates PNG images plus CSV files

        For each sub-dataset, reads the paired HDF5 files ('y' labels, 'x'
        images), writes every image as a PNG into the tumor or normal folder,
        records [filename, label] rows into a per-sub-dataset CSV, and finally
        writes a combined full_ground_truth.csv.
        """
        normal_counter = 0
        tumor_counter = 0
        # maps sub-dataset name -> list of [filename, label] rows
        ground_truth = defaultdict(list)

        clean_create_folder(self.tumor_folder_path)
        clean_create_folder(self.normal_folder_path)

        for sub_dataset in self.SUB_DATASETS:
            # HDF5 file names follow BASE_FILE_PATTERN with 'y' (labels) and
            # 'x' (images)
            y_path = os.path.join(
                settings.BASE_DATASET_LINK,
                self.BASE_FILE_PATTERN.format(sub_dataset, 'y'))
            x_path = os.path.join(
                settings.BASE_DATASET_LINK,
                self.BASE_FILE_PATTERN.format(sub_dataset, 'x'))

            with h5py.File(y_path, 'r') as y_file:
                with h5py.File(x_path, 'r') as x_file:
                    x_data = x_file['x']
                    y_data = y_file['y']

                    print(
                        "Creating PNG and CSV files for {} sub-dataset".format(
                            sub_dataset))
                    for i in tqdm(range(y_data.shape[0])):
                        # the scalar label sits at [i][0][0][0] of the 4-D
                        # label array
                        if y_data[i][0][0][0] == PCamLabel.TUMOR.id:
                            tumor_counter += 1
                            # 't<N>.png' / 'n<N>.png' keep file names unique
                            # across all sub-datasets
                            filename = 't{}.png'.format(tumor_counter)
                            saving_path = os.path.join(self.tumor_folder_path,
                                                       filename)
                            label = PCamLabel.TUMOR.name
                        else:
                            normal_counter += 1
                            filename = 'n{}.png'.format(normal_counter)
                            saving_path = os.path.join(self.normal_folder_path,
                                                       filename)
                            label = PCamLabel.NORMAL.name

                        if self.only_center:
                            # keep only the central 32x32 patch
                            # (assumes 96x96 source images -- TODO confirm)
                            plt_image = Image.fromarray(
                                (x_data[i][32:64, 32:64, :]).astype('uint8'))
                        else:
                            plt_image = Image.fromarray(
                                (x_data[i]).astype('uint8'))

                        plt_image.save(saving_path)
                        ground_truth[sub_dataset].append([filename, label])

                    # one CSV per sub-dataset with its [filename, label] rows
                    self.__write_list_to_csv(ground_truth[sub_dataset],
                                             '{}.csv'.format(sub_dataset))

        full_ground_truth = []
        print("Creating full ground truth CSV file")
        # total=2: one tick for merging the rows, one for writing the CSV
        with tqdm(total=2) as pbar:
            for values in ground_truth.values():
                full_ground_truth.extend(values)
            pbar.update(1)
            self.__write_list_to_csv(full_ground_truth,
                                     'full_ground_truth.csv')
            pbar.update(1)