def __init__(self, *args, **kwargs):
    """
    * Initializes the object
    * Cleans the train, validation and test output folders (performs
      delete and create operations)
    """
    self.image_list = None
    self.split_files = [
        settings.TRAIN_SPLIT_FILENAME,
        settings.VALID_SPLIT_FILENAME,
        settings.TEST_SPLIT_FILENAME
    ]
    self.folder_names = [
        settings.TRAIN_FOLDER_NAME,
        settings.VALID_FOLDER_NAME,
        settings.TEST_FOLDER_NAME
    ]

    for folder in self.folder_names:
        clean_create_folder(os.path.join(settings.OUTPUT_FOLDER, folder))
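
# clean_create_folder is used throughout this module but is defined
# elsewhere. A minimal sketch of such a helper, assuming it simply wipes
# a directory (if present) and recreates it empty:
import os
import shutil


def clean_create_folder(folder_path):
    """Deletes folder_path if it exists, then recreates it empty."""
    if os.path.isdir(folder_path):
        shutil.rmtree(folder_path)
    os.makedirs(folder_path)
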
def __process_splits(self):
    """
    Reads the provided sub-dataset CSV splits and saves them in JSON
    format to be consumed by the application
    """
    clean_create_folder(settings.OUTPUT_FOLDER)

    print("Formatting sub-datasets")
    for filename, sub_dataset in tqdm(
            list(zip(self.csv_output_filenames, self.SUB_DATASETS))):
        data = np.genfromtxt(
            os.path.join(settings.TRAIN_PHOTOS_DATASET,
                         '{}.csv'.format(sub_dataset)),
            delimiter=',',
            dtype=str  # np.str was removed in NumPy 1.24; str is equivalent
        )
        formatted_csv_path = os.path.join(settings.OUTPUT_FOLDER, filename)

        with open(formatted_csv_path, 'w') as file_:
            json.dump(data.tolist(), file_)
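
# A minimal, self-contained sketch of the CSV -> JSON round trip performed
# above, using a hypothetical two-column split file (filename, label):
import json

import numpy as np

with open('/tmp/sample_split.csv', 'w') as f:
    f.write("img_001.png,NORMAL\nimg_002.png,TUMOR\n")

data = np.genfromtxt('/tmp/sample_split.csv', delimiter=',', dtype=str)
with open('/tmp/sample_split.json', 'w') as f:
    json.dump(data.tolist(), f)  # tolist() makes the array JSON-serializable
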
def __create_json_files(self):
    """ Saves train, validation and test datasets into JSON files """
    print("Saving train/validation/test dataset into JSON files...")
    file_paths = [
        os.path.join(settings.OUTPUT_FOLDER, filename)
        for filename in [
            settings.TRAIN_SPLIT_FILENAME,
            settings.VALID_SPLIT_FILENAME,
            settings.TEST_SPLIT_FILENAME
        ]
    ]
    clean_create_folder(settings.OUTPUT_FOLDER)

    for file_path, data in tqdm(
            zip(file_paths, (self.train_xy, self.val_xy, self.test_xy))):
        with open(file_path, 'w') as file_:
            # Workaround: NumPy arrays are not JSON-serializable, so they
            # are converted to plain Python lists first
            json.dump(data.tolist(), file_)
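
# Why the tolist() workaround is needed: the json module cannot serialize
# NumPy arrays directly. A minimal demonstration:
import json

import numpy as np

arr = np.array([[1, 2], [3, 4]])
try:
    json.dumps(arr)
except TypeError as err:
    print('json.dumps failed:', err)  # ndarray is not JSON serializable
print(json.dumps(arr.tolist()))  # '[[1, 2], [3, 4]]'
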
def __process(self):
    """
    Creates transformed images and saves them in a directory at the same
    level as the dataset directory
    """
    scaled_path = os.path.join(
        Path(settings.TRAIN_PHOTOS_DATASET).parent,
        '{}_{}'.format(os.path.basename(settings.TRAIN_PHOTOS_DATASET),
                       self.scale))
    remove_folder(scaled_path)

    # creating new images
    for folder in os.listdir(settings.TRAIN_PHOTOS_DATASET):
        current_folder = os.path.join(settings.TRAIN_PHOTOS_DATASET, folder)

        if Path(current_folder).is_dir():
            new_folder = os.path.join(scaled_path, folder)
            clean_create_folder(new_folder)
            print('Creating new images from directory: {}'.format(folder))

            for image_name in tqdm(
                    list(filter(lambda x: x.endswith(self.image_extension),
                                os.listdir(current_folder)))):
                image = plt.imread(os.path.join(current_folder, image_name))
                rescaled_img = self.transform(image, self.scale,
                                              **self.transform_kwargs)
                # normalize to [0, 255] before casting to 8-bit
                pil_img = Image.fromarray(
                    (rescaled_img * 255 / np.max(rescaled_img)).astype(
                        np.uint8))
                pil_img.save(os.path.join(new_folder, image_name))

    # copying CSV file
    shutil.copyfile(
        settings.TRAIN_PHOTOS_GROUND_TRUTH,
        os.path.join(scaled_path,
                     os.path.basename(settings.TRAIN_PHOTOS_GROUND_TRUTH)))
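
# self.transform is a callable supplied elsewhere and is invoked as
# self.transform(image, self.scale, **self.transform_kwargs). A plausible
# implementation (an assumption, not the project's confirmed choice) is
# skimage.transform.rescale:
import numpy as np
from skimage import transform


def rescale_rgb(image, scale, **kwargs):
    """Rescales an RGB image by `scale`, keeping the channel axis intact."""
    kwargs.setdefault('channel_axis', -1)  # skimage >= 0.19
    kwargs.setdefault('anti_aliasing', True)
    return transform.rescale(image, scale, **kwargs)


# e.g. rescale_rgb(np.ones((96, 96, 3)), 0.5).shape == (48, 48, 3)
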
def create_datasets_for_LC_KSVD(self, filename):
    """
    Args:
        filename (str): filename with .json extension

    Usage:
        model.create_datasets_for_LC_KSVD('my_dataset.json')
    """
    clean_create_folder(self.codes_folder)
    cleaned_filename = clean_json_filename(filename)
    name, extension = get_filename_and_extension(cleaned_filename)

    print("Formatting and saving sub-dataset codes for LC-KSVD")
    for dataset in self.sub_datasets:
        print("Processing image batches from sub-dataset: {}".format(dataset))
        new_name = '{}_{}.{}'.format(name, dataset, extension)
        formatted_data = {'codes': [], 'labels': []}
        self.process_data(dataset, formatted_data)
        self.format_for_LC_KSVD(formatted_data)

        with open(os.path.join(self.codes_folder, new_name), 'w') as file_:
            json.dump(formatted_data, file_)
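
# A hedged sketch of how a saved codes file could be loaded back for
# LC-KSVD training. The keys ('codes', 'labels') come from the method
# above; representing them as NumPy arrays is an assumption about what
# format_for_LC_KSVD produces:
import json
import os

import numpy as np


def load_lc_ksvd_dataset(codes_folder, filename):
    """Loads a '{name}_{dataset}.json' codes file into NumPy arrays."""
    with open(os.path.join(codes_folder, filename)) as file_:
        data = json.load(file_)

    return np.array(data['codes']), np.array(data['labels'])
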
def __process(self):
    """
    Processes the PatchCamelyon dataset and creates PNG images plus CSV
    files
    """
    normal_counter = 0
    tumor_counter = 0
    ground_truth = defaultdict(list)
    clean_create_folder(self.tumor_folder_path)
    clean_create_folder(self.normal_folder_path)

    for sub_dataset in self.SUB_DATASETS:
        y_path = os.path.join(
            settings.BASE_DATASET_LINK,
            self.BASE_FILE_PATTERN.format(sub_dataset, 'y'))
        x_path = os.path.join(
            settings.BASE_DATASET_LINK,
            self.BASE_FILE_PATTERN.format(sub_dataset, 'x'))

        with h5py.File(y_path, 'r') as y_file, \
                h5py.File(x_path, 'r') as x_file:
            x_data = x_file['x']
            y_data = y_file['y']
            print("Creating PNG and CSV files for {} sub-dataset".format(
                sub_dataset))

            for i in tqdm(range(y_data.shape[0])):
                if y_data[i][0][0][0] == PCamLabel.TUMOR.id:
                    tumor_counter += 1
                    filename = 't{}.png'.format(tumor_counter)
                    saving_path = os.path.join(self.tumor_folder_path,
                                               filename)
                    label = PCamLabel.TUMOR.name
                else:
                    normal_counter += 1
                    filename = 'n{}.png'.format(normal_counter)
                    saving_path = os.path.join(self.normal_folder_path,
                                               filename)
                    label = PCamLabel.NORMAL.name

                if self.only_center:
                    # keep only the central 32x32 patch of the 96x96 image
                    pil_image = Image.fromarray(
                        x_data[i][32:64, 32:64, :].astype('uint8'))
                else:
                    pil_image = Image.fromarray(x_data[i].astype('uint8'))

                pil_image.save(saving_path)
                ground_truth[sub_dataset].append([filename, label])

        self.__write_list_to_csv(ground_truth[sub_dataset],
                                 '{}.csv'.format(sub_dataset))

    full_ground_truth = []
    print("Creating full ground truth CSV file")
    with tqdm(total=2) as pbar:
        for values in ground_truth.values():
            full_ground_truth.extend(values)
        pbar.update(1)
        self.__write_list_to_csv(full_ground_truth, 'full_ground_truth.csv')
        pbar.update(1)
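
# __write_list_to_csv is assumed to dump a list of [filename, label] rows
# into a CSV file in the output folder. A minimal sketch of such a helper
# (the output location parameter is an assumption):
import csv
import os


def write_list_to_csv(rows, filename, output_folder='.'):
    """Writes an iterable of [filename, label] rows to a CSV file."""
    with open(os.path.join(output_folder, filename), 'w', newline='') as f:
        csv.writer(f).writerows(rows)
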