# Shared imports for the snippets in this section
import functools
import os
import sys
import tempfile
import zipfile
from random import random
from typing import Tuple
from urllib.request import urlretrieve
from zipfile import ZipFile

import numpy as np
import pandas as pd
from PIL import Image

import sidekick


def test_import_multiple_formats(tmpdir):
    size = (64, 32)
    images = [
        Image.new(mode='RGBA', size=size),
        Image.new(mode='LA', size=size),
        Image.new(mode='RGB', size=size),
        Image.new(mode='L', size=size)
    ]
    df = pd.DataFrame({'image_column': images})
    dataset_path = str(tmpdir.join('dataset.zip'))
    set_image_format = functools.partial(
        sidekick.process_image, file_format='png')
    sidekick.create_dataset(
        dataset_path,
        df,
        preprocess={'image_column': set_image_format},
        progress=True,
        parallel_processing=0
    )
    assert os.path.exists(dataset_path) and os.path.getsize(dataset_path) > 100
def create_data_set(self):
    df_all = pd.read_csv("train_curated.csv")
    df_all['fname'] = self.data_dir + df_all['fname']
    # One-hot encode the comma-separated label column
    labels = df_all['labels'].str.get_dummies(',').sort_index(axis=1)
    df_all = pd.concat((df_all, labels), axis=1)
    # Hold out 100 random rows as a test set
    df_test = df_all.sample(n=100)
    df_train = df_all.drop(df_test.index)
    df_list = [(df_train, 'train'), (df_test, 'test')]
    for df, subset in df_list:
        if subset == 'train':
            data = list()
            for row in df.to_dict('records'):
                for _ in range(self.nbr_of_random_crops):
                    try:
                        # Copy the row so each crop gets its own record;
                        # appending the same dict repeatedly would leave
                        # every entry pointing at the last crop
                        crop_row = dict(row)
                        crop_row['image'] = self.waw_to_image(row['fname'])
                        data.append(crop_row)
                    except FileNotFoundError:
                        print(f"File not found: {row['fname']}")
            df_new = pd.DataFrame(data)
            df_new = df_new.astype({label: np.int64 for label in labels})
            # Create dataset
            set_image_format = functools.partial(
                sidekick.process_image, file_format='png')
            sidekick.create_dataset(
                os.path.join(self.zip_dir, self.data_name + 'dataset.zip'),
                df_new,
                preprocess={'image': set_image_format},
                progress=True)
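# `waw_to_image` is referenced above but not defined in this snippet. Below is
# a minimal sketch of what it could look like, written as a standalone
# function and assuming the .wav files are rendered as mel-spectrogram images
# with a random crop along the time axis. The librosa-based approach, the
# `crop_width` parameter, and the grayscale output are all assumptions, not
# the author's confirmed implementation.
from random import randint

import librosa


def waw_to_image(fname, crop_width=128):
    """Hypothetical helper: load a wav file, compute a mel spectrogram,
    and return a random time crop as a grayscale PIL image."""
    signal, sample_rate = librosa.load(fname, sr=None)
    mel = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    # Scale to 0-255 so the spectrogram can be stored as an 8-bit image
    scaled = 255 * (mel_db - mel_db.min()) / (np.ptp(mel_db) + 1e-9)
    scaled = scaled.astype(np.uint8)
    # Random crop along the time axis (spectrogram columns)
    start = randint(0, max(0, scaled.shape[1] - crop_width))
    return Image.fromarray(scaled[:, start:start + crop_width], mode='L')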
def build_dataframe(self, noisy, clean):
    li = []
    for f in os.listdir(noisy):
        if os.path.isfile(f"{clean}/{f}"):
            li.append({"noisy": f"{noisy}/{f}", "clean": f"{clean}/{f}"})
    df = pd.DataFrame(li, columns=["noisy", "clean"])
    sidekick.create_dataset(
        'files/output.zip',
        df,
        path_columns=["noisy", "clean"],
        progress=True)
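# The tests below take a `dataset_index` fixture that is defined elsewhere
# (typically in conftest.py). A minimal sketch of what it could provide,
# assuming a DataFrame with one in-memory image column and two image-path
# columns; the column names are inferred from the tests, everything else is
# an assumption.
import pytest


@pytest.fixture
def dataset_index(tmpdir):
    image_path = str(tmpdir.join('image.png'))
    Image.new(mode='RGB', size=(64, 32)).save(image_path)
    return pd.DataFrame({
        'image_column': [Image.new(mode='RGB', size=(64, 32))] * 4,
        'image_file_column': [image_path] * 4,
        'image_file_process_column': [image_path] * 4,
    })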
def test_dataset_metadata(dataset_index, tmpdir):
    # Create dataset
    dataset_path = str(tmpdir.join('dataset.zip'))
    set_image_format = functools.partial(
        sidekick.process_image, file_format='png')
    sidekick.create_dataset(
        dataset_path,
        dataset_index,
        preprocess={'image_column': set_image_format})

    # Assert the .sidekick metadata was added
    with zipfile.ZipFile(dataset_path, 'r') as zf:
        assert '.sidekick' in zf.namelist()
def test_dataset_metadata_content(dataset_index, tmpdir):
    # Create dataset
    dataset_path = str(tmpdir.join('dataset.zip'))
    set_image_format = functools.partial(
        sidekick.process_image, file_format='png')
    sidekick.create_dataset(
        dataset_path,
        dataset_index,
        preprocess={'image_column': set_image_format}
    )

    # Assert that the metadata file was added
    with zipfile.ZipFile(dataset_path, 'r') as zf:
        metadata = zf.read('metadata.json')
        assert metadata == b'{ "source" : "sidekick" }'
def test_create_dataset_parallel(dataset_index, tmpdir):
    # Create dataset
    dataset_path = str(tmpdir.join('dataset.zip'))
    resize_image = functools.partial(
        sidekick.process_image, mode='resize', size=(32, 8),
        file_format='png')
    set_image_format = functools.partial(
        sidekick.process_image, file_format='png')
    sidekick.create_dataset(
        dataset_path,
        dataset_index,
        path_columns=['image_file_column', 'image_file_process_column'],
        preprocess={
            'image_file_process_column': resize_image,
            'image_column': set_image_format
        },
        progress=False,
        parallel_processing=10)
    assert os.path.exists(dataset_path) and os.path.getsize(dataset_path) > 100
def create_ham_dataset(
        directory: str = None,
        size: Tuple[int, int] = (224, 224),
        split: float = 0.8,
        balance: bool = True,
) -> None:
    """Creates a Peltarion-compatible zip file with the HAM10000 dataset.

    The HAM10000 dataset contains labeled images of different types of
    skin lesions. Read more here: https://arxiv.org/abs/1803.10417.

    All data is provided under the terms of the Creative Commons
    Attribution-NonCommercial (CC BY-NC) 4.0 license. You may find the
    terms of the license here:
    https://creativecommons.org/licenses/by-nc/4.0/legalcode.
    If you are unable to accept the terms of this license, do not
    download or use this data. Please note that the disclaimer in the
    README.md applies.

    Args:
        directory: Directory where the dataset will be stored. If not
            provided, it defaults to the current working directory.
        size: Image size after resizing: (width, height). The original
            image size is (600, 450).
        split: Split fraction between training and validation.
        balance: Balance training dataset by oversampling.
    """
    images_dir = 'ISIC2018_Task3_Training_Input'
    metadata_dir = 'ISIC2018_Task3_Training_GroundTruth'
    metadata_file = 'ISIC2018_Task3_Training_GroundTruth.csv'
    metadata_url = 'https://challenge.kitware.com/api/v1/item/' \
                   '5ac20eeb56357d4ff856e136/download'
    images_url = 'https://challenge.kitware.com/api/v1/item/' \
                 '5ac20fc456357d4ff856e139/download'

    if directory is None:
        directory = os.getcwd()
    if not os.path.isdir(directory):
        sys.exit('Directory provided does not exist')
    dataset_path = os.path.join(directory, 'ham_dataset.zip')

    with tempfile.TemporaryDirectory() as tmpdir:
        print('Downloading metadata...')
        metadata, _ = urlretrieve(metadata_url)
        with ZipFile(metadata) as zip_handle:
            zip_handle.extractall(tmpdir)

        print('Downloading images. This step might take some time.')
        images, _ = urlretrieve(images_url)
        with ZipFile(images) as zip_handle:
            zip_handle.extractall(tmpdir)

        # Read metadata
        df = pd.read_csv(os.path.join(tmpdir, metadata_dir, metadata_file))

        # Decode one-hot encoding into a single target column
        categories = ['MEL', 'NV', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC']
        df['target'] = df[categories].idxmax(axis=1)
        df = df.drop(categories, axis=1)

        # Split dataset into train and validation
        rows = df.shape[0]
        subset = ['train' if random() < split else 'val'
                  for _ in range(rows)]
        df['subset'] = subset
        if balance:
            train = df[df['subset'] == 'train']
            val = df[df['subset'] == 'val']
            train_balanced = balance_dataset(train, 'target')
            df = pd.concat([train_balanced, val], ignore_index=True)

        # Replace image name by image path
        df['image'] = df['image'].apply(
            lambda x: os.path.join(tmpdir, images_dir, x + '.jpg'))

        image_processor = functools.partial(
            sidekick.process_image, mode='resize', size=size,
            file_format='jpeg'
        )
        print('Creating dataset...')
        sidekick.create_dataset(
            dataset_path,
            df,
            path_columns=['image'],
            preprocess={'image': image_processor},
            progress=True,
        )
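# `balance_dataset` is called by create_ham_dataset but not defined in this
# snippet. A minimal sketch, assuming "balance by oversampling" means
# resampling every class up to the size of the most frequent class; the
# signature matches the call site above, but the body is an assumption.
def balance_dataset(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Hypothetical helper: oversample each class in `column` so that all
    classes end up with as many rows as the most frequent one."""
    max_count = df[column].value_counts().max()
    groups = [
        group.sample(n=max_count, replace=True)
        for _, group in df.groupby(column)
    ]
    return pd.concat(groups, ignore_index=True)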