# Shared imports for the snippets in this section
import functools
import os
import sys
import tempfile
import zipfile
from random import random
from typing import Tuple
from urllib.request import urlretrieve
from zipfile import ZipFile

import numpy as np
import pandas as pd
from PIL import Image

import sidekick


def test_import_multiple_formats(tmpdir):
    size = (64, 32)
    images = [
        Image.new(mode='RGBA', size=size),
        Image.new(mode='LA', size=size),
        Image.new(mode='RGB', size=size),
        Image.new(mode='L', size=size)
    ]
    df = pd.DataFrame({'image_column': images})
    dataset_path = str(tmpdir.join('dataset.zip'))
    set_image_format = functools.partial(
        sidekick.process_image, file_format='png')
    sidekick.create_dataset(
        dataset_path,
        df,
        preprocess={'image_column': set_image_format},
        progress=True,
        parallel_processing=0
    )
    assert os.path.exists(dataset_path) and os.path.getsize(dataset_path) > 100
def create_data_set(self):
    df_all = pd.read_csv("train_curated.csv")
    df_all['fname'] = self.data_dir + df_all['fname']
    # One-hot encode the comma-separated label column
    labels = df_all['labels'].str.get_dummies(',').sort_index(axis=1)
    df_all = pd.concat((df_all, labels), axis=1)
    # Hold out 100 random rows as a test set
    df_test = df_all.sample(n=100)
    df_train = df_all.drop(df_test.index)
    df_list = [(df_train, 'train'), (df_test, 'test')]
    for df, subset in df_list:
        if subset == 'train':
            data = list()
            for row in df.to_dict('records'):
                for _ in range(self.nbr_of_random_crops):
                    try:
                        # Copy the row so each crop gets its own record;
                        # appending the same dict repeatedly would leave
                        # every entry pointing at the last crop
                        crop_row = dict(row)
                        crop_row['image'] = self.waw_to_image(row['fname'])
                        data.append(crop_row)
                    except FileNotFoundError:
                        print(f"File not found: {row['fname']}")
            df_new = pd.DataFrame(data)
            df_new = df_new.astype({label: np.int64 for label in labels})
            # Create dataset
            set_image_format = functools.partial(
                sidekick.process_image, file_format='png')
            sidekick.create_dataset(
                os.path.join(self.zip_dir, self.data_name + 'dataset.zip'),
                df_new,
                preprocess={'image': set_image_format},
                progress=True)
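# `waw_to_image` is referenced above but not defined in this snippet. Below is
# a minimal sketch of what it could look like, written as a standalone
# function and assuming the .wav files are rendered as mel-spectrogram images
# with a random crop along the time axis. The librosa-based approach, the
# `crop_width` parameter, and the grayscale output are all assumptions, not
# the author's confirmed implementation.
from random import randint

import librosa


def waw_to_image(fname, crop_width=128):
    """Hypothetical helper: load a wav file, compute a mel spectrogram,
    and return a random time crop as a grayscale PIL image."""
    signal, sample_rate = librosa.load(fname, sr=None)
    mel = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    # Scale to 0-255 so the spectrogram can be stored as an 8-bit image
    scaled = 255 * (mel_db - mel_db.min()) / (np.ptp(mel_db) + 1e-9)
    scaled = scaled.astype(np.uint8)
    # Random crop along the time axis (spectrogram columns)
    start = randint(0, max(0, scaled.shape[1] - crop_width))
    return Image.fromarray(scaled[:, start:start + crop_width], mode='L')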
def build_dataframe(self, noisy, clean):
    li = []
    for f in os.listdir(noisy):
        if os.path.isfile(f"{clean}/{f}"):
            li.append({"noisy": f"{noisy}/{f}", "clean": f"{clean}/{f}"})
    df = pd.DataFrame(li, columns=["noisy", "clean"])
    sidekick.create_dataset(
        'files/output.zip',
        df,
        path_columns=["noisy", "clean"],
        progress=True)
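# The tests below take a `dataset_index` fixture that is defined elsewhere
# (typically in conftest.py). A minimal sketch of what it could provide,
# assuming a DataFrame with one in-memory image column and two image-path
# columns; the column names are inferred from the tests, everything else is
# an assumption.
import pytest


@pytest.fixture
def dataset_index(tmpdir):
    image_path = str(tmpdir.join('image.png'))
    Image.new(mode='RGB', size=(64, 32)).save(image_path)
    return pd.DataFrame({
        'image_column': [Image.new(mode='RGB', size=(64, 32))] * 4,
        'image_file_column': [image_path] * 4,
        'image_file_process_column': [image_path] * 4,
    })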
def test_dataset_metadata(dataset_index, tmpdir):
    # Create dataset
    dataset_path = str(tmpdir.join('dataset.zip'))
    set_image_format = functools.partial(
        sidekick.process_image, file_format='png')
    sidekick.create_dataset(
        dataset_path,
        dataset_index,
        preprocess={'image_column': set_image_format})

    # Assert the .sidekick metadata was added
    with zipfile.ZipFile(dataset_path, 'r') as zf:
        assert '.sidekick' in zf.namelist()
def test_dataset_metadata_content(dataset_index, tmpdir):
    # Create dataset
    dataset_path = str(tmpdir.join('dataset.zip'))
    set_image_format = functools.partial(
        sidekick.process_image, file_format='png')
    sidekick.create_dataset(
        dataset_path,
        dataset_index,
        preprocess={'image_column': set_image_format}
    )

    # Assert that the metadata file was added
    with zipfile.ZipFile(dataset_path, 'r') as zf:
        metadata = zf.read('metadata.json')
        assert metadata == b'{ "source" : "sidekick" }'
def test_create_dataset_parallel(dataset_index, tmpdir):
    # Create dataset
    dataset_path = str(tmpdir.join('dataset.zip'))
    resize_image = functools.partial(
        sidekick.process_image, mode='resize', size=(32, 8),
        file_format='png')
    set_image_format = functools.partial(
        sidekick.process_image, file_format='png')
    sidekick.create_dataset(
        dataset_path,
        dataset_index,
        path_columns=['image_file_column', 'image_file_process_column'],
        preprocess={
            'image_file_process_column': resize_image,
            'image_column': set_image_format
        },
        progress=False,
        parallel_processing=10)
    assert os.path.exists(dataset_path) and os.path.getsize(dataset_path) > 100
def create_ham_dataset(
        directory: str = None,
        size: Tuple[int, int] = (224, 224),
        split: float = 0.8,
        balance: bool = True,
) -> None:
    """Creates a Peltarion-compatible zip file with the HAM10000 dataset.

    The HAM10000 dataset contains labeled images of different types of
    skin lesions. Read more here: https://arxiv.org/abs/1803.10417.

    All data is provided under the terms of the Creative Commons
    Attribution-NonCommercial (CC BY-NC) 4.0 license. You may find the
    terms of the license here:
    https://creativecommons.org/licenses/by-nc/4.0/legalcode.
    If you are unable to accept the terms of this license, do not
    download or use this data. Please note that the disclaimer in the
    README.md applies.

    Args:
        directory: Directory where the dataset will be stored. If not
            provided, it defaults to the current working directory.
        size: Image size after resizing: (width, height). The original
            image size is (600, 450).
        split: Split fraction between training and validation.
        balance: Balance training dataset by oversampling.
    """
    images_dir = 'ISIC2018_Task3_Training_Input'
    metadata_dir = 'ISIC2018_Task3_Training_GroundTruth'
    metadata_file = 'ISIC2018_Task3_Training_GroundTruth.csv'
    metadata_url = 'https://challenge.kitware.com/api/v1/item/' \
                   '5ac20eeb56357d4ff856e136/download'
    images_url = 'https://challenge.kitware.com/api/v1/item/' \
                 '5ac20fc456357d4ff856e139/download'

    if directory is None:
        directory = os.getcwd()
    if not os.path.isdir(directory):
        sys.exit('Directory provided does not exist')
    dataset_path = os.path.join(directory, 'ham_dataset.zip')

    with tempfile.TemporaryDirectory() as tmpdir:
        print('Downloading metadata...')
        metadata, _ = urlretrieve(metadata_url)
        with ZipFile(metadata) as zip_handle:
            zip_handle.extractall(tmpdir)

        print('Downloading images. This step might take some time.')
        images, _ = urlretrieve(images_url)
        with ZipFile(images) as zip_handle:
            zip_handle.extractall(tmpdir)

        # Read metadata
        df = pd.read_csv(os.path.join(tmpdir, metadata_dir, metadata_file))

        # Decode one-hot encoding into a single target column
        categories = ['MEL', 'NV', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC']
        df['target'] = df[categories].idxmax(axis=1)
        df = df.drop(categories, axis=1)

        # Split dataset into train and validation
        rows = df.shape[0]
        subset = ['train' if random() < split else 'val'
                  for _ in range(rows)]
        df['subset'] = subset
        if balance:
            train = df[df['subset'] == 'train']
            val = df[df['subset'] == 'val']
            train_balanced = balance_dataset(train, 'target')
            df = pd.concat([train_balanced, val], ignore_index=True)

        # Replace image name by image path
        df['image'] = df['image'].apply(
            lambda x: os.path.join(tmpdir, images_dir, x + '.jpg'))

        image_processor = functools.partial(
            sidekick.process_image, mode='resize', size=size,
            file_format='jpeg'
        )
        print('Creating dataset...')
        sidekick.create_dataset(
            dataset_path,
            df,
            path_columns=['image'],
            preprocess={'image': image_processor},
            progress=True,
        )
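# `balance_dataset` is called by create_ham_dataset but not defined in this
# snippet. A minimal sketch, assuming "balance by oversampling" means
# resampling every class up to the size of the most frequent class; the
# signature matches the call site above, but the body is an assumption.
def balance_dataset(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Hypothetical helper: oversample each class in `column` so that all
    classes end up with as many rows as the most frequent one."""
    max_count = df[column].value_counts().max()
    groups = [
        group.sample(n=max_count, replace=True)
        for _, group in df.groupby(column)
    ]
    return pd.concat(groups, ignore_index=True)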