Example #1
def _write_data_to_folder(arr, labels, folder):
    for i, (img, label) in enumerate(zip(arr, labels.detach().numpy())):
        dest = os.path.join(folder, str(label))
        make_folder_if_not_exists(dest)
        Image.fromarray(img.numpy(),
                        mode='L').save(os.path.join(dest, str(i) + '.png'))
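A minimal usage sketch for this helper, assuming MNIST-style torch tensors and that the imports used above (os, PIL.Image, make_folder_if_not_exists) are in scope; the output path is hypothetical:

import torch

arr = torch.zeros(10, 28, 28, dtype=torch.uint8)  # ten dummy 28x28 grayscale images
labels = torch.arange(10)                         # one label per image
_write_data_to_folder(arr, labels, '/tmp/out')    # writes /tmp/out/<label>/<i>.png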
Example #2
def _write_data_to_folder(zipfile, filenames, labels, folder, start_index,
                          isTest):
    print("Writing data\n")
    sorted_labels = [None] * len(labels)
    if isTest == 1:
        # Drop the known problematic file from the archive listing.
        for entry in zipfile.infolist():
            if "IRHT_P_009793.tif" in entry.filename:
                zipfile.infolist().remove(entry)
                break

    zip_infolist = zipfile.infolist()[1:]

    # Align the labels with the order of the archive entries.
    for i, entry in enumerate(zip_infolist):
        entry_index_in_filenames = filenames.index(entry.filename[start_index:])
        sorted_labels[i] = labels[entry_index_in_filenames]

    for i, (entry, label) in enumerate(zip(zipfile.infolist()[1:],
                                           sorted_labels)):
        with zipfile.open(entry) as file:
            img = Image.open(file)
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            img.save(os.path.join(dest, str(i) + '.png'), "PNG", quality=100)
Example #3
def _write_data_to_folder(zipfile, labels, folder, isTrainingset):
    print("Writing data to folder\n")
    for i, (entry, label) in enumerate(zip(zipfile.infolist()[1:], labels)):
        with zipfile.open(entry) as file:
            img = Image.open(file)
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            if isTrainingset == 1:
                img.save(os.path.join(dest, str(i) + '.png'))
            else:
                img.save(os.path.join(dest, str(i) + '.jpg'))
Example #4
def _write_data_to_folder(data, labels, folder, classes):
    dest = os.path.join(folder, 'images')
    make_folder_if_not_exists(dest)
    for image, label in zip(data, labels):
        shutil.copy(image, dest)

    rows = np.column_stack(
        ([os.path.join('images', os.path.basename(item)) for item in data],
         labels))
    rows = sorted(rows, key=lambda e: int(e[0].split('/')[1].split('.')[0]))
    output_csv = pd.DataFrame(rows)
    output_csv.to_csv(os.path.join(folder, 'labels.csv'),
                      header=classes,
                      index=False)
    return
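The labels.csv written above is a plain pandas table whose first column is the relative image path; a short read-back sketch (the folder path is hypothetical, the column layout follows the classes header used above):

import os
import pandas as pd

df = pd.read_csv(os.path.join('MIML/train', 'labels.csv'))
print(df.columns.tolist())     # ['filename', <one column per class>, ...]
print(df.iloc[0]['filename'])  # e.g. 'images/1.jpg'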
Example #5
def svhn(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the SVHN dataset to the location specified
    on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular, it is necessary to provide
        output_folder as a string containing the path where the dataset will be downloaded

    Returns
    -------
        None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.SVHN(root=args.output_folder,
                              split='train',
                              download=True)
    torchvision.datasets.SVHN(root=args.output_folder,
                              split='test',
                              download=True)

    # Load the data into memory
    train = _loadmat(os.path.join(args.output_folder, 'train_32x32.mat'))
    train_data, train_labels = train['X'], train['y'].astype(
        np.int64).squeeze()
    np.place(train_labels, train_labels == 10, 0)
    train_data = np.transpose(train_data, (3, 0, 1, 2))

    test = _loadmat(os.path.join(args.output_folder, 'test_32x32.mat'))
    test_data, test_labels = test['X'], test['y'].astype(np.int64).squeeze()
    np.place(test_labels, test_labels == 10, 0)
    test_data = np.transpose(test_data, (3, 0, 1, 2))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'SVHN')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    os.remove(os.path.join(args.output_folder, 'train_32x32.mat'))
    os.remove(os.path.join(args.output_folder, 'test_32x32.mat'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
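Since svhn only reads args.output_folder, a hedged invocation sketch using argparse.Namespace (any object with an output_folder attribute would do; the path is hypothetical):

from argparse import Namespace

# Downloads SVHN, writes one PNG per image into <label> subfolders,
# then carves a 20% validation split out of the training set.
svhn(Namespace(output_folder='/tmp/datasets'))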
Example #6
def cifar10(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the CIFAR-10 dataset to the location specified
    on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular, it is necessary to provide
        output_folder as a string containing the path where the dataset will be downloaded

    Returns
    -------
        None
    """
    # Use torchvision to download the dataset
    cifar_train = torchvision.datasets.CIFAR10(root=args.output_folder,
                                               train=True,
                                               download=True)
    cifar_test = torchvision.datasets.CIFAR10(root=args.output_folder,
                                              train=False,
                                              download=True)

    # Load the data into memory
    train_data, train_labels = cifar_train.data, cifar_train.targets

    test_data, test_labels = cifar_test.data, cifar_test.targets

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'CIFAR10')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    os.remove(os.path.join(args.output_folder, 'cifar-10-python.tar.gz'))
    shutil.rmtree(os.path.join(args.output_folder, 'cifar-10-batches-py'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
Example #7
def mnist(output_folder, **kwargs):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the MNIST dataset to the location specified
    on the file system

    Parameters
    ----------
    output_folder : str
        Path where the dataset will be downloaded and prepared

    Returns
    -------
        None
    """
    # Use torchvision to download the dataset
    torchvision.datasets.MNIST(root=output_folder, download=True)

    # Load the data into memory
    train_data, train_labels = torch.load(
        os.path.join(output_folder, 'MNIST', 'processed', 'training.pt'))
    test_data, test_labels = torch.load(
        os.path.join(output_folder, 'MNIST', 'processed', 'test.pt'))

    # Make output folders
    dataset_root = os.path.join(output_folder, 'MNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels.detach().numpy())):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img.numpy(),
                            mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    shutil.rmtree(os.path.join(output_folder, 'MNIST', 'raw'))
    shutil.rmtree(os.path.join(output_folder, 'MNIST', 'processed'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
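This variant takes the output folder directly; a minimal call sketch (path hypothetical). Note that recent torchvision releases no longer create the processed/*.pt files this code loads, so an older torchvision is assumed:

# Fetches MNIST and lays it out as <output_folder>/MNIST/{train,val,test}/<label>/<i>.png
mnist(output_folder='/tmp/datasets')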
Example #8
def cifar10(output_folder, **kwargs):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the CIFAR-10 dataset to the location specified
    on the file system

    Parameters
    ----------
    output_folder : str
        Path to folder where to put the dataset

    Returns
    -------
        None
    """
    # Make output folders
    dataset_root = os.path.join(output_folder, 'CIFAR10')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    if Path(dataset_root).exists():
        print(f"Path ({dataset_root}) already exists. Nothing done")
        return

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    # Use torchvision to download the dataset
    cifar_train = torchvision.datasets.CIFAR10(root=output_folder,
                                               train=True,
                                               download=True)
    cifar_test = torchvision.datasets.CIFAR10(root=output_folder,
                                              train=False,
                                              download=True)

    # Load the data into memory
    train_data, train_labels = cifar_train.data, cifar_train.targets

    test_data, test_labels = cifar_test.data, cifar_test.targets

    # Replace numbers with text for class names
    class_names_mapping = {
        0: 'plane',
        1: 'car',
        2: 'bird',
        3: 'cat',
        4: 'deer',
        5: 'dog',
        6: 'frog',
        7: 'horse',
        8: 'ship',
        9: 'truck'
    }
    train_labels = [class_names_mapping[l] for l in train_labels]
    test_labels = [class_names_mapping[l] for l in test_labels]

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img).save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    os.remove(os.path.join(output_folder, 'cifar-10-python.tar.gz'))
    shutil.rmtree(os.path.join(output_folder, 'cifar-10-batches-py'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
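With the textual class names substituted, the class folders get readable names; a sketch of the expected layout (paths hypothetical):

import os

cifar10(output_folder='/tmp/datasets')
print(sorted(os.listdir('/tmp/datasets/CIFAR10/train')))
# ['bird', 'car', 'cat', 'deer', 'dog', 'frog', 'horse', 'plane', 'ship', 'truck']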
Example #9
def glas(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the tubule dataset (from the GlaS challenge) for semantic
    segmentation to the location specified on the file system

    See also: https://github.com/choosehappy/public/tree/master/DL%20tutorial%20Code/3-tubule

    Output folder structure: ../HisDB/GlaS/train
                             ../HisDB/GlaS/val
                             ../HisDB/GlaS/test

                             ../HisDB/GlaS/test/data -> images
                             ../HisDB/GlaS/test/gt   -> pixel-wise annotated ground truth

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular, it is necessary to provide
        output_folder as a string containing the path where the dataset will be downloaded

    Returns
    -------
        None
    """
    def groupby_patient(list_to_group, index=3):
        """
        split images by patient
        :param list_to_group: list of image names
        :param index: position of split by '-' in the image name to obtain patient ID
        :return:  dictionary where keys are patient IDs and values are lists of images that are from that patient
        """
        return {
            '-'.join(filename.split('-')[:index]): [
                file for file in list_to_group
                if '-'.join(file.split('-')[:index]) == '-'.join(
                    filename.split('-')[:index])
            ]
            for filename in list_to_group
        }

    def convert_gt(img_path):
        img = np.array(pil_loader(img_path))
        out_img = np.zeros(img.shape, dtype=np.uint8)
        out_img[:, :, 2] = 1  # set everything to background in blue channel
        out_img[:, :, 2][img[:, :, 2] != 0] = 2  # set glands to 2 in blue channel

        out = Image.fromarray(out_img)
        out.save(img_path)

    # make the root folder
    output_folder = args.output_folder
    dataset_root = os.path.join(output_folder, 'GlaS')
    make_folder_if_not_exists(dataset_root)

    # links to HisDB data sets
    link_tubules = urllib.parse.urlparse(
        'https://warwick.ac.uk/fac/sci/dcs/research/tia/glascontest/download/warwick_qu_dataset_released_2016_07_08.zip'
    )

    download_path_tubules = os.path.join(
        dataset_root,
        link_tubules.geturl().rsplit('/', 1)[-1])

    # download files
    print('Downloading {}...'.format(link_tubules.geturl()))
    urllib.request.urlretrieve(link_tubules.geturl(), download_path_tubules)

    print('Download complete. Unpacking files...')

    # unpack tubule folder that contains images, annotations and text files with lists of benign and malignant samples
    zip_file = zipfile.ZipFile(download_path_tubules)
    zip_file.extractall(path=dataset_root)

    sets_dict = {}

    # the official GlaS training images (filenames prefixed with 'train_')
    train_ids = ['train_']
    sets_dict['train'] = train_ids

    # use the official test A images as the validation set
    val_ids = ['testA_']
    sets_dict['val'] = val_ids

    # use the official test B images as the test set
    test_ids = ['testB_']
    sets_dict['test'] = test_ids

    print('Splitting the dataset into train, val and test')

    img_file_path = os.path.join(dataset_root,
                                 "Warwick QU Dataset (Released 2016_07_08)")

    for s in ['train', 'test', 'val']:
        make_folder_if_not_exists(os.path.join(dataset_root, s, 'gt'))
        make_folder_if_not_exists(os.path.join(dataset_root, s, 'data'))

        # print('CREATING {} SET'.format(s))

        for pattern in sets_dict[s]:
            for img_file in os.listdir(img_file_path):
                if pattern in img_file:
                    if 'anno' in img_file:
                        # convert gt into correct data format
                        convert_gt(os.path.join(img_file_path, img_file))
                        out_file = os.path.join('gt',
                                                img_file.replace('_anno', ''))
                    else:
                        out_file = os.path.join('data', img_file)

                    shutil.move(os.path.join(img_file_path, img_file),
                                os.path.join(dataset_root, s, out_file))
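The groupby_patient helper buckets filenames by the prefix before the index-th hyphen; a small self-contained illustration of the mapping it builds (filenames hypothetical):

files = ['09-1339-01-crop1.png', '09-1339-01-crop2.png', '10-12813-05-crop1.png']
groups = {
    '-'.join(f.split('-')[:3]):
        [g for g in files if '-'.join(g.split('-')[:3]) == '-'.join(f.split('-')[:3])]
    for f in files
}
print(groups)
# {'09-1339-01': ['09-1339-01-crop1.png', '09-1339-01-crop2.png'],
#  '10-12813-05': ['10-12813-05-crop1.png']}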
Example #10
def miml(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the Multi-Instance Multi-Label Image Dataset
    on the file system. Dataset available at: http://lamda.nju.edu.cn/data_MIMLimage.ashx

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular, it is necessary to provide
        output_folder as a string containing the path where the dataset will be downloaded

    Returns
    -------
        None
    """
    # Download the files
    url = 'http://lamda.nju.edu.cn/files/miml-image-data.rar'
    if not os.path.exists(
            os.path.join(args.output_folder, 'miml-image-data.rar')):
        print('Downloading file!')
        filename = wget.download(url, out=args.output_folder)
    else:
        print('File already downloaded!')
        filename = os.path.join(args.output_folder, 'miml-image-data.rar')

    # Extract the files
    path_to_rar = filename
    path_to_output = os.path.join(args.output_folder, 'tmp_miml')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'original.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'processed.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    print('Extracted files...')

    # Load the mat file
    mat = _loadmat(os.path.join(path_to_output, 'miml data.mat'))
    targets = mat['targets'].T
    classes = [item[0][0] for item in mat['class_name']]
    # Add filename at 0-index to correctly format the CSV headers
    classes.insert(0, 'filename')

    # Get list of all image files in the folder
    images = [
        item
        for item in _get_all_files_in_folders_and_subfolders(path_to_output)
        if item.endswith('jpg')
    ]
    images = sorted(images,
                    key=lambda e: int(os.path.basename(e).split('.')[0]))

    # Make splits
    train_data, test_data, train_labels, test_labels = _train_test_split(
        images, targets, test_size=0.2, random_state=42)
    train_data, val_data, train_labels, val_labels = _train_test_split(
        train_data, train_labels, test_size=0.2, random_state=42)

    # print('Size of splits\ntrain:{}\nval:{}\ntest:{}'.format(len(train_data),
    #                                                     len(val_data),
    #                                                     len(test_data)))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'MIML')
    train_folder = os.path.join(dataset_root, 'train')
    val_folder = os.path.join(dataset_root, 'val')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(val_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(data, labels, folder, classes):
        dest = os.path.join(folder, 'images')
        make_folder_if_not_exists(dest)
        for image, label in zip(data, labels):
            shutil.copy(image, dest)

        rows = np.column_stack(
            ([os.path.join('images', os.path.basename(item))
              for item in data], labels))
        rows = sorted(rows,
                      key=lambda e: int(e[0].split('/')[1].split('.')[0]))
        output_csv = pd.DataFrame(rows)
        output_csv.to_csv(os.path.join(folder, 'labels.csv'),
                          header=classes,
                          index=False)
        return

    # Write the images to the correct folders
    print('Writing the data to the filesystem')
    _write_data_to_folder(train_data, train_labels, train_folder, classes)
    _write_data_to_folder(val_data, val_labels, val_folder, classes)
    _write_data_to_folder(test_data, test_labels, test_folder, classes)

    os.remove(filename)
    shutil.rmtree(path_to_output)
    print('All done!')
    return
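The two nested _train_test_split calls above yield a 64/16/20 train/val/test split; a quick check of the proportions, assuming sklearn-style train_test_split semantics (which _train_test_split appears to wrap):

from sklearn.model_selection import train_test_split

data = list(range(2000))
train, test = train_test_split(data, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)
print(len(train), len(val), len(test))  # 1280 320 400 -> 64% / 16% / 20%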
Example #11
def kmnist(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the K-MNIST dataset to the location specified
    on the file system

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular, it is necessary to provide
        output_folder as a string containing the path where the dataset will be downloaded

    Returns
    -------
        None
    """
    def get_int(b):
        return int(codecs.encode(b, 'hex'), 16)

    def read_image_file(path):
        with open(path, 'rb') as f:
            data = f.read()
            assert get_int(data[:4]) == 2051
            length = get_int(data[4:8])
            num_rows = get_int(data[8:12])
            num_cols = get_int(data[12:16])
            parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
            return torch.from_numpy(parsed).view(length, num_rows, num_cols)

    def read_label_file(path):
        with open(path, 'rb') as f:
            data = f.read()
            assert get_int(data[:4]) == 2049
            length = get_int(data[4:8])
            parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
            return torch.from_numpy(parsed).view(length).long()

    try:
        torchvision.datasets.KMNIST(root=args.output_folder, download=True)

    except AttributeError:
        url_list = [
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz'
        ]

        raw_folder = os.path.join(args.output_folder, 'raw')
        processed_folder = os.path.join(args.output_folder, 'processed')
        make_folder_if_not_exists(raw_folder)
        make_folder_if_not_exists(processed_folder)

        training_file = 'training.pt'
        test_file = 'test.pt'

        for url in url_list:
            print('Downloading ' + url)
            data = urllib.request.urlopen(url)
            filename = url.rpartition('/')[2]
            file_path = os.path.join(raw_folder, filename)
            with open(file_path, 'wb') as f:
                f.write(data.read())
            with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                    gzip.GzipFile(file_path) as zip_f:
                out_f.write(zip_f.read())
            os.unlink(file_path)

        # process and save as torch files
        print('Processing...')

        training_set = (
            read_image_file(os.path.join(raw_folder, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_folder, 'train-labels-idx1-ubyte')))
        test_set = (
            read_image_file(os.path.join(raw_folder, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_folder, 't10k-labels-idx1-ubyte')))
        with open(os.path.join(processed_folder, training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(processed_folder, test_file), 'wb') as f:
            torch.save(test_set, f)

        print('Done!')

    # Load the data into memory
    train_data, train_labels = torch.load(
        os.path.join(args.output_folder, 'processed', 'training.pt'))
    test_data, test_labels = torch.load(
        os.path.join(args.output_folder, 'processed', 'test.pt'))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'KMNIST')
    train_folder = os.path.join(dataset_root, 'train')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(arr, labels, folder):
        for i, (img, label) in enumerate(zip(arr, labels)):
            dest = os.path.join(folder, str(label))
            make_folder_if_not_exists(dest)
            Image.fromarray(img.numpy(),
                            mode='L').save(os.path.join(dest, str(i) + '.png'))

    # Write the images to the folders
    _write_data_to_folder(train_data, train_labels, train_folder)
    _write_data_to_folder(test_data, test_labels, test_folder)

    shutil.rmtree(os.path.join(args.output_folder, 'raw'))
    shutil.rmtree(os.path.join(args.output_folder, 'processed'))

    split_dataset(dataset_folder=dataset_root, split=0.2, symbolic=False)
    print("The KMNIST dataset is ready for you at {}".format(dataset_root))
Example #12
def historical_wi(args):

    train_binarized_url = "ftp://scruffy.caa.tuwien.ac.at/staff/database/icdar2017/icdar17-historicalwi-training-binarized.zip"
    train_colored_url = "ftp://scruffy.caa.tuwien.ac.at/staff/database/icdar2017/icdar17-historicalwi-training-color.zip"
    test_binarized_url = "https://zenodo.org/record/854353/files/ScriptNet-HistoricalWI-2017-binarized.zip?download=1"
    test_colored_url = "https://zenodo.org/record/854353/files/ScriptNet-HistoricalWI-2017-color.zip?download=1"
    urls = [
        train_binarized_url, train_colored_url, test_binarized_url,
        test_colored_url
    ]

    zip_name_train_binarized = "icdar17-historicalwi-training-binarized.zip"
    zip_name_train_color = "icdar17-historicalwi-training-color.zip"
    zip_name_test_binarized = "ScriptNet-HistoricalWI-2017-binarized.zip"
    zip_name_test_color = "ScriptNet-HistoricalWI-2017-color.zip"
    zip_names = [
        zip_name_train_binarized, zip_name_train_color,
        zip_name_test_binarized, zip_name_test_color
    ]
    start_indices = [
        len("icdar2017-training-binary/"),
        len("icdar2017-training-color/"),
        len("ScriptNet-HistoricalWI-2017-binarized/"),
        len("ScriptNet-HistoricalWI-2017-color/")
    ]

    # Make output folders
    """
    dataset_root = os.path.join(args.output_folder)
    train_folder = os.path.join(dataset_root, 'train')
    train_binarized_folder = os.path.join(train_folder, 'Binarized')
    train_colored_folder = os.path.join(train_folder, 'Color')
    test_folder = os.path.join(dataset_root, 'test')
    test_binarized_folder = os.path.join(test_folder, 'Binarized')
    test_colored_folder = os.path.join(test_folder, 'Color')
    folders = [train_binarized_folder, train_colored_folder, test_binarized_folder, test_colored_folder]

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(train_binarized_folder)
    make_folder_if_not_exists(train_colored_folder)
    make_folder_if_not_exists(test_folder)
    make_folder_if_not_exists(test_binarized_folder)
    make_folder_if_not_exists(test_colored_folder)
    """
    dataset_root = os.path.join(args.output_folder, 'historical_wi')
    binarized_dataset = os.path.join(dataset_root, "BinarizedDataset")
    train_binarized_folder = os.path.join(binarized_dataset, 'train')
    test_binarized_folder = os.path.join(binarized_dataset, 'test')
    colored_dataset = os.path.join(dataset_root, "ColoredDataset")
    train_colored_folder = os.path.join(colored_dataset, 'train')
    test_colored_folder = os.path.join(colored_dataset, 'test')
    folders = [
        train_binarized_folder, train_colored_folder, test_binarized_folder,
        test_colored_folder
    ]

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(binarized_dataset)
    make_folder_if_not_exists(colored_dataset)
    make_folder_if_not_exists(train_binarized_folder)
    make_folder_if_not_exists(train_colored_folder)
    make_folder_if_not_exists(test_binarized_folder)
    make_folder_if_not_exists(test_colored_folder)

    def _write_data_to_folder(zipfile, labels, folder, isTrainingset):
        print("Writing data to folder\n")
        for i, (entry, label) in enumerate(zip(zipfile.infolist()[1:],
                                               labels)):
            with zipfile.open(entry) as file:
                img = Image.open(file)
                dest = os.path.join(folder, str(label))
                make_folder_if_not_exists(dest)
                if isTrainingset == 1:
                    img.save(os.path.join(dest, str(i) + '.png'))
                else:
                    img.save(os.path.join(dest, str(i) + '.jpg'))

    def _get_labels(zipfile, start_index):
        print("Extracting labels\n")
        labels = []
        for zipinfo in zipfile.infolist()[1:]:
            file_name = zipinfo.filename
            ind = file_name.find("-", start_index)
            labels.append(file_name[start_index:ind])
        return labels

    # Prepare the datasets

    for i in range(len(urls)):
        # the first two URLs are the training sets
        isTrainingset = 1 if i < 2 else 0

        print("Downloading " + urls[i])
        local_filename, headers = urllib.request.urlretrieve(
            urls[i], zip_names[i])
        zfile = zipfile.ZipFile(local_filename)
        labels = _get_labels(zfile, start_indices[i])
        _write_data_to_folder(zfile, labels, folders[i], isTrainingset)
        os.remove(zfile.filename)
        if i == 0:
            print("Binary training data is ready!")
        elif i == 1:
            print("Colored training data is ready!")
        elif i == 2:
            print("Binary test data is ready!")
        else:
            print("Colored test data is ready!")

    split_dataset_writerIdentification(dataset_folder=dataset_root, split=0.2)

    print("Historical WI dataset is ready!")
Example #13
def icdar2017_clamm(args):

    url = "http://clamm.irht.cnrs.fr/wp-content/uploads/ICDAR2017_CLaMM_Training.zip"
    print("Downloading " + url)
    zip_name = "ICDAR2017_CLaMM_Training.zip"
    local_filename, headers = urllib.request.urlretrieve(url, zip_name)
    zfile = zipfile.ZipFile(local_filename)

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'ICDAR2017-CLAMM')
    dataset_manuscriptDating = os.path.join(dataset_root, 'ManuscriptDating')
    dataset_md_train = os.path.join(dataset_manuscriptDating, 'train')
    dataset_styleClassification = os.path.join(dataset_root,
                                               'StyleClassification')
    dataset_sc_train = os.path.join(dataset_styleClassification, 'train')
    test_sc_folder = os.path.join(dataset_styleClassification, 'test')
    test_md_folder = os.path.join(dataset_manuscriptDating, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(dataset_manuscriptDating)
    make_folder_if_not_exists(dataset_styleClassification)
    make_folder_if_not_exists(test_sc_folder)

    def _write_data_to_folder(zipfile, filenames, labels, folder, start_index,
                              isTest):
        print("Writing data\n")
        sorted_labels = [None] * len(labels)
        if isTest == 1:
            for i in range(len(zipfile.infolist())):
                entry = zipfile.infolist()[i]
                if "IRHT_P_009793.tif" in entry.filename:
                    zipfile.infolist().remove(entry)
                    break

        zip_infolist = zipfile.infolist()[1:]

        for i in range(len(zip_infolist)):
            entry = zip_infolist[i]
            entry_index_infilenames = filenames.index(
                entry.filename[start_index:])
            sorted_labels[i] = labels[entry_index_infilenames]

        for i, (entry, label) in enumerate(zip(zipfile.infolist()[1:],
                                               sorted_labels)):
            with zipfile.open(entry) as file:
                img = Image.open(file)
                dest = os.path.join(folder, str(label))
                make_folder_if_not_exists(dest)
                img.save(os.path.join(dest, str(i) + '.png'),
                         "PNG",
                         quality=100)

    def getLabels(zfile):
        print("Extracting labels\n")
        filenames, md_labels, sc_labels = [], [], []
        zip_infolist = zfile.infolist()[1:]
        for entry in zip_infolist:
            if '.csv' in entry.filename:
                with zfile.open(entry) as file:
                    cf = file.read()
                    # io.StringIO, not csv.StringIO (the csv module has no StringIO)
                    c = io.StringIO(cf.decode())
                    next(c)  # skip the first line, which is the header of the csv file
                    for row in c:
                        md_label_strt_ind = row.rfind(';')
                        md_label_end_ind = row.rfind("\r")
                        md_labels.append(row[md_label_strt_ind + 1:md_label_end_ind])
                        sc_labels_strt_ind = row[:md_label_strt_ind].rfind(';')
                        sc_labels.append(row[sc_labels_strt_ind + 1:md_label_strt_ind])
                        filename_ind = row[:sc_labels_strt_ind].rfind(';')

                        if filename_ind > -1:
                            f_name = row[filename_ind + 1:sc_labels_strt_ind]
                        else:
                            f_name = row[:sc_labels_strt_ind]
                        if isTest == 1 and f_name == 'IRHT_P_009783.tif':
                            print('No file named ' + f_name +
                                  ". This filename will not be added!")
                        else:
                            filenames.append(f_name)

                zfile.infolist().remove(entry)  # remove the csv file from infolist
            if '.db' in entry.filename:  # remove the db file from infolist
                zfile.infolist().remove(entry)
        return filenames, sc_labels, md_labels

    isTest = 0
    filenames, sc_labels, md_labels = getLabels(zfile)
    start_index_training = len("ICDAR2017_CLaMM_Training/")
    print("Training data is being prepared for style classification!\n")
    _write_data_to_folder(zfile, filenames, sc_labels, dataset_sc_train,
                          start_index_training, isTest)
    print("Training data is being prepared for manuscript dating!\n")
    _write_data_to_folder(zfile, filenames, md_labels, dataset_md_train,
                          start_index_training, isTest)

    os.remove(zfile.filename)

    url = "http://clamm.irht.cnrs.fr/wp-content/uploads/ICDAR2017_CLaMM_task1_task3.zip"
    print("Downloading " + url)
    zip_name_test = "ICDAR2017_CLaMM_task1_task3.zip"
    local_filename_test, headers_test = urllib.request.urlretrieve(
        url, zip_name_test)
    zfile_test = zipfile.ZipFile(local_filename_test)

    isTest = 1
    filenames_test, sc_test_labels, md_test_labels = getLabels(zfile_test)
    start_index_test = len("ICDAR2017_CLaMM_task1_task3/")
    print("Test data is being prepared for style classification!\n")
    _write_data_to_folder(zfile_test, filenames_test, sc_test_labels,
                          test_sc_folder, start_index_test, 1)
    print("Test data is being prepared for manuscript dating!\n")
    _write_data_to_folder(zfile_test, filenames_test, md_test_labels,
                          test_md_folder, start_index_test, 1)

    os.remove(zfile_test.filename)
    print("Training-Validation splitting\n")
    split_dataset(dataset_folder=dataset_manuscriptDating,
                  split=0.2,
                  symbolic=False)
    split_dataset(dataset_folder=dataset_styleClassification,
                  split=0.2,
                  symbolic=False)
    print("ICDAR2017 CLaMM data is ready!")
Example #14
def diva_hisdb(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the DIVA HisDB-all dataset for semantic segmentation to the location specified
    on the file system

    See also: https://diuf.unifr.ch/main/hisdoc/diva-hisdb

    Output folder structure: ../HisDB/CB55/train
                             ../HisDB/CB55/val
                             ../HisDB/CB55/test

                             ../HisDB/CB55/test/data -> images
                             ../HisDB/CB55/test/gt   -> pixel-wise annotated ground truth

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular, it is necessary to provide
        output_folder as a string containing the path where the dataset will be downloaded

    Returns
    -------
        None
    """
    # make the root folder
    dataset_root = os.path.join(args.output_folder, 'HisDB')
    make_folder_if_not_exists(dataset_root)

    # links to HisDB data sets
    link_public = urllib.parse.urlparse(
        'https://diuf.unifr.ch/main/hisdoc/sites/diuf.unifr.ch.main.hisdoc/files/uploads/diva-hisdb/hisdoc/all.zip'
    )
    link_test_private = urllib.parse.urlparse(
        'https://diuf.unifr.ch/main/hisdoc/sites/diuf.unifr.ch.main.hisdoc/files/uploads/diva-hisdb/hisdoc/private-test/all-privateTest.zip'
    )
    download_path_public = os.path.join(
        dataset_root,
        link_public.geturl().rsplit('/', 1)[-1])
    download_path_private = os.path.join(
        dataset_root,
        link_test_private.geturl().rsplit('/', 1)[-1])

    # download files
    print('Downloading {}...'.format(link_public.geturl()))
    urllib.request.urlretrieve(link_public.geturl(), download_path_public)

    print('Downloading {}...'.format(link_test_private.geturl()))
    urllib.request.urlretrieve(link_test_private.geturl(),
                               download_path_private)
    print('Download complete. Unpacking files...')

    # unpack relevant folders
    zip_file = zipfile.ZipFile(download_path_public)

    # unpack imgs and gt
    data_gt_zip = {
        f: re.sub(r'img', 'pixel-level-gt', f)
        for f in zip_file.namelist() if 'img' in f
    }
    dataset_folders = [
        data_file.split('-')[-1][:-4] for data_file in data_gt_zip.keys()
    ]
    for data_file, gt_file in data_gt_zip.items():
        dataset_name = data_file.split('-')[-1][:-4]
        dataset_folder = os.path.join(dataset_root, dataset_name)
        make_folder_if_not_exists(dataset_folder)

        for file in [data_file, gt_file]:
            zip_file.extract(file, dataset_folder)
            with zipfile.ZipFile(os.path.join(dataset_folder, file),
                                 "r") as zip_ref:
                zip_ref.extractall(dataset_folder)
                # delete zips
                os.remove(os.path.join(dataset_folder, file))

        # create folder structure
        for partition in ['train', 'val', 'test', 'test-public']:
            for folder in ['data', 'gt']:
                make_folder_if_not_exists(
                    os.path.join(dataset_folder, partition, folder))

    # move the files to the correct place
    for folder in dataset_folders:
        for k1, v1 in {'pixel-level-gt': 'gt', 'img': 'data'}.items():
            for k2, v2 in {
                    'public-test': 'test-public',
                    'training': 'train',
                    'validation': 'val'
            }.items():
                current_path = os.path.join(dataset_root, folder, k1, k2)
                new_path = os.path.join(dataset_root, folder, v2, v1)
                for f in [
                        f for f in os.listdir(current_path)
                        if os.path.isfile(os.path.join(current_path, f))
                ]:
                    shutil.move(os.path.join(current_path, f),
                                os.path.join(new_path, f))
            # remove old folders
            shutil.rmtree(os.path.join(dataset_root, folder, k1))

    # fix naming issue
    for old, new in {'CS18': 'CSG18', 'CS863': 'CSG863'}.items():
        os.rename(os.path.join(dataset_root, old),
                  os.path.join(dataset_root, new))

    # unpack private test folders
    zip_file_private = zipfile.ZipFile(download_path_private)

    data_gt_zip_private = {
        f: re.sub(r'img', 'pixel-level-gt', f)
        for f in zip_file_private.namelist() if 'img' in f
    }

    for data_file, gt_file in data_gt_zip_private.items():
        dataset_name = re.search('-(.*)-', data_file).group(1)
        dataset_folder = os.path.join(dataset_root, dataset_name)

        for file in [data_file, gt_file]:
            zip_file_private.extract(file, dataset_folder)
            with zipfile.ZipFile(os.path.join(dataset_folder, file),
                                 "r") as zip_ref:
                zip_ref.extractall(os.path.join(dataset_folder, file[:-4]))
            # delete zip
            os.remove(os.path.join(dataset_folder, file))

        # create folder structure
        for folder in ['data', 'gt']:
            make_folder_if_not_exists(
                os.path.join(dataset_folder, 'test', folder))

        for old, new in {'pixel-level-gt': 'gt', 'img': 'data'}.items():
            current_path = os.path.join(
                dataset_folder, "{}-{}-privateTest".format(old, dataset_name),
                dataset_name)
            new_path = os.path.join(dataset_folder, "test", new)
            for f in [
                    f for f in os.listdir(current_path)
                    if os.path.isfile(os.path.join(current_path, f))
            ]:
                # the ground truth files in the private test set have an additional ending, which needs to be removed
                if new == "gt":
                    f_new = re.sub('_gt', r'', f)
                else:
                    f_new = f
                shutil.move(os.path.join(current_path, f),
                            os.path.join(new_path, f_new))

            # remove old folders
            shutil.rmtree(os.path.dirname(current_path))

    print('Finished. Data set up at {}.'.format(dataset_root))


def split_dataset_segmentation(dataset_folder, split, symbolic, test=False):
    """
    Partition a dataset into train/val(/test) splits on the filesystem, for segmentation datasets organized
    as dataset/data for the images and dataset/gt for the ground truth. Corresponding image and ground-truth
    files must share the same name.

    Parameters
    ----------
    dataset_folder : str
        Path to the dataset folder (see datasets.image_folder_dataset.load_dataset for details).
    split : float
        Specifies how much of the training set should be converted into the validation set.
    symbolic : bool
        Does not make a copy of the data, but only symbolic links to the original data
    test : bool
        If True, the validation set is split again (1:1) into a val and a test set. Default: False.

    Returns
    -------
        None
    """
    # Getting the train dir
    orig_dir = os.path.join(dataset_folder, 'train')

    # Rename the original train dir
    shutil.move(orig_dir, os.path.join(dataset_folder, 'original_train'))
    orig_dir = os.path.join(dataset_folder, 'original_train')

    # Sanity check on the training folder
    if not os.path.isdir(orig_dir):
        print("Train folder not found in the args.dataset_folder={}".format(
            dataset_folder))
        sys.exit(-1)

    # get the dataset splits
    path_data = os.path.join(orig_dir, "data")
    path_gt = os.path.join(orig_dir, "gt")

    file_names_data = sorted([
        f for f in os.listdir(path_data)
        if os.path.isfile(os.path.join(path_data, f))
    ])
    file_names_gt = sorted([
        f for f in os.listdir(path_gt)
        if os.path.isfile(os.path.join(path_gt, f))
    ])

    # Check data and ensure everything is cool
    assert len(file_names_data) == len(file_names_gt)
    for data, gt in zip(file_names_data, file_names_gt):
        # compare without the extension, which should be jpg for data and png for gt
        assert data[:-3] == gt[:-3]
        assert gt[-3:] == "png"

    # Split the data into two sets
    file_names = list(zip(file_names_data, file_names_gt))
    filenames_train, filenames_val, _, _ = train_test_split(file_names,
                                                            file_names,
                                                            test_size=split,
                                                            random_state=42)

    if test:
        # Split the data into two sets
        filenames_val, filenames_test, _, _ = train_test_split(filenames_val,
                                                               filenames_val,
                                                               test_size=0.5,
                                                               random_state=42)

    # Make output folders
    dataset_root = dataset_folder
    train_folder = os.path.join(dataset_root, 'train')
    val_folder = os.path.join(dataset_root, 'val')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(val_folder)

    if test:
        test_folder = os.path.join(dataset_root, 'test')
        make_folder_if_not_exists(test_folder)

    folders = ([train_folder, val_folder, test_folder] if test
               else [train_folder, val_folder])
    file_splits = ([filenames_train, filenames_val, filenames_test] if test
                   else [filenames_train, filenames_val])

    # Copying the splits into their folders
    for folder, split_files in zip(folders, file_splits):
        make_folder_if_not_exists(os.path.join(folder, 'data'))
        make_folder_if_not_exists(os.path.join(folder, 'gt'))

        for fdata, fgt in split_files:
            if symbolic:
                os.symlink(os.path.join(path_data, fdata),
                           os.path.join(folder, 'data', fdata))
                os.symlink(os.path.join(path_gt, fgt),
                           os.path.join(folder, 'gt', fgt))

            else:
                shutil.copy(os.path.join(path_data, fdata),
                            os.path.join(folder, 'data', fdata))
                shutil.copy(os.path.join(path_gt, fgt),
                            os.path.join(folder, 'gt', fgt))

    return
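A hedged usage sketch: once a download routine has produced <root>/train/{data,gt}, this call carves out a validation set, here via symbolic links (the path is hypothetical):

# Renames train/ to original_train/, then creates train/ and val/
# (plus test/ if test=True) with an 80/20 split.
split_dataset_segmentation(dataset_folder='/tmp/datasets/GlaS',
                           split=0.2,
                           symbolic=True)
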
def glas(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the tubule dataset (from the GlaS challenge) for semantic
    segmentation to the location specified on the file system

    See also: https://github.com/choosehappy/public/tree/master/DL%20tutorial%20Code/3-tubule

    Output folder structure: ../HisDB/GlaS/train
                             ../HisDB/GlaS/val
                             ../HisDB/GlaS/test

                             ../HisDB/GlaS/test/data -> images
                             ../HisDB/GlaS/test/gt   -> pixel-wise annotated ground truth

    Parameters
    ----------
    args : dict
        List of arguments necessary to run this routine. In particular, it is necessary to provide
        output_folder as a string containing the path where the dataset will be downloaded

    Returns
    -------
        None
    """
    def groupby_patient(list_to_group, index=3):
        """
        split images by patient
        :param list_to_group: list of image names
        :param index: position of split by '-' in the image name to obtain patient ID
        :return:  dictionary where keys are patient IDs and values are lists of images that are from that patient
        """
        return {
            '-'.join(filename.split('-')[:index]): [
                file for file in list_to_group
                if '-'.join(file.split('-')[:index]) == '-'.join(
                    filename.split('-')[:index])
            ]
            for filename in list_to_group
        }

    def convert_gt(img_path):
        # pil_loader returns a PIL image; convert to a numpy array first
        img = np.array(pil_loader(img_path))

        out_img = np.zeros(img.shape, dtype=np.uint8)
        out_img[:, :, 2] = 1  # set everything to background in blue channel
        out_img[:, :, 2][img[:, :, 2] != 0] = 2  # set glands to 2 in blue channel

        out = Image.fromarray(out_img)
        out.save(img_path)

    # make the root folder
    output_folder = args.output_folder
    dataset_root = os.path.join(output_folder, 'GlaS')
    make_folder_if_not_exists(dataset_root)

    # links to HisDB data sets
    link_tubules = urllib.parse.urlparse(
        'http://andrewjanowczyk.com/wp-static/tubule.tgz')

    download_path_tubules = os.path.join(
        dataset_root,
        link_tubules.geturl().rsplit('/', 1)[-1])

    # download files
    print('Downloading {}...'.format(link_tubules.geturl()))
    urllib.request.urlretrieve(link_tubules.geturl(), download_path_tubules)

    print('Download complete. Unpacking files...')

    # unpack tubule folder that contains images, annotations and text files with lists of benign and malignant samples
    tar_file = tarfile.open(download_path_tubules)
    tar_file.extractall(path=dataset_root)

    sets_dict = {}
    # 20 benign + 20 malignant images
    train_ids_b = [
        '09-1339-01', '09-16566-03', '09-21631-03', '09-23232-02',
        'm9_10741F-12T2N0', '10-13799-05'
    ]  # 4*5

    train_ids_m = [
        '09-322-02', '09-16566-02', '10-13799-06', '10-15247-02',
        'm6_10719 T3N2a', 'm17_1421 IE-11 T3N2a', 'm18_1421 IE-11 1-86',
        'm39_10-1273'
    ]  # 5*4

    sets_dict['train'] = train_ids_b + train_ids_m

    # validation has 29 images
    val_ids_b = ['10-12813-05', '10-13799-02',
                 'm2_10449-11E-T3N1b']  # 2*4 + 1 = 9

    val_ids_m = [
        '09-1339-02', '09-1339-05', '09-1646-01', '09-1646-02', '09-23757-01'
    ]  # 5*4 = 20

    sets_dict['val'] = val_ids_b + val_ids_m

    # the test set has 16 images, evenly split between benign and malignant

    test_ids_m = ['09-1646-03', '09-1646-05']  # 2*4 = 8
    test_ids_b = ['10-12813-01', '10-13799-01']  # 2*4 = 8

    sets_dict['test'] = test_ids_b + test_ids_m

    print('Splitting the dataset into train, val and test')
    for s in ['train', 'test', 'val']:
        make_folder_if_not_exists(os.path.join(dataset_root, s, 'gt'))
        make_folder_if_not_exists(os.path.join(dataset_root, s, 'data'))

        print('CREATING {} SET'.format(s))
        for patient in sets_dict[s]:
            for img_file in os.listdir(dataset_root):
                if patient in img_file:
                    if 'anno' in img_file:
                        # convert gt into correct data format
                        convert_gt(os.path.join(dataset_root, img_file))
                        out_file = os.path.join('gt',
                                                img_file.replace('_anno', ''))
                    else:
                        out_file = os.path.join('data', img_file)

                    shutil.move(os.path.join(dataset_root, img_file),
                                os.path.join(dataset_root, s, out_file))