Example #1
def image_consumer(socket, hdf5_file, num_expected, shuffle_seed=None,
                   offset=0):
    """Fill an HDF5 file with incoming images from a socket.

    Parameters
    ----------
    socket : :class:`zmq.Socket`
        PULL socket on which to receive images.
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write. Assumes `features`, `targets`
        and `filenames` already exist and have a first dimension of at
        least `num_expected + offset`.
    num_expected : int
        The number of items we expect to be sent over the socket.
    shuffle_seed : int or sequence, optional
        Seed for a NumPy random number generator that permutes the
        images on disk.
    offset : int, optional
        The offset in the HDF5 datasets at which to start writing
        received examples. Defaults to 0.

    """
    with progress_bar('images', maxval=num_expected) as pb:
        if shuffle_seed is None:
            index_gen = iter(xrange(num_expected))
        else:
            rng = numpy.random.RandomState(shuffle_seed)
            index_gen = iter(rng.permutation(num_expected))
        for i, num in enumerate(index_gen):
            image_filename, class_index = socket.recv_pyobj(zmq.SNDMORE)
            image_data = numpy.fromstring(socket.recv(), dtype='uint8')
            _write_to_hdf5(hdf5_file, num + offset, image_filename,
                           image_data, class_index)
            pb.update(i + 1)
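
The function above is the consumer end of a ZMQ push/pull pipeline. Below is a minimal, hypothetical driver sketch (the socket address, file name and dataset size are illustrative, not taken from the original source): it assumes a separate producer process that PUSHes, for every image, a pickled (filename, class_index) pair followed by the raw uint8 bytes, and an HDF5 file whose `features`, `targets` and `filenames` datasets have already been allocated.

import h5py
import zmq

# Hypothetical setup; image_consumer is the function defined above.
num_images = 1000
context = zmq.Context()
socket = context.socket(zmq.PULL)
socket.bind('tcp://*:5557')  # producers connect with PUSH sockets

with h5py.File('images.hdf5', mode='a') as hdf5_file:
    # 'features', 'targets' and 'filenames' must already exist and be at
    # least num_images rows long, as the docstring above requires.
    image_consumer(socket, hdf5_file, num_expected=num_images,
                   shuffle_seed=1234)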
Example #2
        def get_boxes(split):
            boxes = []
            with h5py.File(digit_struct_paths[split], 'r') as f:
                bar_name = '{} digitStruct'.format(split)
                bar_maxval = examples_per_split[split]
                with progress_bar(bar_name, bar_maxval) as bar:
                    for image_number in range(examples_per_split[split]):
                        # The 'digitStruct' group is the main group of the HDF5
                        # file. It contains two datasets: 'bbox' and 'name'.
                        # The 'name' dataset isn't of interest to us, as it
                        # stores file names and there's already a one-to-one
                        # mapping between row numbers and image names (e.g.
                        # row 0 corresponds to '1.png', row 1 corresponds to
                        # '2.png', and so on).
                        main_group = f['digitStruct']
                        # The 'bbox' dataset contains the bounding box and
                        # label information we're after. It has as many rows
                        # as there are images, and one column. Elements of the
                        # 'bbox' dataset are object references that point to
                        # (yet another) group that contains the information
                        # for the corresponding image.
                        image_reference = main_group['bbox'][image_number, 0]

                        # There are five datasets contained in that group:
                        # 'label', 'height', 'width', 'left' and 'top'. Each of
                        # those datasets has as many rows as there are bounding
                        # boxes in the corresponding image, and one column.
                        def get_dataset(name):
                            return main_group[image_reference][name][:, 0]

                        names = ('label', 'height', 'width', 'left', 'top')
                        datasets = dict([(name, get_dataset(name))
                                         for name in names])

                        # If there is only one bounding box, the information is
                        # stored directly in the datasets. If there are
                        # multiple bounding boxes, elements of those datasets
                        # are object references pointing to 1x1 datasets that
                        # store the information (fortunately, it's the last
                        # hop we need to make).
                        def get_elements(dataset):
                            if len(dataset) > 1:
                                return [
                                    int(main_group[reference][0, 0])
                                    for reference in dataset
                                ]
                            else:
                                return [int(dataset[0])]

                        # Names are pluralized in the BoundingBox named tuple.
                        kwargs = dict([(name + 's', get_elements(dataset))
                                       for name, dataset in iteritems(datasets)
                                       ])
                        boxes.append(BoundingBoxes(**kwargs))
                        if bar:
                            bar.update(image_number)
            return boxes
Example #3
def load_images(split, tar, basename, rows):
    image_list = []
    progress_bar_context = progress_bar(name='{} images'.format(split),
                                        maxval=len(rows),
                                        prefix='Converting')
    with progress_bar_context as bar:
        for i, row in enumerate(rows):
            image_list.append(loadImagePairFromRow(tar, basename, row))
            bar.update(i)
    return np.array(image_list)
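
All of these snippets rely on a `progress_bar` context manager that is taken as given (in Fuel it comes from the converter utilities). The following is a rough stand-in, meant only to illustrate the interface the examples assume, namely a context manager that yields an object with an `update(value)` method; it is not the library's implementation.

import sys
from contextlib import contextmanager


@contextmanager
def progress_bar(name, maxval, prefix='Converting'):
    """Rough stand-in for the progress_bar helper (illustrative only)."""
    class _Bar(object):
        def update(self, value):
            sys.stdout.write(
                '\r{} {}: {}/{}'.format(prefix, name, value, maxval))
            sys.stdout.flush()

    try:
        yield _Bar()
    finally:
        sys.stdout.write('\n')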
Example #4
def load_images(split, tar, basename, rows):
    image_list = []
    progress_bar_context = progress_bar(
        name='{} images'.format(split), maxval=len(rows),
        prefix='Converting')
    with progress_bar_context as bar:
        for i, row in enumerate(rows):
            image_list.append(loadImagePairFromRow(tar, basename, row))
            bar.update(i)
    return np.array(image_list)
Example #5
def convert_celeba_128(directory,
                       output_directory,
                       output_filename='celeba_128.hdf5'):
    """Converts the 128x128 version of the CelebA dataset to HDF5.

    This converter takes the aligned and cropped version of the
    CelebA dataset as input and produces a version that's been resized
    to 156x128 pixels and then center cropped to 128x128 pixels.

    Converts the CelebA dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CelebA`. The converted dataset is saved as
    'celeba_128.hdf5'.

    It assumes the existence of the following files:

    * `img_align_celeba.zip`
    * `list_attr_celeba.txt`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'celeba_128.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = _initialize_conversion(directory, output_path, (128, 128))

    features_dataset = h5file['features']
    image_file_path = os.path.join(directory, IMAGE_FILE)
    with zipfile.ZipFile(image_file_path, 'r') as image_file:
        with progress_bar('images', NUM_EXAMPLES) as bar:
            for i in range(NUM_EXAMPLES):
                image_name = 'img_align_celeba/{:06d}.jpg'.format(i + 1)

                image = Image.open(image_file.open(image_name, 'r')).resize(
                    (128, 128 + 7 * 4), Image.ANTIALIAS).crop(
                        (0, 7 * 2, 128, 128 + 7 * 2))

                features_dataset[i] = numpy.asarray(image).transpose(2, 0, 1)
                bar.update(i + 1)

    h5file.flush()
    h5file.close()

    return (output_path, )
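
The resize-and-crop arithmetic above is easy to misread, so here is the same geometry spelled out on a blank stand-in image: the 178x218 aligned frames are resized to 128x156 (width x height), then 14 rows are trimmed from both the top and the bottom to obtain the 128x128 center crop.

from PIL import Image

# Stand-in image with the aligned CelebA frame size (width 178, height 218).
im = Image.new('RGB', (178, 218))
resized = im.resize((128, 128 + 7 * 4))               # -> 128 x 156
cropped = resized.crop((0, 7 * 2, 128, 128 + 7 * 2))  # trim 14 px top/bottom
assert cropped.size == (128, 128)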
Example #6
        def get_boxes(split):
            boxes = []
            with h5py.File(digit_struct_paths[split], 'r') as f:
                bar_name = '{} digitStruct'.format(split)
                bar_maxval = examples_per_split[split]
                with progress_bar(bar_name, bar_maxval) as bar:
                    for image_number in range(examples_per_split[split]):
                        # The 'digitStruct' group is the main group of the HDF5
                        # file. It contains two datasets: 'bbox' and 'name'.
                        # The 'name' dataset isn't of interest to us, as it
                        # stores file names and there's already a one-to-one
                        # mapping between row numbers and image names (e.g.
                        # row 0 corresponds to '1.png', row 1 corresponds to
                        # '2.png', and so on).
                        main_group = f['digitStruct']
                        # The 'bbox' dataset contains the bounding box and
                        # label information we're after. It has as many rows
                        # as there are images, and one column. Elements of the
                        # 'bbox' dataset are object references that point to
                        # (yet another) group that contains the information
                        # for the corresponding image.
                        image_reference = main_group['bbox'][image_number, 0]

                        # There are five datasets contained in that group:
                        # 'label', 'height', 'width', 'left' and 'top'. Each of
                        # those datasets has as many rows as there are bounding
                        # boxes in the corresponding image, and one column.
                        def get_dataset(name):
                            return main_group[image_reference][name][:, 0]
                        names = ('label', 'height', 'width', 'left', 'top')
                        datasets = dict(
                            [(name, get_dataset(name)) for name in names])

                        # If there is only one bounding box, the information is
                        # stored directly in the datasets. If there are
                        # multiple bounding boxes, elements of those datasets
                        # are object references pointing to 1x1 datasets that
                        # store the information (fortunately, it's the last
                        # hop we need to make).
                        def get_elements(dataset):
                            if len(dataset) > 1:
                                return [int(main_group[reference][0, 0])
                                        for reference in dataset]
                            else:
                                return [int(dataset[0])]
                        # Names are pluralized in the BoundingBox named tuple.
                        kwargs = dict(
                            [(name + 's', get_elements(dataset))
                             for name, dataset in iteritems(datasets)])
                        boxes.append(BoundingBoxes(**kwargs))
                        if bar:
                            bar.update(image_number)
            return boxes
Example #7
 def extract_tar(split):
     with tarfile.open(file_paths[split], 'r:gz') as f:
         members = f.getmembers()
         num_examples = sum(1 for m in members if '.png' in m.name)
         progress_bar_context = progress_bar(
             name='{} file'.format(split), maxval=len(members),
             prefix='Extracting')
         with progress_bar_context as bar:
             for i, member in enumerate(members):
                 f.extract(member, path=TMPDIR)
                 bar.update(i)
     return num_examples
Example #8
def convert_celeba_64(directory, output_directory,
                      output_filename='celeba_64.hdf5'):
    """Converts the 64x64 version of the CelebA dataset to HDF5.

    This converter takes the aligned and cropped version of the
    CelebA dataset as input and produces a version that's been resized
    to 78x64 pixels and then center cropped to 64x64 pixels.

    Converts the CelebA dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CelebA`. The converted dataset is saved as
    'celeba_64.hdf5'.

    It assumes the existence of the following files:

    * `img_align_celeba.zip`
    * `list_attr_celeba.txt`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'celeba_64.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = _initialize_conversion(directory, output_path, (64, 64))

    features_dataset = h5file['features']
    image_file_path = os.path.join(directory, IMAGE_FILE)
    with zipfile.ZipFile(image_file_path, 'r') as image_file:
        with progress_bar('images', NUM_EXAMPLES) as bar:
            for i in range(NUM_EXAMPLES):
                image_name = 'img_align_celeba/{:06d}.jpg'.format(i + 1)
                image = Image.open(
                    image_file.open(image_name, 'r')).resize(
                        (64, 78), Image.ANTIALIAS).crop((0, 7, 64, 64 + 7))
                features_dataset[i] = numpy.asarray(image).transpose(2, 0, 1)
                bar.update(i + 1)

    h5file.flush()
    h5file.close()

    return (output_path,)
Example #9
def convert_celeba_aligned_cropped(directory,
                                   output_directory,
                                   output_filename=OUTPUT_FILENAME):
    """Converts the aligned and cropped CelebA dataset to HDF5.

    Converts the CelebA dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CelebA`. The converted dataset is saved as
    'celeba_aligned_cropped.hdf5'.

    It assumes the existence of the following files:

    * `img_align_celeba.zip`
    * `list_attr_celeba.txt`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to
        'celeba_aligned_cropped.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted
        dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = _initialize_conversion(directory, output_path, (218, 178))

    features_dataset = h5file['features']
    image_file_path = os.path.join(directory, IMAGE_FILE)
    with zipfile.ZipFile(image_file_path, 'r') as image_file:
        with progress_bar('images', NUM_EXAMPLES) as bar:
            for i in range(NUM_EXAMPLES):
                image_name = 'img_align_celeba/{:06d}.jpg'.format(i + 1)
                features_dataset[i] = numpy.asarray(
                    Image.open(image_file.open(image_name,
                                               'r'))).transpose(2, 0, 1)
                bar.update(i + 1)

    h5file.flush()
    h5file.close()

    return (output_path, )
Example #10
def convert_celeba_aligned_cropped(directory, output_directory,
                                   output_filename=OUTPUT_FILENAME):
    """Converts the aligned and cropped CelebA dataset to HDF5.

    Converts the CelebA dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CelebA`. The converted dataset is saved as
    'celeba_aligned_cropped.hdf5'.

    It assumes the existence of the following files:

    * `img_align_celeba.zip`
    * `list_attr_celeba.txt`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to
        'celeba_aligned_cropped.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted
        dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = _initialize_conversion(directory, output_path, (218, 178))

    features_dataset = h5file['features']
    image_file_path = os.path.join(directory, IMAGE_FILE)
    with zipfile.ZipFile(image_file_path, 'r') as image_file:
        with progress_bar('images', NUM_EXAMPLES) as bar:
            for i in range(NUM_EXAMPLES):
                image_name = 'img_align_celeba/{:06d}.jpg'.format(i + 1)
                features_dataset[i] = numpy.asarray(
                    Image.open(
                        image_file.open(image_name, 'r'))).transpose(2, 0, 1)
                bar.update(i + 1)

    h5file.flush()
    h5file.close()

    return (output_path,)
Example #11
def convert_camvid(directory, output_directory, output_filename='camvid.hdf5'):
    """Converts the camvid dataset to HDF5.

    Converts the CamVid dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.camvid`. The converted dataset is
    saved as 'camvid.hdf5'.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'camvid.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = _initialize_conversion(directory, output_path, (360, 480))
    image_file_path = os.path.join(directory, DATASET_FILE)

    features_dataset = h5file['features']
    targets_dataset = h5file['targets']
    with zipfile.ZipFile(image_file_path, 'r'):
        with progress_bar('images', NUM_EXAMPLES) as bar:
            for files in DATASET_FILES:
                open_file = open(files, 'r')
                for i, line in enumerate(open_file):
                    image_name, target_name = line.split()
                    image = Image.open(image_name[15:], 'r')
                    target = Image.open(target_name[15:], 'r')
                    features_dataset[i] = numpy.asarray(image).transpose(
                        2, 0, 1)
                    targets_dataset[i] = numpy.asarray(target)
                    bar.update(i + 1)

    h5file.flush()
    h5file.close()

    return (output_path, )
Example #12
def convert_svhn_format_1(directory,
                          output_directory,
                          output_filename='svhn_format_1.hdf5'):
    """Converts the SVHN dataset (format 1) to HDF5.

    This method assumes the existence of the files
    `{train,test,extra}.tar.gz`, which are accessible through the
    official website [SVHNSITE].

    .. [SVHNSITE] http://ufldl.stanford.edu/housenumbers/

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'svhn_format_1.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    try:
        output_path = os.path.join(output_directory, output_filename)
        h5file = h5py.File(output_path, mode='w')
        TMPDIR = tempfile.mkdtemp()

        # Every image has three channels (RGB) and variable height and width.
        # It features a variable number of bounding boxes that identify the
        # location and label of digits. The bounding box location is specified
        # using the x and y coordinates of its top left corner along with its
        # width and height.
        BoundingBoxes = namedtuple(
            'BoundingBoxes', ['labels', 'heights', 'widths', 'lefts', 'tops'])
        sources = ('features', ) + tuple('bbox_{}'.format(field)
                                         for field in BoundingBoxes._fields)
        source_dtypes = dict([(source, 'uint8') for source in sources[:2]] +
                             [(source, 'uint16') for source in sources[2:]])
        source_axis_labels = {
            'features': ('channel', 'height', 'width'),
            'bbox_labels': ('bounding_box', 'index'),
            'bbox_heights': ('bounding_box', 'height'),
            'bbox_widths': ('bounding_box', 'width'),
            'bbox_lefts': ('bounding_box', 'x'),
            'bbox_tops': ('bounding_box', 'y')
        }

        # The dataset is split into three sets: the training set, the test set
        # and an extra set of examples that are somewhat less difficult but
        # can be used as extra training data. These sets are stored separately
        # as 'train.tar.gz', 'test.tar.gz' and 'extra.tar.gz'. Each file
        # contains a directory named after the split it stores. The examples
        # are stored in that directory as PNG images. The directory also
        # contains a 'digitStruct.mat' file with all the bounding box and
        # label information.
        splits = ('train', 'test', 'extra')
        file_paths = dict(zip(splits, FORMAT_1_FILES))
        for split, path in file_paths.items():
            file_paths[split] = os.path.join(directory, path)
        digit_struct_paths = dict([(split,
                                    os.path.join(TMPDIR, split,
                                                 'digitStruct.mat'))
                                   for split in splits])

        # We first extract the data files in a temporary directory. While doing
        # that, we also count the number of examples for each split. Files are
        # extracted individually, which allows displaying a progress bar. Since
        # the splits will be concatenated in the HDF5 file, we also compute the
        # start and stop intervals of each split within the concatenated array.
        def extract_tar(split):
            with tarfile.open(file_paths[split], 'r:gz') as f:
                members = f.getmembers()
                num_examples = sum(1 for m in members if '.png' in m.name)
                progress_bar_context = progress_bar(
                    name='{} file'.format(split),
                    maxval=len(members),
                    prefix='Extracting')
                with progress_bar_context as bar:
                    for i, member in enumerate(members):
                        f.extract(member, path=TMPDIR)
                        bar.update(i)
            return num_examples

        examples_per_split = OrderedDict([(split, extract_tar(split))
                                          for split in splits])
        cumulative_num_examples = numpy.cumsum(
            [0] + list(examples_per_split.values()))
        num_examples = cumulative_num_examples[-1]
        intervals = zip(cumulative_num_examples[:-1],
                        cumulative_num_examples[1:])
        split_intervals = dict(zip(splits, intervals))

        # The start and stop indices are used to create a split dict that will
        # be parsed into the split array required by the H5PYDataset interface.
        # The split dict is organized as follows:
        #
        #     dict(split -> dict(source -> (start, stop)))
        #
        split_dict = OrderedDict([(split,
                                   OrderedDict([(s, split_intervals[split])
                                                for s in sources]))
                                  for split in splits])
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        # We then prepare the HDF5 dataset. This involves creating datasets to
        # store data sources and datasets to store auxiliary information
        # (namely the shapes for variable-length axes, and labels to indicate
        # what these variable-length axes represent).
        def make_vlen_dataset(source):
            # Create a variable-length 1D dataset
            dtype = h5py.special_dtype(vlen=numpy.dtype(source_dtypes[source]))
            dataset = h5file.create_dataset(source, (num_examples, ),
                                            dtype=dtype)
            # Create a dataset to store variable-length shapes.
            axis_labels = source_axis_labels[source]
            dataset_shapes = h5file.create_dataset(
                '{}_shapes'.format(source), (num_examples, len(axis_labels)),
                dtype='uint16')
            # Create a dataset to store labels for variable-length axes.
            dataset_vlen_axis_labels = h5file.create_dataset(
                '{}_vlen_axis_labels'.format(source), (len(axis_labels), ),
                dtype='S{}'.format(
                    numpy.max([len(label) for label in axis_labels])))
            # Fill variable-length axis labels
            dataset_vlen_axis_labels[...] = [
                label.encode('utf8') for label in axis_labels
            ]
            # Attach auxiliary datasets as dimension scales of the
            # variable-length 1D dataset. This is in accordance with the
            # H5PYDataset interface.
            dataset.dims.create_scale(dataset_shapes, 'shapes')
            dataset.dims[0].attach_scale(dataset_shapes)
            dataset.dims.create_scale(dataset_vlen_axis_labels, 'shape_labels')
            dataset.dims[0].attach_scale(dataset_vlen_axis_labels)
            # Tag fixed-length axis with its label
            dataset.dims[0].label = 'batch'

        for source in sources:
            make_vlen_dataset(source)

        # The "fun" part begins: we extract the bounding box and label
        # information contained in 'digitStruct.mat'. This is a version 7.3
        # Matlab file, which uses HDF5 under the hood, albeit with a very
        # convoluted layout.
        def get_boxes(split):
            boxes = []
            with h5py.File(digit_struct_paths[split], 'r') as f:
                bar_name = '{} digitStruct'.format(split)
                bar_maxval = examples_per_split[split]
                with progress_bar(bar_name, bar_maxval) as bar:
                    for image_number in range(examples_per_split[split]):
                        # The 'digitStruct' group is the main group of the HDF5
                        # file. It contains two datasets: 'bbox' and 'name'.
                        # The 'name' dataset isn't of interest to us, as it
                        # stores file names and there's already a one-to-one
                        # mapping between row numbers and image names (e.g.
                        # row 0 corresponds to '1.png', row 1 corresponds to
                        # '2.png', and so on).
                        main_group = f['digitStruct']
                        # The 'bbox' dataset contains the bounding box and
                        # label information we're after. It has as many rows
                        # as there are images, and one column. Elements of the
                        # 'bbox' dataset are object references that point to
                        # (yet another) group that contains the information
                        # for the corresponding image.
                        image_reference = main_group['bbox'][image_number, 0]

                        # There are five datasets contained in that group:
                        # 'label', 'height', 'width', 'left' and 'top'. Each of
                        # those datasets has as many rows as there are bounding
                        # boxes in the corresponding image, and one column.
                        def get_dataset(name):
                            return main_group[image_reference][name][:, 0]

                        names = ('label', 'height', 'width', 'left', 'top')
                        datasets = dict([(name, get_dataset(name))
                                         for name in names])

                        # If there is only one bounding box, the information is
                        # stored directly in the datasets. If there are
                        # multiple bounding boxes, elements of those datasets
                        # are object references pointing to 1x1 datasets that
                        # store the information (fortunately, it's the last
                        # hop we need to make).
                        def get_elements(dataset):
                            if len(dataset) > 1:
                                return [
                                    int(main_group[reference][0, 0])
                                    for reference in dataset
                                ]
                            else:
                                return [int(dataset[0])]

                        # Names are pluralized in the BoundingBox named tuple.
                        kwargs = dict([(name + 's', get_elements(dataset))
                                       for name, dataset in iteritems(datasets)
                                       ])
                        boxes.append(BoundingBoxes(**kwargs))
                        if bar:
                            bar.update(image_number)
            return boxes

        split_boxes = dict([(split, get_boxes(split)) for split in splits])

        # The final step is to fill the HDF5 file.
        def fill_split(split, bar=None):
            for image_number in range(examples_per_split[split]):
                image_path = os.path.join(TMPDIR, split,
                                          '{}.png'.format(image_number + 1))
                image = numpy.asarray(Image.open(image_path)).transpose(
                    2, 0, 1)
                bounding_boxes = split_boxes[split][image_number]
                num_boxes = len(bounding_boxes.labels)
                index = image_number + split_intervals[split][0]

                h5file['features'][index] = image.flatten()
                h5file['features'].dims[0]['shapes'][index] = image.shape
                for field in BoundingBoxes._fields:
                    name = 'bbox_{}'.format(field)
                    h5file[name][index] = numpy.maximum(
                        0, getattr(bounding_boxes, field))
                    h5file[name].dims[0]['shapes'][index] = [num_boxes, 1]

                # Replace label '10' with '0'.
                labels = h5file['bbox_labels'][index]
                labels[labels == 10] = 0
                h5file['bbox_labels'][index] = labels

                if image_number % 1000 == 0:
                    h5file.flush()
                if bar:
                    bar.update(index)

        with progress_bar('SVHN format 1', num_examples) as bar:
            for split in splits:
                fill_split(split, bar=bar)
    finally:
        if os.path.isdir(TMPDIR):
            shutil.rmtree(TMPDIR)
        h5file.flush()
        h5file.close()

    return (output_path, )
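
Because every source here is stored as a variable-length vector whose true shape is kept in an auxiliary `*_shapes` dataset (attached as the 'shapes' dimension scale), reading an example back means reshaping one with the other. A hedged read-back sketch, assuming the default output path used above:

import h5py

with h5py.File('svhn_format_1.hdf5', 'r') as f:
    flat = f['features'][0]                      # flat uint8 vector
    shape = f['features'].dims[0]['shapes'][0]   # (channel, height, width)
    image = flat.reshape(shape)
    labels = f['bbox_labels'][0]                 # one label per bounding box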
Example #13
def convert_svhn_format_1(directory, output_directory,
                          output_filename='svhn_format_1.hdf5'):
    """Converts the SVHN dataset (format 1) to HDF5.

    This method assumes the existence of the files
    `{train,test,extra}.tar.gz`, which are accessible through the
    official website [SVHNSITE].

    .. [SVHNSITE] http://ufldl.stanford.edu/housenumbers/

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'svhn_format_1.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    try:
        output_path = os.path.join(output_directory, output_filename)
        h5file = h5py.File(output_path, mode='w')
        TMPDIR = tempfile.mkdtemp()

        # Every image has three channels (RGB) and variable height and width.
        # It features a variable number of bounding boxes that identify the
        # location and label of digits. The bounding box location is specified
        # using the x and y coordinates of its top left corner along with its
        # width and height.
        BoundingBoxes = namedtuple(
            'BoundingBoxes', ['labels', 'heights', 'widths', 'lefts', 'tops'])
        sources = ('features',) + tuple('bbox_{}'.format(field)
                                        for field in BoundingBoxes._fields)
        source_dtypes = dict([(source, 'uint8') for source in sources[:2]] +
                             [(source, 'uint16') for source in sources[2:]])
        source_axis_labels = {
            'features': ('channel', 'height', 'width'),
            'bbox_labels': ('bounding_box', 'index'),
            'bbox_heights': ('bounding_box', 'height'),
            'bbox_widths': ('bounding_box', 'width'),
            'bbox_lefts': ('bounding_box', 'x'),
            'bbox_tops': ('bounding_box', 'y')}

        # The dataset is split into three sets: the training set, the test set
        # and an extra set of examples that are somewhat less difficult but
        # can be used as extra training data. These sets are stored separately
        # as 'train.tar.gz', 'test.tar.gz' and 'extra.tar.gz'. Each file
        # contains a directory named after the split it stores. The examples
        # are stored in that directory as PNG images. The directory also
        # contains a 'digitStruct.mat' file with all the bounding box and
        # label information.
        splits = ('train', 'test', 'extra')
        file_paths = dict(zip(splits, FORMAT_1_FILES))
        for split, path in file_paths.items():
            file_paths[split] = os.path.join(directory, path)
        digit_struct_paths = dict(
            [(split, os.path.join(TMPDIR, split, 'digitStruct.mat'))
             for split in splits])

        # We first extract the data files in a temporary directory. While doing
        # that, we also count the number of examples for each split. Files are
        # extracted individually, which allows displaying a progress bar. Since
        # the splits will be concatenated in the HDF5 file, we also compute the
        # start and stop intervals of each split within the concatenated array.
        def extract_tar(split):
            with tarfile.open(file_paths[split], 'r:gz') as f:
                members = f.getmembers()
                num_examples = sum(1 for m in members if '.png' in m.name)
                progress_bar_context = progress_bar(
                    name='{} file'.format(split), maxval=len(members),
                    prefix='Extracting')
                with progress_bar_context as bar:
                    for i, member in enumerate(members):
                        f.extract(member, path=TMPDIR)
                        bar.update(i)
            return num_examples

        examples_per_split = OrderedDict(
            [(split, extract_tar(split)) for split in splits])
        cumulative_num_examples = numpy.cumsum(
            [0] + list(examples_per_split.values()))
        num_examples = cumulative_num_examples[-1]
        intervals = zip(cumulative_num_examples[:-1],
                        cumulative_num_examples[1:])
        split_intervals = dict(zip(splits, intervals))

        # The start and stop indices are used to create a split dict that will
        # be parsed into the split array required by the H5PYDataset interface.
        # The split dict is organized as follows:
        #
        #     dict(split -> dict(source -> (start, stop)))
        #
        split_dict = OrderedDict([
            (split, OrderedDict([(s, split_intervals[split])
                                 for s in sources]))
            for split in splits])
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        # We then prepare the HDF5 dataset. This involves creating datasets to
        # store data sources and datasets to store auxiliary information
        # (namely the shapes for variable-length axes, and labels to indicate
        # what these variable-length axes represent).
        def make_vlen_dataset(source):
            # Create a variable-length 1D dataset
            dtype = h5py.special_dtype(vlen=numpy.dtype(source_dtypes[source]))
            dataset = h5file.create_dataset(
                source, (num_examples,), dtype=dtype)
            # Create a dataset to store variable-length shapes.
            axis_labels = source_axis_labels[source]
            dataset_shapes = h5file.create_dataset(
                '{}_shapes'.format(source), (num_examples, len(axis_labels)),
                dtype='uint16')
            # Create a dataset to store labels for variable-length axes.
            dataset_vlen_axis_labels = h5file.create_dataset(
                '{}_vlen_axis_labels'.format(source), (len(axis_labels),),
                dtype='S{}'.format(
                    numpy.max([len(label) for label in axis_labels])))
            # Fill variable-length axis labels
            dataset_vlen_axis_labels[...] = [
                label.encode('utf8') for label in axis_labels]
            # Attach auxiliary datasets as dimension scales of the
            # variable-length 1D dataset. This is in accordance with the
            # H5PYDataset interface.
            dataset.dims.create_scale(dataset_shapes, 'shapes')
            dataset.dims[0].attach_scale(dataset_shapes)
            dataset.dims.create_scale(dataset_vlen_axis_labels, 'shape_labels')
            dataset.dims[0].attach_scale(dataset_vlen_axis_labels)
            # Tag fixed-length axis with its label
            dataset.dims[0].label = 'batch'

        for source in sources:
            make_vlen_dataset(source)

        # The "fun" part begins: we extract the bounding box and label
        # information contained in 'digitStruct.mat'. This is a version 7.3
        # Matlab file, which uses HDF5 under the hood, albeit with a very
        # convoluted layout.
        def get_boxes(split):
            boxes = []
            with h5py.File(digit_struct_paths[split], 'r') as f:
                bar_name = '{} digitStruct'.format(split)
                bar_maxval = examples_per_split[split]
                with progress_bar(bar_name, bar_maxval) as bar:
                    for image_number in range(examples_per_split[split]):
                        # The 'digitStruct' group is the main group of the HDF5
                        # file. It contains two datasets: 'bbox' and 'name'.
                        # The 'name' dataset isn't of interest to us, as it
                        # stores file names and there's already a one-to-one
                        # mapping between row numbers and image names (e.g.
                        # row 0 corresponds to '1.png', row 1 corresponds to
                        # '2.png', and so on).
                        main_group = f['digitStruct']
                        # The 'bbox' dataset contains the bounding box and
                        # label information we're after. It has as many rows
                        # as there are images, and one column. Elements of the
                        # 'bbox' dataset are object references that point to
                        # (yet another) group that contains the information
                        # for the corresponding image.
                        image_reference = main_group['bbox'][image_number, 0]

                        # There are five datasets contained in that group:
                        # 'label', 'height', 'width', 'left' and 'top'. Each of
                        # those datasets has as many rows as there are bounding
                        # boxes in the corresponding image, and one column.
                        def get_dataset(name):
                            return main_group[image_reference][name][:, 0]
                        names = ('label', 'height', 'width', 'left', 'top')
                        datasets = dict(
                            [(name, get_dataset(name)) for name in names])

                        # If there is only one bounding box, the information is
                        # stored directly in the datasets. If there are
                        # multiple bounding boxes, elements of those datasets
                        # are object references pointing to 1x1 datasets that
                        # store the information (fortunately, it's the last
                        # hop we need to make).
                        def get_elements(dataset):
                            if len(dataset) > 1:
                                return [int(main_group[reference][0, 0])
                                        for reference in dataset]
                            else:
                                return [int(dataset[0])]
                        # Names are pluralized in the BoundingBox named tuple.
                        kwargs = dict(
                            [(name + 's', get_elements(dataset))
                             for name, dataset in iteritems(datasets)])
                        boxes.append(BoundingBoxes(**kwargs))
                        if bar:
                            bar.update(image_number)
            return boxes

        split_boxes = dict([(split, get_boxes(split)) for split in splits])

        # The final step is to fill the HDF5 file.
        def fill_split(split, bar=None):
            for image_number in range(examples_per_split[split]):
                image_path = os.path.join(
                    TMPDIR, split, '{}.png'.format(image_number + 1))
                image = numpy.asarray(
                    Image.open(image_path)).transpose(2, 0, 1)
                bounding_boxes = split_boxes[split][image_number]
                num_boxes = len(bounding_boxes.labels)
                index = image_number + split_intervals[split][0]

                h5file['features'][index] = image.flatten()
                h5file['features'].dims[0]['shapes'][index] = image.shape
                for field in BoundingBoxes._fields:
                    name = 'bbox_{}'.format(field)
                    h5file[name][index] = getattr(bounding_boxes, field)
                    h5file[name].dims[0]['shapes'][index] = [num_boxes, 1]

                # Replace label '10' with '0'.
                labels = h5file['bbox_labels'][index]
                labels[labels == 10] = 0
                h5file['bbox_labels'][index] = labels

                if image_number % 1000 == 0:
                    h5file.flush()
                if bar:
                    bar.update(index)

        with progress_bar('SVHN format 1', num_examples) as bar:
            for split in splits:
                fill_split(split, bar=bar)
    finally:
        if os.path.isdir(TMPDIR):
            shutil.rmtree(TMPDIR)
        h5file.flush()
        h5file.close()

    return (output_path,)
Example #14
#       output_case.append(d[0])
#   output.append(output_case)
#   return output
index_list        = []
index             = 1
index_list.append(index)
images_output     = []
multiplier_output = []
cases_output      = []
output_ind        = []
sax_indexes       = []
sax_indexes_tmp   = []
positions = []
positions_tmp = []
i = 0
with progress_bar('train', n_examples_train) as bar:
    for sequence in train_features:
        stri          = sequence[0]
        m             = re.search('train/(.+?)/study', stri)
        case_index    = int(m.group(1))
        if case_index != index:
            sax_indexes.append(list(numpy.unique(numpy.array(sax_indexes_tmp))))
            cases_output.append(index)
            images_output.append(output_ind)
            output_ind   = []
            sax_indexes_tmp = []
            positions.append(positions_tmp)
            positions_tmp = []
            index       = case_index
            index_list.append(index)
            multiplier_output.append(multiplier)
Example #15
def convert_dogs_vs_cats(directory,
                         output_directory,
                         output_filename='dogs_vs_cats.hdf5'):
    """Converts the Dogs vs. Cats dataset to HDF5.

    Converts the Dogs vs. Cats dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.dogs_vs_cats`. The converted dataset is saved as
    'dogs_vs_cats.hdf5'.

    It assumes the existence of the following files:

    * `dogs_vs_cats.train.zip`
    * `dogs_vs_cats.test1.zip`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'dogs_vs_cats.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    # Prepare output file
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf_features = h5file.create_dataset('image_features', (37500, ),
                                         dtype=dtype)
    hdf_shapes = h5file.create_dataset('image_features_shapes', (37500, 3),
                                       dtype='int32')
    hdf_labels = h5file.create_dataset('targets', (37500, 1), dtype='uint8')

    # Attach shape annotations and scales
    hdf_features.dims.create_scale(hdf_shapes, 'shapes')
    hdf_features.dims[0].attach_scale(hdf_shapes)

    hdf_shapes_labels = h5file.create_dataset('image_features_shapes_labels',
                                              (3, ),
                                              dtype='S7')
    hdf_shapes_labels[...] = [
        'channel'.encode('utf8'), 'height'.encode('utf8'),
        'width'.encode('utf8')
    ]
    hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
    hdf_features.dims[0].attach_scale(hdf_shapes_labels)

    # Add axis annotations
    hdf_features.dims[0].label = 'batch'
    hdf_labels.dims[0].label = 'batch'
    hdf_labels.dims[1].label = 'index'

    # Convert
    i = 0
    for split, split_size in zip([TRAIN, TEST], [25000, 12500]):
        # Open the ZIP file
        filename = os.path.join(directory, split)
        zip_file = zipfile.ZipFile(filename, 'r')
        image_names = zip_file.namelist()[1:]  # Discard the directory name

        # Shuffle the examples
        rng = numpy.random.RandomState(123522)
        rng.shuffle(image_names)

        # Convert from JPEG to NumPy arrays
        with progress_bar(filename, split_size) as bar:
            for image_name in image_names:
                # Save image
                image = numpy.array(Image.open(zip_file.open(image_name)))
                image = image.transpose(2, 0, 1)
                hdf_features[i] = image.flatten()
                hdf_shapes[i] = image.shape

                # Cats are 0, Dogs are 1
                hdf_labels[i] = 0 if 'cat' in image_name else 1

                # Update progress
                i += 1
                bar.update(i if split == TRAIN else i - 25000)

    # Add the labels
    split_dict = {}
    sources = ['image_features', 'targets']
    for name, slice_ in zip(['train', 'test'], [(0, 25000), (25000, 37500)]):
        split_dict[name] = dict(zip(sources, [slice_] * len(sources)))
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    h5file.flush()
    h5file.close()

    return (output_path, )
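
The converted file is meant to be consumed through Fuel's `H5PYDataset` interface. A hedged usage sketch follows; the keyword names reflect my reading of that API and should be checked against the installed Fuel version, and the path assumes the default output filename above.

from fuel.datasets.hdf5 import H5PYDataset

train_set = H5PYDataset('dogs_vs_cats.hdf5', which_sets=('train',))
handle = train_set.open()
# Sources come back in alphabetical order: ('image_features', 'targets').
images, targets = train_set.get_data(handle, request=slice(0, 4))
train_set.close(handle)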
Example #16
hdf_images.dims[1].label = 'height'
hdf_images.dims[2].label = 'width'
hdf_images.dims[3].label = 'channels'
hdf_driver_id.dims[0].label = 'batch'
hdf_labels.dims[0].label = 'batch'

# Create matching for creating valid and train set
validation_set_indexes_list = list(validation_set_indexes)
indexes = range(n_examples_train)
for ind in validation_set_indexes_list:
    indexes.remove(ind)
indexes = indexes + validation_set_indexes_list
assert (len(indexes) == n_examples_train)

# build hdf5 train and submit
with progress_bar('train', n_examples_train) as bar:
    for j in range(n_examples_train):
        hdf_images[j] = numpy.rollaxis(X_train[indexes[j]], 2, 0)
        hdf_labels[j] = y_train[indexes[j]]
        hdf_driver_id[j] = int(driver_id[indexes[j]][1:])
        bar.update(j)

with progress_bar('submit', n_examples_submit) as bar:
    for j in range(n_examples_submit):
        hdf_images[n_examples_train + j] = numpy.rollaxis(X_test[j], 2, 0)
        hdf_driver_id[n_examples_train + j] = int(
            X_test_id[j].split('_')[1][:-4])
        bar.update(j)

# Save hdf5 train and submit
split_dict = {}
Example #17
hdf_labels.dims.create_scale(hdf_shapes_labels, 'shape_labels')
hdf_labels.dims[0].attach_scale(hdf_shapes_labels)

# Add axis annotations
hdf_features.dims[0].label = 'batch'
hdf_labels.dims[0].label   = 'batch'
hdf_cases.dims[0].label    = 'batch'
hdf_cases.dims[1].label    = 'index'
hdf_mult.dims[0].label     = 'batch'
hdf_mult.dims[1].label     = 'index'

### loading train
i = 0

with progress_bar('train_data ', n_train) as bar:
  for c in cases_train:
    [d,l], m = get_data(train_contour_path, train_img_path, c)
    train_images    = numpy.array(d)
    label_images    = numpy.array(l)
    assert(train_images.shape == label_images.shape)
    hdf_shapes[i]   = train_images.shape
    hdf_features[i] = train_images.flatten().astype(numpy.dtype('uint16'))
    hdf_mult[i]     = m
    hdf_labels[i]   = label_images.flatten().astype(numpy.dtype('uint16'))
    hdf_cases[i]    = i
    i += 1
    bar.update(i)

with progress_bar('online_data ', n_online) as bar:
  for c in cases_online:
Example #18
def convert_dogs_vs_cats(directory, output_directory,
                         output_filename='dogs_vs_cats.hdf5'):
    """Converts the Dogs vs. Cats dataset to HDF5.

    Converts the Dogs vs. Cats dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.dogs_vs_cats`. The converted dataset is saved as
    'dogs_vs_cats.hdf5'.

    It assumes the existence of the following files:

    * `dogs_vs_cats.train.zip`
    * `dogs_vs_cats.test1.zip`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'dogs_vs_cats.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    # Prepare output file
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf_features = h5file.create_dataset('image_features', (37500,),
                                         dtype=dtype)
    hdf_shapes = h5file.create_dataset('image_features_shapes', (37500, 3),
                                       dtype='int32')
    hdf_labels = h5file.create_dataset('targets', (37500, 1), dtype='uint8')

    # Attach shape annotations and scales
    hdf_features.dims.create_scale(hdf_shapes, 'shapes')
    hdf_features.dims[0].attach_scale(hdf_shapes)

    hdf_shapes_labels = h5file.create_dataset('image_features_shapes_labels',
                                              (3,), dtype='S7')
    hdf_shapes_labels[...] = ['channel'.encode('utf8'),
                              'height'.encode('utf8'),
                              'width'.encode('utf8')]
    hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
    hdf_features.dims[0].attach_scale(hdf_shapes_labels)

    # Add axis annotations
    hdf_features.dims[0].label = 'batch'
    hdf_labels.dims[0].label = 'batch'
    hdf_labels.dims[1].label = 'index'

    # Convert
    i = 0
    for split, split_size in zip([TRAIN, TEST], [25000, 12500]):
        # Open the ZIP file
        filename = os.path.join(directory, split)
        zip_file = zipfile.ZipFile(filename, 'r')
        image_names = zip_file.namelist()[1:]  # Discard the directory name

        # Shuffle the examples
        rng = numpy.random.RandomState(123522)
        rng.shuffle(image_names)

        # Convert from JPEG to NumPy arrays
        with progress_bar(filename, split_size) as bar:
            for image_name in image_names:
                # Save image
                image = numpy.array(Image.open(zip_file.open(image_name)))
                image = image.transpose(2, 0, 1)
                hdf_features[i] = image.flatten()
                hdf_shapes[i] = image.shape

                # Cats are 0, Dogs are 1
                hdf_labels[i] = 0 if 'cat' in image_name else 1

                # Update progress
                i += 1
                bar.update(i if split == TRAIN else i - 25000)

    # Add the labels
    split_dict = {}
    sources = ['image_features', 'targets']
    for name, slice_ in zip(['train', 'test'],
                            [(0, 25000), (25000, 37500)]):
        split_dict[name] = dict(zip(sources, [slice_] * len(sources)))
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    h5file.flush()
    h5file.close()

    return (output_path,)
Example #19
output_path = basepath + 'data_tiff.hdf5'
h5file = h5py.File(output_path, mode='w')

hdf_features                = h5file.create_dataset('features', (nb_test_examples + nb_training_examples, nb_channels, length, width), dtype='int32')
hdf_labels                  = h5file.create_dataset('labels', (nb_test_examples + nb_training_examples, nb_labels), dtype='int8')
hdf_names                   = h5file.create_dataset('image_name', (nb_test_examples + nb_training_examples,), dtype='S20')

hdf_features.dims[0].label    = 'batch'
hdf_features.dims[2].label    = 'height'
hdf_features.dims[3].label    = 'width'
hdf_features.dims[1].label    = 'channels'
hdf_labels.dims[0].label      = 'batch'
hdf_labels.dims[1].label      = 'labels'

maximums = np.zeros(4)
with progress_bar('train', nb_training_examples) as bar:

    with open(basepath + 'train_v2.csv', 'rb') as csvfile:
        spamreader = csv.reader(csvfile)
        count      = 0
        for row in spamreader:
            if row[0].startswith('train_'):
                train_idx           = int(row[0].split('_')[-1])
                p                   = glob.glob(basepath + 'train-tif-v2/train_{0}.tif'.format(train_idx))
                hdf_names[count]    = 'train_{0}.tif'.format(train_idx)
                # tiff image
                img                 = io.imread(p[0])
                rescaleIMG          = np.reshape(img[:,:,-1], (-1, 1))
                rescaleIMG          = scaler.fit_transform(rescaleIMG.astype(np.float32))
                img_scaled          = (np.reshape(rescaleIMG, img[:,:,-1].shape)).astype(np.uint8)
                #img                 = np.moveaxis(img_scaled[:,:,:nb_channels], 2, 0)[np.asarray([2,1,0,3])] # move channel axis + rgb
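
The snippet is truncated here in the source listing; the scaler it applies to
each band is defined earlier in the original script and is not shown. A
minimal, purely hypothetical sketch of a compatible definition, assuming
scikit-learn's MinMaxScaler mapping each band into the 0-255 range used for
the uint8 cast above:

from sklearn.preprocessing import MinMaxScaler

# Hypothetical definition consistent with the rescaling above: map each band
# into [0, 255] before casting to uint8.
scaler = MinMaxScaler(feature_range=(0, 255))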
Beispiel #22
0
hdf_features_nocar_int.dims[0].label = 'batch'
hdf_features_nocar_int.dims[1].label = 'features_nocar_int'
hdf_labels.dims[0].label = 'batch'
hdf_labels.dims[1].label = 'labels'
hdf_cp.dims[0].label = 'batch'
hdf_cp.dims[1].label = 'codepostal'
hdf_hascar.dims[0].label = 'batch'
hdf_hascar.dims[1].label = 'hascar'

missing_codepostaux = []
missing_departements = []

for set_label, data in [('train', data_train), ('submit', data_submit)]:
    start_i = 0 if set_label == 'train' else len(data_train)

    with progress_bar(set_label, len(data)) as bar:
        for i, row in data.iterrows():
            # does this person have a car?
            has_car = row['marque'] != 'NR'
            hdf_hascar[start_i + i] = 1 if has_car else 0

            # categorical features
            feature_onehot_car_cat = numpy.zeros(total_uniques_car)
            feature_onehot_nocar_cat = numpy.zeros(total_uniques_nocar)
            for column_name in list_categorical:
                try:
                    if column_name in list_car:
                        feature_onehot_car_cat[
                            uniques_startindex_car[column_name] +
                            unique_to_index[column_name][row[column_name]]] = 1
                    else:
Beispiel #23
0
hdf_features.dims[0].label = 'batch'
hdf_features.dims[1].label = 'channel'
hdf_features.dims[2].label = 'height'
hdf_features.dims[3].label = 'width'
hdf_labels.dims[0].label = 'batch'
hdf_labels.dims[1].label = 'channel'
hdf_labels.dims[2].label = 'height'
hdf_labels.dims[3].label = 'width'

# Indices of the upper-body joints used when building the train and valid sets
r = range(0, 8) + range(14, 20) + [26]

# build hdf5 train and submit
with progress_bar('train', n_examples_train) as bar:
    for j in range(n_examples_train):
        assert (re.findall(r'\d+', features_train_files[j]) == re.findall(
            r'\d+', labels_train_files[j]))
        hdf_features[j] = np.rollaxis(get_im_cv2(features_train_files[j]), 2)
        hdf_labels[j] = np.rollaxis(loadmat(labels_train_files[j])['map'],
                                    2)[r]
        bar.update(j)

with progress_bar('valid', n_examples_valid) as bar:
    for j in range(n_examples_valid):
        assert (re.findall(r'\d+', features_val_files[j]) == re.findall(
            r'\d+', labels_val_files[j]))
        hdf_features[n_examples_train + j] = np.rollaxis(
            get_im_cv2(features_val_files[j]), 2)
        hdf_labels[n_examples_train + j] = np.rollaxis(
Beispiel #24
0
def convert_captcha(directory, output_directory,
                         output_filename='captcha.hdf5'):
    """Converts captcha dataset to HDF5.

    Converts captcha to an HDF5 dataset compatible with
    :class:`fuel.datasets.captcha`.
    The converted dataset is saved as 'captcha.hdf5'.

    It assumes the existence of the file:
        ./captcha/ans.txt
    which lists the example tags and image paths.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'captcha.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    # Prepare input
    all_example_paths = get_example_list(
        os.path.join(directory, 'captcha', 'ans.txt'))
    split = "all"
    split_size = len(all_example_paths)
    
    # Prepare output file
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf_features = h5file.create_dataset('image_features', (split_size,),
                                         dtype=dtype)
    hdf_shapes = h5file.create_dataset('image_features_shapes', (split_size, 3),
                                       dtype='int32')
    hdf_targets = h5file.create_dataset('targets', (split_size,),
                                        dtype=h5py.special_dtype(vlen=bytes))
    hdf_targets_shapes = h5file.create_dataset('hdf_targets_shapes',
                                               (split_size, 1), dtype='int32')
    # Attach shape annotations and scales
    hdf_features.dims.create_scale(hdf_shapes, 'shapes')
    hdf_features.dims[0].attach_scale(hdf_shapes)

    hdf_shapes_labels = h5file.create_dataset('image_features_shapes_labels',
                                              (3,), dtype='S7')
    hdf_shapes_labels[...] = ['channel'.encode('utf8'),
                              'height'.encode('utf8'),
                              'width'.encode('utf8')]
    hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
    hdf_features.dims[0].attach_scale(hdf_shapes_labels)

    hdf_targets.dims.create_scale(hdf_targets_shapes, 'targets_shapes')
    hdf_targets.dims[0].attach_scale(hdf_targets_shapes)

    # Add axis annotations
    hdf_features.dims[0].label = 'batch'

    hdf_targets.dims[0].label = 'batch'
    #hdf_targets.dims[1].label = 'index'



    # Shuffle the examples
    rng = numpy.random.RandomState(123522)
    rng.shuffle(all_example_paths)

    # Convert from JPEG to NumPy arrays
    with progress_bar(split, split_size) as bar:
        i = 0
        for tag, image_path in all_example_paths:
            # Save image
            image = numpy.array(Image.open(image_path))
            # (height, width, channel) to (channel, height, width)
            image = image.transpose(2, 0, 1)
            print image.shape
            hdf_features[i] = image.flatten()
            hdf_shapes[i] = image.shape

            # Get the target
            textline = tag
            print textline
            hdf_targets[i] = textline
            hdf_targets_shapes[i] = len(textline)

            # Update progress
            i += 1
            bar.update(i)

    # Add the labels
    split_dict = {}
    sources = ['image_features', 'targets']
    split_dict['all'] = dict(zip(sources, [(0, split_size)] * 2))
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)


    h5file.flush()
    h5file.close()

    return (output_path,)
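
Because 'image_features' is a variable-length dataset holding flattened
images, the attached 'image_features_shapes' scale is needed to restore each
example. A minimal h5py sketch of reading one example back from the file
produced above (assuming the default output name 'captcha.hdf5'):

import h5py

with h5py.File('captcha.hdf5', 'r') as f:
    flat = f['image_features'][0]                 # flattened uint8 pixels
    shape = tuple(f['image_features_shapes'][0])  # (channel, height, width)
    image = flat.reshape(shape)
    tag = f['targets'][0]                         # the text label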
Beispiel #25
0
def convert_jpgtgz(target, onlytarget, directory, output_directory,
                 output_filename=None):
    """Converts jpg tar.gz dataset to HDF5.

    Converts a jpg tar.gz dataset to an HDF5 dataset.

    Parameters
    ----------
    target : bool
        Also add a `targets` source to the file. The targets are computed
        from train/test.target.csv. Each image receives two values:
        [0] is the target value and [1] is a mask bit indicating whether a
        target is defined for the image at all.
    onlytarget : bool
        Same as `target`, but images without a defined target value are
        skipped; no mask is stored.
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'jpg.hdf5'

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    dotarget = target or onlytarget
    # With onlytarget there is just one value per example; otherwise there are
    # two: [0] the target and [1] a mask saying whether the target is valid.
    target_dim = 1 if onlytarget else 2
    if not output_filename:
        output_filename = 'jpg.hdf5'

    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    try:
        TMPDIR = tempfile.mkdtemp()

        sources = ('features','targets') if dotarget else ('features',)
        source_dtypes = dict([(source, 'uint8') for source in sources])
        source_axis_labels = {
            'features': ('channel', 'height', 'width'),
            'targets': ('index',),
        }

        splits = ('train','test')
        file_paths = dict(zip(splits, FORMAT_1_FILES))
        for split, path in file_paths.items():
            file_paths[split] = os.path.join(directory, path)

        target_paths = dict(zip(splits, TARGET_FILES))
        for split, path in target_paths.items():
            target_paths[split] = os.path.join(directory, path)

        split_targets = {}
        for split in splits:
            try:
                targets = pd.read_csv(target_paths[split], index_col='name')
            except Exception:
                targets = None
            split_targets[split] = targets

        def get_target(image_path, split):
            try:
                root_basename = os.path.splitext(os.path.basename(image_path))[0]
                target = split_targets[split].loc[root_basename].target
                mask = 1
            except Exception:
                target = 0
                mask = 0
            return target, mask


        # We first extract the data files in a temporary directory. While doing
        # that, we also count the number of examples for each split. Files are
        # extracted individually, which allows to display a progress bar. Since
        # the splits will be concatenated in the HDF5 file, we also compute the
        # start and stop intervals of each split within the concatenated array.
        checksums = set([])
        def extract_tar(split):
            num_examples = 0
            path = file_paths[split]
            if os.path.isfile(path):
                with tarfile.open(path, 'r:gz') as f:
                    members = f.getmembers()
                    progress_bar_context = progress_bar(
                        name='{} file'.format(split), maxval=len(members),
                        prefix='Extracting')
                    with progress_bar_context as bar:
                        for i, member in enumerate(members):
                            if ((member.name.endswith('.jpg') and not
                            os.path.basename(member.name).startswith('.'))):
                                f.extract(member, path=os.path.join(TMPDIR,split))
                                num_examples += 1
                            bar.update(i)
                DIR = TMPDIR
            elif os.path.isdir(path):
                print("reading DIRECTORY %s"%path)
                DIR = path
                for root, dirs, files in os.walk(path):
                    for file in files:
                        if file.endswith('.jpg') and not file.startswith('.'):
                            num_examples += 1
            else:
                print("No file or directory named %s"%path)
                return [], None
            print('#files=%d'%num_examples)

            jpgfiles = []
            progress_bar_context = progress_bar(
                name='{} file'.format(split), maxval=num_examples,
                prefix='Validating')
            num_examples = 0
            bad_examples = 0
            duplicate_examples = 0
            count = 0
            errors = 0
            shape = None  # all images must have the same shape
            with progress_bar_context as bar:
                for root, dirs, files in os.walk(os.path.join(DIR,split)):
                    for file in files:
                        if file.endswith('.jpg') and not file.startswith('.'):
                            image_path = os.path.join(root, file)
                            count += 1
                            try:
                                im = Image.open(image_path)
                                im = numpy.asarray(im)
                                m = hashlib.md5()
                                m.update(im)
                                h = m.hexdigest()

                                if shape is None:
                                    shape = im.shape
                                if im.shape != shape:
                                    bad_examples += 1
                                    os.remove(image_path)
                                elif h in checksums:
                                    duplicate_examples += 1
                                    os.remove(image_path)
                                else:
                                    checksums.add(h)
                                    num_examples += 1
                                    jpgfiles.append(image_path)
                            except Exception:
                                errors += 1
                            bar.update(count)
            print('count=%d bad=%d dup=%d good=%d errors=%d'%(
                count, bad_examples, duplicate_examples,
                num_examples, errors))

            if onlytarget:
                jpgfiles = [p for p in jpgfiles
                            if get_target(p, split)[1]]
            return jpgfiles, shape

        examples_per_split = OrderedDict(
            [(split, extract_tar(split)) for split in splits])
        cumulative_num_examples = numpy.cumsum(
            [0] + list(map(lambda x: len(x[0]),examples_per_split.values())))
        num_examples = cumulative_num_examples[-1]
        intervals = zip(cumulative_num_examples[:-1],
                        cumulative_num_examples[1:])
        split_intervals = dict(zip(splits, intervals))
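        # For instance, with 1000 train and 200 test images (hypothetical
        # counts), cumulative_num_examples is [0, 1000, 1200], so
        # split_intervals is {'train': (0, 1000), 'test': (1000, 1200)}.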

        # The start and stop indices are used to create a split dict that will
        # be parsed into the split array required by the H5PYDataset interface.
        # The split dict is organized as follows:
        #
        #     dict(split -> dict(source -> (start, stop)))
        #
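        # Continuing the hypothetical sizes above, this would yield:
        #
        #     {'train': {'features': (0, 1000), 'targets': (0, 1000)},
        #      'test': {'features': (1000, 1200), 'targets': (1000, 1200)}}
        #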
        split_dict = OrderedDict([
            (split, OrderedDict([(s, split_intervals[split])
                                 for s in sources]))
            for split in splits])
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        # We then prepare the HDF5 dataset. This involves creating datasets to
        # store data sources and datasets to store auxiliary information
        # (namely the shapes for variable-length axes, and labels to indicate
        # what these variable-length axes represent).
        def make_vlen_dataset(source, shape):
            dtype = source_dtypes[source]
            shape = (num_examples,)+shape
            print("creating %s %s %s"%(source,str(shape),str(dtype)))
            dataset = h5file.create_dataset(
                source, shape, dtype=dtype)
            # Tag fixed-length axis with its label
            dataset.dims[0].label = 'batch'
            for i, label in enumerate(source_axis_labels[source]):
                dataset.dims[i+1].label = label

        shapes = filter(None, map(lambda x: x[1], examples_per_split.values()))
        assert len(set(shapes)) == 1, \
            "splits have different image sizes %s" % shapes
        print('Images shape %s' % str(shapes[0]))

        source_shape = {'features':shapes[0], 'targets':(target_dim,)}

        for source in sources:
            make_vlen_dataset(source, source_shape[source])

        # The final step is to fill the HDF5 file.
        def fill_split(split, bar=None):
            print(split)

            image_count = target_count = 0
            for image_number, image_path in enumerate(examples_per_split[split][0]):
                image = numpy.asarray(Image.open(image_path))
                index = image_number + split_intervals[split][0]

                h5file['features'][index] = image
                image_count += 1

                target, mask = get_target(image_path, split)

                if dotarget:
                    if onlytarget:
                        h5file['targets'][index] = numpy.array([target,])
                    else:
                        h5file['targets'][index] = numpy.array([target,mask])
                target_count += mask

                if image_number % 1000 == 0:
                    h5file.flush()
                if bar:
                    bar.update(index)
            print('# targets %d out of %d'%(target_count, image_count))
        with progress_bar('jpgtgz', num_examples) as bar:
            for split in splits:
                fill_split(split, bar=bar)
    finally:
        if os.path.isdir(TMPDIR):
            shutil.rmtree(TMPDIR)
        h5file.flush()
        h5file.close()

    return (output_path,)
Beispiel #26
0
        def extract_tar(split):
            num_examples = 0
            path = file_paths[split]
            if os.path.isfile(path):
                with tarfile.open(path, 'r:gz') as f:
                    members = f.getmembers()
                    progress_bar_context = progress_bar(
                        name='{} file'.format(split), maxval=len(members),
                        prefix='Extracting')
                    with progress_bar_context as bar:
                        for i, member in enumerate(members):
                            if ((member.name.endswith('.jpg') and not
                            os.path.basename(member.name).startswith('.'))):
                                f.extract(member, path=os.path.join(TMPDIR,split))
                                num_examples += 1
                            bar.update(i)
                DIR = TMPDIR
            elif os.path.isdir(path):
                print("reading DIRECTORY %s"%path)
                DIR = path
                for root, dirs, files in os.walk(path):
                    for file in files:
                        if file.endswith('.jpg') and not file.startswith('.'):
                            num_examples += 1
            else:
                print("No file or directory named %s"%path)
                return [], None
            print('#files=%d'%num_examples)

            jpgfiles = []
            progress_bar_context = progress_bar(
                name='{} file'.format(split), maxval=num_examples,
                prefix='Validating')
            num_examples = 0
            bad_examples = 0
            duplicate_examples = 0
            count = 0
            errors = 0
            shape = None  # all images must have the same shape
            with progress_bar_context as bar:
                for root, dirs, files in os.walk(os.path.join(DIR,split)):
                    for file in files:
                        if file.endswith('.jpg') and not file.startswith('.'):
                            image_path = os.path.join(root, file)
                            count += 1
                            try:
                                im = Image.open(image_path)
                                im = numpy.asarray(im)
                                m = hashlib.md5()
                                m.update(im)
                                h = m.hexdigest()

                                if shape is None:
                                    shape = im.shape
                                if im.shape != shape:
                                    bad_examples += 1
                                    os.remove(image_path)
                                elif h in checksums:
                                    duplicate_examples += 1
                                    os.remove(image_path)
                                else:
                                    checksums.add(h)
                                    num_examples += 1
                                    jpgfiles.append(image_path)
                            except Exception:
                                errors += 1
                            bar.update(count)
            print('count=%d bad=%d dup=%d good=%d errors=%d'%(
                count, bad_examples, duplicate_examples,
                num_examples, errors))

            if onlytarget:
                jpgfiles = [p for p in jpgfiles
                            if get_target(p, split)[1]]
            return jpgfiles, shape
Beispiel #27
0
def convert_iam_ondb(directory, output_directory,
                         output_filename='iam_ondb.hdf5'):
    """Converts iam_ondb dataset to HDF5.

    Converts iam_ondb to an HDF5 dataset compatible with
    :class:`fuel.datasets.iam_ondb`.
    The converted dataset is saved as 'iam_ondb.hdf5'.

    It assumes the existence of the directories:
        ./iam_ondb/lineImages
        ./iam_ondb/ascii-all

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'iam_ondb.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    # Prepare input
    all_image_paths = get_path_list(directory)
    total_num = len(all_image_paths)
    test_num = int(total_num / 4.0)
    train_num = total_num - test_num

    # Prepare output file
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf_features = h5file.create_dataset('image_features', (total_num,),
                                         dtype=dtype)
    hdf_shapes = h5file.create_dataset('image_features_shapes', (total_num, 3),
                                       dtype='int32')
    hdf_labels = h5file.create_dataset('targets', (total_num,), dtype=h5py.special_dtype(vlen=bytes))
    hdf_labels_shapes = h5file.create_dataset('hdf_labels_shapes', (total_num, 1),
                                       dtype='int32')
    # Attach shape annotations and scales
    hdf_features.dims.create_scale(hdf_shapes, 'shapes')
    hdf_features.dims[0].attach_scale(hdf_shapes)

    hdf_shapes_labels = h5file.create_dataset('image_features_shapes_labels',
                                              (3,), dtype='S7')
    hdf_shapes_labels[...] = ['channel'.encode('utf8'),
                              'height'.encode('utf8'),
                              'width'.encode('utf8')]
    hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
    hdf_features.dims[0].attach_scale(hdf_shapes_labels)

    hdf_labels.dims.create_scale(hdf_labels_shapes, 'labels_shapes')
    hdf_labels.dims[0].attach_scale(hdf_labels_shapes)

    # Add axis annotations
    hdf_features.dims[0].label = 'batch'

    hdf_labels.dims[0].label = 'batch'
    #hdf_labels.dims[1].label = 'index'

    # Convert

    i = 0
    for split, split_size in zip(["train", "test"], [train_num, test_num]):
        # Shuffle the examples
        rng = numpy.random.RandomState(123522)
        if split == "train":
            image_paths = all_image_paths[0:train_num]
        else:
            image_paths = all_image_paths[train_num:]
        rng.shuffle(image_paths)

        # Convert from JPEG to NumPy arrays
        with progress_bar(split, split_size) as bar:
            for image_path in image_paths:
                # Save image
                image = numpy.array(Image.open(image_path))
                print image.shape
                if image.ndim != 2:
                    continue
                # Add a channels axis
                image = image[numpy.newaxis, :, :]

                hdf_features[i] = image.flatten()
                hdf_shapes[i] = image.shape
                print image.shape

                # Get the target
                textline = get_target(image_path)
                print textline
                hdf_labels[i] = textline
                hdf_labels_shapes[i] = len(textline)

                # Update progress
                i += 1
                bar.update(i if split == "train" else i - train_num)

    # Add the labels
    split_dict = {}
    sources = ['image_features', 'targets']
    split_dict['train'] = dict(zip(sources, [(0, train_num)] * 2))
    split_dict['test'] = dict(zip(sources, [(train_num, total_num)] * 2))
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    # data = (('train', 'features', train_features),
    #         ('train', 'targets', train_labels),
    #         ('test', 'features', test_features),
    #         ('test', 'targets', test_labels))
    # fill_hdf5_file(h5file, data)
    # h5file['features'].dims[0].label = 'batch'
    # h5file['features'].dims[1].label = 'channel'
    # h5file['features'].dims[2].label = 'height'
    # h5file['features'].dims[3].label = 'width'

    # h5file['targets'].dims[0].label = 'batch'
    # h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()

    return (output_path,)
Beispiel #28
0
def convert_iam_ondb(directory,
                     output_directory,
                     output_filename='iam_ondb.hdf5'):
    """Converts iam_ondb dataset to HDF5.

    Converts iam_ondb to an HDF5 dataset compatible with
    :class:`fuel.datasets.iam_ondb`.
    The converted dataset is saved as 'iam_ondb.hdf5'.

    It assumes the existence of the directories:
        ./iam_ondb/lineImages
        ./iam_ondb/ascii-all

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'iam_ondb.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    # Prepare input
    all_image_paths = get_path_list(directory)
    total_num = len(all_image_paths)
    test_num = int(total_num / 4.0)
    train_num = total_num - test_num

    # Prepare output file
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf_features = h5file.create_dataset('image_features', (total_num, ),
                                         dtype=dtype)
    hdf_shapes = h5file.create_dataset('image_features_shapes', (total_num, 3),
                                       dtype='int32')
    hdf_labels = h5file.create_dataset('targets', (total_num, ),
                                       dtype=h5py.special_dtype(vlen=bytes))
    hdf_labels_shapes = h5file.create_dataset('hdf_labels_shapes',
                                              (total_num, 1),
                                              dtype='int32')
    # Attach shape annotations and scales
    hdf_features.dims.create_scale(hdf_shapes, 'shapes')
    hdf_features.dims[0].attach_scale(hdf_shapes)

    hdf_shapes_labels = h5file.create_dataset('image_features_shapes_labels',
                                              (3, ),
                                              dtype='S7')
    hdf_shapes_labels[...] = [
        'channel'.encode('utf8'), 'height'.encode('utf8'),
        'width'.encode('utf8')
    ]
    hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
    hdf_features.dims[0].attach_scale(hdf_shapes_labels)

    hdf_labels.dims.create_scale(hdf_labels_shapes, 'labels_shapes')
    hdf_labels.dims[0].attach_scale(hdf_labels_shapes)

    # Add axis annotations
    hdf_features.dims[0].label = 'batch'

    hdf_labels.dims[0].label = 'batch'
    #hdf_labels.dims[1].label = 'index'

    # Convert

    i = 0
    for split, split_size in zip(["train", "test"], [train_num, test_num]):
        # Shuffle the examples
        rng = numpy.random.RandomState(123522)
        if split == "train":
            image_paths = all_image_paths[0:train_num]
        else:
            image_paths = all_image_paths[train_num:]
        rng.shuffle(image_paths)

        # Convert from JPEG to NumPy arrays
        with progress_bar(split, split_size) as bar:
            for image_path in image_paths:
                # Save image
                image = numpy.array(Image.open(image_path))
                print image.shape
                if image.ndim != 2:
                    continue
                # Add a channels axis
                image = image[numpy.newaxis, :, :]

                hdf_features[i] = image.flatten()
                hdf_shapes[i] = image.shape
                print image.shape

                # Get the target
                textline = get_target(image_path)
                print textline
                hdf_labels[i] = textline
                hdf_labels_shapes[i] = len(textline)

                # Update progress
                i += 1
                bar.update(i if split == "train" else i - train_num)

    # Add the labels
    split_dict = {}
    sources = ['image_features', 'targets']
    split_dict['train'] = dict(zip(sources, [(0, train_num)] * 2))
    split_dict['test'] = dict(zip(sources, [(train_num, total_num)] * 2))
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    # data = (('train', 'features', train_features),
    #         ('train', 'targets', train_labels),
    #         ('test', 'features', test_features),
    #         ('test', 'targets', test_labels))
    # fill_hdf5_file(h5file, data)
    # h5file['features'].dims[0].label = 'batch'
    # h5file['features'].dims[1].label = 'channel'
    # h5file['features'].dims[2].label = 'height'
    # h5file['features'].dims[3].label = 'width'

    # h5file['targets'].dims[0].label = 'batch'
    # h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()

    return (output_path, )
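
The targets in the converted file are variable-length byte strings, one text
line per image, so reading one back only needs a lookup and, if desired, a
decode. A minimal sketch, assuming the default output name 'iam_ondb.hdf5':

import h5py

with h5py.File('iam_ondb.hdf5', 'r') as f:
    textline = f['targets'][0]             # one transcription line (bytes)
    length = f['hdf_labels_shapes'][0][0]  # its stored length
    print textline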