def image_consumer(socket, hdf5_file, num_expected, shuffle_seed=None,
                   offset=0):
    """Fill an HDF5 file with incoming images from a socket.

    Parameters
    ----------
    socket : :class:`zmq.Socket`
        PULL socket on which to receive images.
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write. Assumes `features`, `targets`
        and `filenames` already exist and have a first dimension of at
        least `offset + num_expected`, since rows
        `offset, ..., offset + num_expected - 1` are written.
    num_expected : int
        The number of items we expect to be sent over the socket.
    shuffle_seed : int or sequence, optional
        Seed for a NumPy random number generator that permutes the
        images on disk. When `None`, images are written in arrival order.
    offset : int, optional
        The offset in the HDF5 datasets at which to start writing
        received examples. Defaults to 0.

    """
    with progress_bar('images', maxval=num_expected) as pb:
        if shuffle_seed is None:
            index_gen = iter(xrange(num_expected))
        else:
            rng = numpy.random.RandomState(shuffle_seed)
            index_gen = iter(rng.permutation(num_expected))
        for i, num in enumerate(index_gen):
            # NOTE(review): zmq.SNDMORE is a send-side flag; passing it to
            # a receive call looks unintended -- confirm before changing.
            image_filename, class_index = socket.recv_pyobj(zmq.SNDMORE)
            # `numpy.fromstring` on raw bytes is deprecated; `frombuffer`
            # is the supported replacement. The copy keeps the array
            # writable, exactly as `fromstring` returned it.
            image_data = numpy.frombuffer(socket.recv(),
                                          dtype='uint8').copy()
            _write_to_hdf5(hdf5_file, num + offset, image_filename,
                           image_data, class_index)
            pb.update(i + 1)
def image_consumer(socket, hdf5_file, num_expected, shuffle_seed=None,
                   offset=0):
    """Fill an HDF5 file with incoming images from a socket.

    Each received example is a two-part message: a pickled
    `(image_filename, class_index)` pair followed by the raw image bytes.

    Parameters
    ----------
    socket : :class:`zmq.Socket`
        PULL socket on which to receive images.
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write. Assumes `features`, `targets`
        and `filenames` already exist and can hold rows up to index
        `offset + num_expected - 1`.
    num_expected : int
        The number of items we expect to be sent over the socket.
    shuffle_seed : int or sequence, optional
        Seed for a NumPy random number generator that permutes the
        images on disk.
    offset : int, optional
        The offset in the HDF5 datasets at which to start writing
        received examples. Defaults to 0.

    """
    with progress_bar('images', maxval=num_expected) as pb:
        # Destination rows: sequential, or a seeded permutation of them.
        if shuffle_seed is not None:
            rng = numpy.random.RandomState(shuffle_seed)
            destinations = rng.permutation(num_expected)
        else:
            destinations = xrange(num_expected)
        received = 0
        for num in destinations:
            # NOTE(review): zmq.SNDMORE is documented as a send flag; its
            # use on the receive side is kept as-is -- confirm intent.
            image_filename, class_index = socket.recv_pyobj(zmq.SNDMORE)
            payload = socket.recv()
            # `frombuffer(...).copy()` replaces the deprecated binary mode
            # of `numpy.fromstring` and yields the same writable array.
            image_data = numpy.frombuffer(payload, dtype='uint8').copy()
            _write_to_hdf5(hdf5_file, num + offset, image_filename,
                           image_data, class_index)
            received += 1
            pb.update(received)
def get_boxes(split):
    """Read the bounding boxes of every image in `split`.

    Opens the split's 'digitStruct' file (looked up in
    `digit_struct_paths`) and returns one `BoundingBoxes` instance per
    image, in image order.

    """
    field_names = ('label', 'height', 'width', 'left', 'top')
    collected = []
    with h5py.File(digit_struct_paths[split], 'r') as mat_file:
        num_images = examples_per_split[split]
        title = '{} digitStruct'.format(split)
        with progress_bar(title, num_images) as bar:
            for image_number in range(examples_per_split[split]):
                # 'digitStruct' is the main group of the file; its 'bbox'
                # dataset has one row per image and a single column of
                # object references. Row numbers map one-to-one to image
                # names, so the sibling 'name' dataset is not needed.
                main_group = mat_file['digitStruct']
                image_reference = main_group['bbox'][image_number, 0]
                # The referenced group holds one single-column dataset per
                # field, with one row per bounding box.
                image_group = main_group[image_reference]
                kwargs = {}
                for field in field_names:
                    column = image_group[field][:, 0]
                    if len(column) > 1:
                        # Several boxes: entries are references to 1x1
                        # datasets holding the actual value.
                        values = [int(main_group[reference][0, 0])
                                  for reference in column]
                    else:
                        # A single box: the value is stored directly.
                        values = [int(column[0])]
                    # Field names are pluralized in the named tuple.
                    kwargs[field + 's'] = values
                collected.append(BoundingBoxes(**kwargs))
                if bar:
                    bar.update(image_number)
    return collected
def load_images(split, tar, basename, rows):
    """Load one image pair per row and stack the results into an array.

    Every element of `rows` is passed, together with `tar` and
    `basename`, to `loadImagePairFromRow`; `split` is only used to label
    the progress bar.

    """
    loaded = []
    with progress_bar(name='{} images'.format(split),
                      maxval=len(rows),
                      prefix='Converting') as bar:
        for position, row in enumerate(rows):
            pair = loadImagePairFromRow(tar, basename, row)
            loaded.append(pair)
            bar.update(position)
    return np.array(loaded)
def load_images(split, tar, basename, rows):
    """Convert every row into an image pair and return them as one array.

    `loadImagePairFromRow(tar, basename, row)` is called once per row, in
    order, while a progress bar labelled with `split` is updated.

    """
    bar_label = '{} images'.format(split)
    pairs = []
    processed = 0
    with progress_bar(name=bar_label, maxval=len(rows),
                      prefix='Converting') as bar:
        for row in rows:
            pairs.append(loadImagePairFromRow(tar, basename, row))
            # The bar receives the zero-based position of the row.
            bar.update(processed)
            processed += 1
    return np.array(pairs)
def convert_celeba_128(directory, output_directory,
                       output_filename='celeba_128.hdf5'):
    """Converts the 128x128 version of the CelebA dataset to HDF5.

    This converter takes the aligned and cropped version of the CelebA
    dataset as input and produces a version that's been resized to
    156x128 pixels (height x width) and then center cropped to 128x128
    pixels.

    Converts the CelebA dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CelebA`. The converted dataset is saved as
    'celeba_128.hdf5'.

    It assumes the existence of the following files:

    * `img_align_celeba.zip`
    * `list_attr_celeba.txt`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'celeba_128.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted
        dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = _initialize_conversion(directory, output_path, (128, 128))
    try:
        features_dataset = h5file['features']
        image_file_path = os.path.join(directory, IMAGE_FILE)
        # `Image.ANTIALIAS` is the historical alias of `Image.LANCZOS` and
        # is absent from recent Pillow releases; prefer the current name
        # and fall back to the alias on very old installations.
        resample = getattr(Image, 'LANCZOS', None)
        if resample is None:
            resample = Image.ANTIALIAS
        with zipfile.ZipFile(image_file_path, 'r') as image_file:
            with progress_bar('images', NUM_EXAMPLES) as bar:
                for i in range(NUM_EXAMPLES):
                    # Archive members are numbered from 1.
                    image_name = 'img_align_celeba/{:06d}.jpg'.format(i + 1)
                    # Resize to 128 wide by 156 high, then drop 14 rows at
                    # the top and at the bottom to get a 128x128 crop.
                    image = Image.open(
                        image_file.open(image_name, 'r')).resize(
                            (128, 128 + 7 * 4), resample).crop(
                                (0, 7 * 2, 128, 128 + 7 * 2))
                    # Move the channel axis first before storing.
                    features_dataset[i] = numpy.asarray(image).transpose(
                        2, 0, 1)
                    bar.update(i + 1)
        h5file.flush()
    finally:
        # Close the output file even when the conversion fails part-way.
        h5file.close()

    return (output_path,)
def get_boxes(split):
    """Extract per-image bounding box data from a split's digitStruct file.

    Returns a list with one `BoundingBoxes` entry per image of `split`,
    ordered by image number.

    """
    def read_column(root, reference, name):
        # Each field dataset has one row per bounding box and one column.
        return root[reference][name][:, 0]

    def resolve(root, dataset):
        # With a single bounding box the value is stored in place; with
        # several, each element is an object reference to a 1x1 dataset
        # holding the value.
        if len(dataset) <= 1:
            return [int(dataset[0])]
        return [int(root[reference][0, 0]) for reference in dataset]

    names = ('label', 'height', 'width', 'left', 'top')
    result = []
    with h5py.File(digit_struct_paths[split], 'r') as f:
        bar_maxval = examples_per_split[split]
        with progress_bar('{} digitStruct'.format(split), bar_maxval) as bar:
            for image_number in range(examples_per_split[split]):
                # The 'digitStruct' group contains 'bbox' and 'name'; only
                # 'bbox' is read because row numbers already identify the
                # images.
                main_group = f['digitStruct']
                # One object reference per image, in a single column.
                image_reference = main_group['bbox'][image_number, 0]
                datasets = {}
                for name in names:
                    datasets[name] = read_column(main_group,
                                                 image_reference, name)
                # Names are pluralized in the BoundingBoxes named tuple.
                kwargs = {}
                for name, dataset in iteritems(datasets):
                    kwargs[name + 's'] = resolve(main_group, dataset)
                result.append(BoundingBoxes(**kwargs))
                if bar:
                    bar.update(image_number)
    return result
def extract_tar(split):
    """Extract a split's gzipped tar archive into `TMPDIR`.

    Members are extracted one at a time so a progress bar can be shown.
    Returns the number of members whose name contains '.png'.

    """
    with tarfile.open(file_paths[split], 'r:gz') as archive:
        entries = archive.getmembers()
        png_count = len([entry for entry in entries if '.png' in entry.name])
        with progress_bar(name='{} file'.format(split),
                          maxval=len(entries),
                          prefix='Extracting') as bar:
            for position, entry in enumerate(entries):
                archive.extract(entry, path=TMPDIR)
                bar.update(position)
    return png_count
def extract_tar(split):
    """Unpack the archive registered for `split` and count its PNG members.

    The archive path comes from `file_paths`; every member is extracted
    under `TMPDIR`. The return value is the count of members with '.png'
    in their name.

    """
    bar_title = '{} file'.format(split)
    with tarfile.open(file_paths[split], 'r:gz') as tar_handle:
        all_members = tar_handle.getmembers()
        num_png = 0
        for candidate in all_members:
            if '.png' in candidate.name:
                num_png += 1
        with progress_bar(name=bar_title, maxval=len(all_members),
                          prefix='Extracting') as bar:
            extracted = 0
            for current in all_members:
                tar_handle.extract(current, path=TMPDIR)
                # The bar is given the zero-based index of the member.
                bar.update(extracted)
                extracted += 1
        return num_png
def convert_celeba_64(directory, output_directory,
                      output_filename='celeba_64.hdf5'):
    """Converts the 64x64 version of the CelebA dataset to HDF5.

    This converter takes the aligned and cropped version of the CelebA
    dataset as input and produces a version that's been resized to 78x64
    pixels (height x width) and then center cropped to 64x64 pixels.

    Converts the CelebA dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CelebA`. The converted dataset is saved as
    'celeba_64.hdf5'.

    It assumes the existence of the following files:

    * `img_align_celeba.zip`
    * `list_attr_celeba.txt`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'celeba_64.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted
        dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = _initialize_conversion(directory, output_path, (64, 64))
    try:
        features_dataset = h5file['features']
        image_file_path = os.path.join(directory, IMAGE_FILE)
        # `Image.ANTIALIAS` is the historical alias of `Image.LANCZOS` and
        # is absent from recent Pillow releases; prefer the current name
        # and fall back to the alias on very old installations.
        resample = getattr(Image, 'LANCZOS', None)
        if resample is None:
            resample = Image.ANTIALIAS
        with zipfile.ZipFile(image_file_path, 'r') as image_file:
            with progress_bar('images', NUM_EXAMPLES) as bar:
                for i in range(NUM_EXAMPLES):
                    # Archive members are numbered from 1.
                    image_name = 'img_align_celeba/{:06d}.jpg'.format(i + 1)
                    # Resize to 64 wide by 78 high, then remove 7 rows at
                    # the top and at the bottom to get a 64x64 crop.
                    image = Image.open(
                        image_file.open(image_name, 'r')).resize(
                            (64, 78), resample).crop((0, 7, 64, 64 + 7))
                    # Move the channel axis first before storing.
                    features_dataset[i] = numpy.asarray(image).transpose(
                        2, 0, 1)
                    bar.update(i + 1)
        h5file.flush()
    finally:
        # Close the output file even when the conversion fails part-way.
        h5file.close()

    return (output_path,)
def convert_celeba_aligned_cropped(directory, output_directory,
                                   output_filename=OUTPUT_FILENAME):
    """Converts the aligned and cropped CelebA dataset to HDF5.

    Converts the CelebA dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CelebA`. The converted dataset is saved as
    'celeba_aligned_cropped.hdf5'.

    It assumes the existence of the following files:

    * `img_align_celeba.zip`
    * `list_attr_celeba.txt`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to
        'celeba_aligned_cropped.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted
        dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = _initialize_conversion(directory, output_path, (218, 178))
    try:
        features_dataset = h5file['features']
        image_file_path = os.path.join(directory, IMAGE_FILE)
        with zipfile.ZipFile(image_file_path, 'r') as image_file:
            with progress_bar('images', NUM_EXAMPLES) as bar:
                for i in range(NUM_EXAMPLES):
                    # Archive members are numbered from 1.
                    image_name = 'img_align_celeba/{:06d}.jpg'.format(i + 1)
                    # Images are stored unresized, channel axis first.
                    features_dataset[i] = numpy.asarray(
                        Image.open(
                            image_file.open(image_name, 'r'))).transpose(
                                2, 0, 1)
                    bar.update(i + 1)
        h5file.flush()
    finally:
        # Close the output file even when the conversion fails part-way.
        h5file.close()

    return (output_path,)
def convert_celeba_aligned_cropped(directory, output_directory,
                                   output_filename=OUTPUT_FILENAME):
    """Converts the aligned and cropped CelebA dataset to HDF5.

    Converts the CelebA dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CelebA`. The converted dataset is saved as
    'celeba_aligned_cropped.hdf5'.

    It assumes the existence of the following files:

    * `img_align_celeba.zip`
    * `list_attr_celeba.txt`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to
        'celeba_aligned_cropped.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted
        dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = _initialize_conversion(directory, output_path, (218, 178))
    try:
        features_dataset = h5file['features']
        image_file_path = os.path.join(directory, IMAGE_FILE)
        with zipfile.ZipFile(image_file_path, 'r') as image_file:
            with progress_bar('images', NUM_EXAMPLES) as bar:
                # Archive members are numbered from 1, rows from 0.
                for row, member_number in enumerate(
                        range(1, NUM_EXAMPLES + 1)):
                    image_name = 'img_align_celeba/{:06d}.jpg'.format(
                        member_number)
                    pixels = numpy.asarray(
                        Image.open(image_file.open(image_name, 'r')))
                    # Store with the channel axis first.
                    features_dataset[row] = pixels.transpose(2, 0, 1)
                    bar.update(member_number)
        h5file.flush()
    finally:
        # The handle is released on failure too, so a crashed run does
        # not keep the output file open.
        h5file.close()

    return (output_path,)
def convert_camvid(directory, output_directory,
                   output_filename='camvid.hdf5'):
    """Converts the camvid dataset to HDF5.

    Converts the camvid dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.camvid`. The converted dataset is saved as
    'camvid.hdf5'.

    Every file in `DATASET_FILES` is read as a listing with two
    whitespace-separated paths per line (image, then target). Examples
    from successive listing files are written one after another.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'camvid.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted
        dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = _initialize_conversion(directory, output_path, (360, 480))
    try:
        image_file_path = os.path.join(directory, DATASET_FILE)
        features_dataset = h5file['features']
        targets_dataset = h5file['targets']

        # Running row index shared by all listing files. Restarting it
        # for each file would overwrite the rows written for the
        # previous one.
        index = 0
        # NOTE(review): the archive is opened but its contents are never
        # read; images are loaded from the paths in the listing files.
        # Kept so a missing or corrupt archive still fails early.
        with zipfile.ZipFile(image_file_path, 'r'):
            with progress_bar('images', NUM_EXAMPLES) as bar:
                for files in DATASET_FILES:
                    with open(files, 'r') as open_file:
                        for line in open_file:
                            if not line.strip():
                                # Tolerate blank lines in listings.
                                continue
                            image_name, target_name = line.split()
                            # The first 15 characters of each listed
                            # path are dropped before opening.
                            image = Image.open(image_name[15:], 'r')
                            target = Image.open(target_name[15:], 'r')
                            features_dataset[index] = numpy.asarray(
                                image).transpose(2, 0, 1)
                            targets_dataset[index] = numpy.asarray(target)
                            index += 1
                            bar.update(index)
        h5file.flush()
    finally:
        # Close the output file even when the conversion fails part-way.
        h5file.close()

    return (output_path,)
def convert_svhn_format_1(directory, output_directory,
                          output_filename='svhn_format_1.hdf5'):
    """Converts the SVHN dataset (format 1) to HDF5.

    This method assumes the existence of the files
    `{train,test,extra}.tar.gz`, which are accessible through the
    official website [SVHNSITE].

    .. [SVHNSITE] http://ufldl.stanford.edu/housenumbers/

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'svhn_format_1.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted
        dataset.

    """
    # Bind the names used by the cleanup clause up front, so that a
    # failure during setup is reported as itself instead of being masked
    # by a NameError raised from `finally`.
    TMPDIR = None
    h5file = None
    try:
        output_path = os.path.join(output_directory, output_filename)
        h5file = h5py.File(output_path, mode='w')
        TMPDIR = tempfile.mkdtemp()

        # Every image has three channels (RGB) and variable height and
        # width. It features a variable number of bounding boxes that
        # identify the location and label of digits. The bounding box
        # location is specified using the x and y coordinates of its top
        # left corner along with its width and height.
        BoundingBoxes = namedtuple(
            'BoundingBoxes', ['labels', 'heights', 'widths', 'lefts', 'tops'])
        sources = ('features',) + tuple('bbox_{}'.format(field)
                                        for field in BoundingBoxes._fields)
        source_dtypes = dict([(source, 'uint8') for source in sources[:2]] +
                             [(source, 'uint16') for source in sources[2:]])
        source_axis_labels = {
            'features': ('channel', 'height', 'width'),
            'bbox_labels': ('bounding_box', 'index'),
            'bbox_heights': ('bounding_box', 'height'),
            'bbox_widths': ('bounding_box', 'width'),
            'bbox_lefts': ('bounding_box', 'x'),
            'bbox_tops': ('bounding_box', 'y')
        }

        # The dataset is split into three sets: the training set, the
        # test set and an extra set of examples that are somewhat less
        # difficult but can be used as extra training data. These sets
        # are stored separately as 'train.tar.gz', 'test.tar.gz' and
        # 'extra.tar.gz'. Each file contains a directory named after the
        # split it stores. The examples are stored in that directory as
        # PNG images. The directory also contains a 'digitStruct.mat'
        # file with all the bounding box and label information.
        splits = ('train', 'test', 'extra')
        file_paths = dict(zip(splits, FORMAT_1_FILES))
        for split, path in file_paths.items():
            file_paths[split] = os.path.join(directory, path)
        digit_struct_paths = dict([(split,
                                    os.path.join(TMPDIR, split,
                                                 'digitStruct.mat'))
                                   for split in splits])

        # We first extract the data files in a temporary directory. While
        # doing that, we also count the number of examples for each
        # split. Files are extracted individually, which allows to
        # display a progress bar. Since the splits will be concatenated
        # in the HDF5 file, we also compute the start and stop intervals
        # of each split within the concatenated array.
        def extract_tar(split):
            with tarfile.open(file_paths[split], 'r:gz') as f:
                members = f.getmembers()
                num_examples = sum(1 for m in members if '.png' in m.name)
                progress_bar_context = progress_bar(
                    name='{} file'.format(split),
                    maxval=len(members),
                    prefix='Extracting')
                with progress_bar_context as bar:
                    for i, member in enumerate(members):
                        f.extract(member, path=TMPDIR)
                        bar.update(i)
            return num_examples

        examples_per_split = OrderedDict([(split, extract_tar(split))
                                          for split in splits])
        cumulative_num_examples = numpy.cumsum(
            [0] + list(examples_per_split.values()))
        num_examples = cumulative_num_examples[-1]
        intervals = zip(cumulative_num_examples[:-1],
                        cumulative_num_examples[1:])
        split_intervals = dict(zip(splits, intervals))

        # The start and stop indices are used to create a split dict that
        # will be parsed into the split array required by the H5PYDataset
        # interface. The split dict is organized as follows:
        #
        #     dict(split -> dict(source -> (start, stop)))
        #
        split_dict = OrderedDict([(split,
                                   OrderedDict([(s, split_intervals[split])
                                                for s in sources]))
                                  for split in splits])
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        # We then prepare the HDF5 dataset. This involves creating
        # datasets to store data sources and datasets to store auxiliary
        # information (namely the shapes for variable-length axes, and
        # labels to indicate what these variable-length axes represent).
        def make_vlen_dataset(source):
            # Create a variable-length 1D dataset
            dtype = h5py.special_dtype(
                vlen=numpy.dtype(source_dtypes[source]))
            dataset = h5file.create_dataset(source, (num_examples,),
                                            dtype=dtype)
            # Create a dataset to store variable-length shapes.
            axis_labels = source_axis_labels[source]
            dataset_shapes = h5file.create_dataset(
                '{}_shapes'.format(source),
                (num_examples, len(axis_labels)),
                dtype='uint16')
            # Create a dataset to store labels for variable-length axes.
            dataset_vlen_axis_labels = h5file.create_dataset(
                '{}_vlen_axis_labels'.format(source),
                (len(axis_labels),),
                dtype='S{}'.format(
                    numpy.max([len(label) for label in axis_labels])))
            # Fill variable-length axis labels
            dataset_vlen_axis_labels[...] = [
                label.encode('utf8') for label in axis_labels
            ]
            # Attach auxiliary datasets as dimension scales of the
            # variable-length 1D dataset. This is in accordance with the
            # H5PYDataset interface.
            dataset.dims.create_scale(dataset_shapes, 'shapes')
            dataset.dims[0].attach_scale(dataset_shapes)
            dataset.dims.create_scale(dataset_vlen_axis_labels,
                                      'shape_labels')
            dataset.dims[0].attach_scale(dataset_vlen_axis_labels)
            # Tag fixed-length axis with its label
            dataset.dims[0].label = 'batch'

        for source in sources:
            make_vlen_dataset(source)

        # The "fun" part begins: we extract the bounding box and label
        # information contained in 'digitStruct.mat'. This is a version
        # 7.3 Matlab file, which uses HDF5 under the hood, albeit with a
        # very convoluted layout.
        def get_boxes(split):
            boxes = []
            with h5py.File(digit_struct_paths[split], 'r') as f:
                bar_name = '{} digitStruct'.format(split)
                bar_maxval = examples_per_split[split]
                with progress_bar(bar_name, bar_maxval) as bar:
                    for image_number in range(examples_per_split[split]):
                        # The 'digitStruct' group is the main group of
                        # the HDF5 file. It contains two datasets: 'bbox'
                        # and 'name'. The 'name' dataset isn't of
                        # interest to us, as it stores file names and
                        # there's already a one-to-one mapping between
                        # row numbers and image names (e.g. row 0
                        # corresponds to '1.png', row 1 corresponds to
                        # '2.png', and so on).
                        main_group = f['digitStruct']
                        # The 'bbox' dataset contains the bounding box
                        # and label information we're after. It has as
                        # many rows as there are images, and one column.
                        # Elements of the 'bbox' dataset are object
                        # references that point to (yet another) group
                        # that contains the information for the
                        # corresponding image.
                        image_reference = main_group['bbox'][image_number, 0]

                        # There are five datasets contained in that
                        # group: 'label', 'height', 'width', 'left' and
                        # 'top'. Each of those datasets has as many rows
                        # as there are bounding boxes in the
                        # corresponding image, and one column.
                        def get_dataset(name):
                            return main_group[image_reference][name][:, 0]

                        names = ('label', 'height', 'width', 'left', 'top')
                        datasets = dict([(name, get_dataset(name))
                                         for name in names])

                        # If there is only one bounding box, the
                        # information is stored directly in the datasets.
                        # If there are multiple bounding boxes, elements
                        # of those datasets are object references
                        # pointing to 1x1 datasets that store the
                        # information (fortunately, it's the last hop we
                        # need to make).
                        def get_elements(dataset):
                            if len(dataset) > 1:
                                return [
                                    int(main_group[reference][0, 0])
                                    for reference in dataset
                                ]
                            else:
                                return [int(dataset[0])]

                        # Names are pluralized in the BoundingBox named
                        # tuple.
                        kwargs = dict([(name + 's', get_elements(dataset))
                                       for name, dataset
                                       in iteritems(datasets)])
                        boxes.append(BoundingBoxes(**kwargs))
                        if bar:
                            bar.update(image_number)
            return boxes

        split_boxes = dict([(split, get_boxes(split)) for split in splits])

        # The final step is to fill the HDF5 file.
        def fill_split(split, bar=None):
            for image_number in range(examples_per_split[split]):
                image_path = os.path.join(TMPDIR, split,
                                          '{}.png'.format(image_number + 1))
                image = numpy.asarray(Image.open(image_path)).transpose(
                    2, 0, 1)
                bounding_boxes = split_boxes[split][image_number]
                num_boxes = len(bounding_boxes.labels)
                index = image_number + split_intervals[split][0]

                h5file['features'][index] = image.flatten()
                h5file['features'].dims[0]['shapes'][index] = image.shape
                for field in BoundingBoxes._fields:
                    name = 'bbox_{}'.format(field)
                    # Negative values are clamped to zero because the
                    # target datasets are unsigned.
                    h5file[name][index] = numpy.maximum(
                        0, getattr(bounding_boxes, field))
                    h5file[name].dims[0]['shapes'][index] = [num_boxes, 1]

                # Replace label '10' with '0'.
                labels = h5file['bbox_labels'][index]
                labels[labels == 10] = 0
                h5file['bbox_labels'][index] = labels

                if image_number % 1000 == 0:
                    h5file.flush()
                if bar:
                    bar.update(index)

        with progress_bar('SVHN format 1', num_examples) as bar:
            for split in splits:
                fill_split(split, bar=bar)
    finally:
        try:
            if TMPDIR is not None and os.path.isdir(TMPDIR):
                shutil.rmtree(TMPDIR)
        finally:
            # Release the output file even if removing the temporary
            # directory fails.
            if h5file is not None:
                h5file.flush()
                h5file.close()

    return (output_path,)
def convert_svhn_format_1(directory, output_directory,
                          output_filename='svhn_format_1.hdf5'):
    """Converts the SVHN dataset (format 1) to HDF5.

    This method assumes the existence of the files
    `{train,test,extra}.tar.gz`, which are accessible through the
    official website [SVHNSITE].

    .. [SVHNSITE] http://ufldl.stanford.edu/housenumbers/

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'svhn_format_1.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted
        dataset.

    """
    # Both names are read by the cleanup clause; binding them before the
    # `try` keeps an early failure (e.g. the output file cannot be
    # created) from being replaced by a NameError.
    TMPDIR = None
    h5file = None
    try:
        output_path = os.path.join(output_directory, output_filename)
        h5file = h5py.File(output_path, mode='w')
        TMPDIR = tempfile.mkdtemp()

        # Every image has three channels (RGB) and variable height and
        # width. It features a variable number of bounding boxes that
        # identify the location and label of digits. The bounding box
        # location is specified using the x and y coordinates of its top
        # left corner along with its width and height.
        BoundingBoxes = namedtuple(
            'BoundingBoxes', ['labels', 'heights', 'widths', 'lefts', 'tops'])
        sources = ('features',) + tuple('bbox_{}'.format(field)
                                        for field in BoundingBoxes._fields)
        source_dtypes = dict([(source, 'uint8') for source in sources[:2]] +
                             [(source, 'uint16') for source in sources[2:]])
        source_axis_labels = {
            'features': ('channel', 'height', 'width'),
            'bbox_labels': ('bounding_box', 'index'),
            'bbox_heights': ('bounding_box', 'height'),
            'bbox_widths': ('bounding_box', 'width'),
            'bbox_lefts': ('bounding_box', 'x'),
            'bbox_tops': ('bounding_box', 'y')}

        # The dataset is split into three sets: the training set, the
        # test set and an extra set of examples that are somewhat less
        # difficult but can be used as extra training data. These sets
        # are stored separately as 'train.tar.gz', 'test.tar.gz' and
        # 'extra.tar.gz'. Each file contains a directory named after the
        # split it stores. The examples are stored in that directory as
        # PNG images. The directory also contains a 'digitStruct.mat'
        # file with all the bounding box and label information.
        splits = ('train', 'test', 'extra')
        file_paths = dict(zip(splits, FORMAT_1_FILES))
        for split, path in file_paths.items():
            file_paths[split] = os.path.join(directory, path)
        digit_struct_paths = dict(
            [(split, os.path.join(TMPDIR, split, 'digitStruct.mat'))
             for split in splits])

        # We first extract the data files in a temporary directory. While
        # doing that, we also count the number of examples for each
        # split. Files are extracted individually, which allows to
        # display a progress bar. Since the splits will be concatenated
        # in the HDF5 file, we also compute the start and stop intervals
        # of each split within the concatenated array.
        def extract_tar(split):
            with tarfile.open(file_paths[split], 'r:gz') as f:
                members = f.getmembers()
                num_examples = sum(1 for m in members if '.png' in m.name)
                progress_bar_context = progress_bar(
                    name='{} file'.format(split), maxval=len(members),
                    prefix='Extracting')
                with progress_bar_context as bar:
                    for i, member in enumerate(members):
                        f.extract(member, path=TMPDIR)
                        bar.update(i)
            return num_examples

        examples_per_split = OrderedDict(
            [(split, extract_tar(split)) for split in splits])
        cumulative_num_examples = numpy.cumsum(
            [0] + list(examples_per_split.values()))
        num_examples = cumulative_num_examples[-1]
        intervals = zip(cumulative_num_examples[:-1],
                        cumulative_num_examples[1:])
        split_intervals = dict(zip(splits, intervals))

        # The start and stop indices are used to create a split dict that
        # will be parsed into the split array required by the H5PYDataset
        # interface. The split dict is organized as follows:
        #
        #     dict(split -> dict(source -> (start, stop)))
        #
        split_dict = OrderedDict([
            (split, OrderedDict([(s, split_intervals[split])
                                 for s in sources]))
            for split in splits])
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        # We then prepare the HDF5 dataset. This involves creating
        # datasets to store data sources and datasets to store auxiliary
        # information (namely the shapes for variable-length axes, and
        # labels to indicate what these variable-length axes represent).
        def make_vlen_dataset(source):
            # Create a variable-length 1D dataset
            dtype = h5py.special_dtype(
                vlen=numpy.dtype(source_dtypes[source]))
            dataset = h5file.create_dataset(
                source, (num_examples,), dtype=dtype)
            # Create a dataset to store variable-length shapes.
            axis_labels = source_axis_labels[source]
            dataset_shapes = h5file.create_dataset(
                '{}_shapes'.format(source), (num_examples, len(axis_labels)),
                dtype='uint16')
            # Create a dataset to store labels for variable-length axes.
            dataset_vlen_axis_labels = h5file.create_dataset(
                '{}_vlen_axis_labels'.format(source), (len(axis_labels),),
                dtype='S{}'.format(
                    numpy.max([len(label) for label in axis_labels])))
            # Fill variable-length axis labels
            dataset_vlen_axis_labels[...] = [
                label.encode('utf8') for label in axis_labels]
            # Attach auxiliary datasets as dimension scales of the
            # variable-length 1D dataset. This is in accordance with the
            # H5PYDataset interface.
            dataset.dims.create_scale(dataset_shapes, 'shapes')
            dataset.dims[0].attach_scale(dataset_shapes)
            dataset.dims.create_scale(dataset_vlen_axis_labels,
                                      'shape_labels')
            dataset.dims[0].attach_scale(dataset_vlen_axis_labels)
            # Tag fixed-length axis with its label
            dataset.dims[0].label = 'batch'

        for source in sources:
            make_vlen_dataset(source)

        # The "fun" part begins: we extract the bounding box and label
        # information contained in 'digitStruct.mat'. This is a version
        # 7.3 Matlab file, which uses HDF5 under the hood, albeit with a
        # very convoluted layout.
        def get_boxes(split):
            boxes = []
            with h5py.File(digit_struct_paths[split], 'r') as f:
                bar_name = '{} digitStruct'.format(split)
                bar_maxval = examples_per_split[split]
                with progress_bar(bar_name, bar_maxval) as bar:
                    for image_number in range(examples_per_split[split]):
                        # The 'digitStruct' group is the main group of
                        # the HDF5 file. It contains two datasets: 'bbox'
                        # and 'name'. The 'name' dataset isn't of
                        # interest to us, as it stores file names and
                        # there's already a one-to-one mapping between
                        # row numbers and image names (e.g. row 0
                        # corresponds to '1.png', row 1 corresponds to
                        # '2.png', and so on).
                        main_group = f['digitStruct']
                        # The 'bbox' dataset contains the bounding box
                        # and label information we're after. It has as
                        # many rows as there are images, and one column.
                        # Elements of the 'bbox' dataset are object
                        # references that point to (yet another) group
                        # that contains the information for the
                        # corresponding image.
                        image_reference = main_group['bbox'][image_number, 0]

                        # There are five datasets contained in that
                        # group: 'label', 'height', 'width', 'left' and
                        # 'top'. Each of those datasets has as many rows
                        # as there are bounding boxes in the
                        # corresponding image, and one column.
                        def get_dataset(name):
                            return main_group[image_reference][name][:, 0]

                        names = ('label', 'height', 'width', 'left', 'top')
                        datasets = dict(
                            [(name, get_dataset(name)) for name in names])

                        # If there is only one bounding box, the
                        # information is stored directly in the datasets.
                        # If there are multiple bounding boxes, elements
                        # of those datasets are object references
                        # pointing to 1x1 datasets that store the
                        # information (fortunately, it's the last hop we
                        # need to make).
                        def get_elements(dataset):
                            if len(dataset) > 1:
                                return [int(main_group[reference][0, 0])
                                        for reference in dataset]
                            else:
                                return [int(dataset[0])]

                        # Names are pluralized in the BoundingBox named
                        # tuple.
                        kwargs = dict(
                            [(name + 's', get_elements(dataset))
                             for name, dataset in iteritems(datasets)])
                        boxes.append(BoundingBoxes(**kwargs))
                        if bar:
                            bar.update(image_number)
            return boxes

        split_boxes = dict([(split, get_boxes(split)) for split in splits])

        # The final step is to fill the HDF5 file.
        def fill_split(split, bar=None):
            for image_number in range(examples_per_split[split]):
                image_path = os.path.join(
                    TMPDIR, split, '{}.png'.format(image_number + 1))
                image = numpy.asarray(
                    Image.open(image_path)).transpose(2, 0, 1)
                bounding_boxes = split_boxes[split][image_number]
                num_boxes = len(bounding_boxes.labels)
                index = image_number + split_intervals[split][0]

                h5file['features'][index] = image.flatten()
                h5file['features'].dims[0]['shapes'][index] = image.shape
                for field in BoundingBoxes._fields:
                    name = 'bbox_{}'.format(field)
                    # The bbox datasets use unsigned dtypes, so a negative
                    # value would not be stored faithfully; clamp negative
                    # values to zero before writing.
                    h5file[name][index] = numpy.maximum(
                        0, getattr(bounding_boxes, field))
                    h5file[name].dims[0]['shapes'][index] = [num_boxes, 1]

                # Replace label '10' with '0'.
                labels = h5file['bbox_labels'][index]
                labels[labels == 10] = 0
                h5file['bbox_labels'][index] = labels

                if image_number % 1000 == 0:
                    h5file.flush()
                if bar:
                    bar.update(index)

        with progress_bar('SVHN format 1', num_examples) as bar:
            for split in splits:
                fill_split(split, bar=bar)
    finally:
        try:
            if TMPDIR is not None and os.path.isdir(TMPDIR):
                shutil.rmtree(TMPDIR)
        finally:
            # Release the output file even if removing the temporary
            # directory fails.
            if h5file is not None:
                h5file.flush()
                h5file.close()

    return (output_path,)
# output_case.append(d[0]) # output.append(output_case) # return output index_list = [] index = 1 index_list.append(index) images_output = [] multiplier_output = [] cases_output = [] output_ind = [] sax_indexes = [] sax_indexes_tmp = [] positions = [] positions_tmp = [] i = 0 with progress_bar('train', n_examples_train) as bar: for sequence in train_features: stri = sequence[0] m = re.search('train/(.+?)/study', stri) case_index = int(m.group(1)) if case_index != index: sax_indexes.append(list(numpy.unique(numpy.array(sax_indexes_tmp)))) cases_output.append(index) images_output.append(output_ind) output_ind = [] sax_indexes_tmp = [] positions.append(positions_tmp) positions_tmp = [] index = case_index index_list.append(index) multiplier_output.append(multiplier)
def convert_dogs_vs_cats(directory, output_directory,
                         output_filename='dogs_vs_cats.hdf5'):
    """Converts the Dogs vs. Cats dataset to HDF5.

    Converts the Dogs vs. Cats dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.dogs_vs_cats`. The converted dataset is saved as
    'dogs_vs_cats.hdf5'.

    It assumes the existence of the following files:

    * `dogs_vs_cats.train.zip`
    * `dogs_vs_cats.test1.zip`

    Images are stored flattened in a variable-length `image_features`
    dataset, with their (channel, height, width) shapes kept in an
    attached `shapes` dimension scale. The output file is closed even if
    the conversion fails part-way through.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'dogs_vs_cats.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    # Number of examples per split; the HDF5 datasets are sized up front.
    train_size, test_size = 25000, 12500
    total_size = train_size + test_size

    # Prepare output file
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    try:
        dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
        hdf_features = h5file.create_dataset('image_features', (total_size,),
                                             dtype=dtype)
        hdf_shapes = h5file.create_dataset('image_features_shapes',
                                           (total_size, 3), dtype='int32')
        hdf_labels = h5file.create_dataset('targets', (total_size, 1),
                                           dtype='uint8')

        # Attach shape annotations and scales
        hdf_features.dims.create_scale(hdf_shapes, 'shapes')
        hdf_features.dims[0].attach_scale(hdf_shapes)

        hdf_shapes_labels = h5file.create_dataset(
            'image_features_shapes_labels', (3,), dtype='S7')
        hdf_shapes_labels[...] = ['channel'.encode('utf8'),
                                  'height'.encode('utf8'),
                                  'width'.encode('utf8')]
        hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
        hdf_features.dims[0].attach_scale(hdf_shapes_labels)

        # Add axis annotations
        hdf_features.dims[0].label = 'batch'
        hdf_labels.dims[0].label = 'batch'
        hdf_labels.dims[1].label = 'index'

        # Convert; `i` is the running row index across both splits.
        i = 0
        for split, split_size in zip([TRAIN, TEST], [train_size, test_size]):
            filename = os.path.join(directory, split)
            # The context manager closes the archive once the split is done
            # (or if reading an image raises).
            with zipfile.ZipFile(filename, 'r') as zip_file:
                # Discard the directory name
                image_names = zip_file.namelist()[1:]

                # Shuffle the examples (same fixed seed for every split)
                rng = numpy.random.RandomState(123522)
                rng.shuffle(image_names)

                # Convert from JPEG to NumPy arrays
                with progress_bar(filename, split_size) as bar:
                    for image_name in image_names:
                        # (height, width, channel) -> (channel, height, width)
                        image = numpy.array(
                            Image.open(zip_file.open(image_name)))
                        image = image.transpose(2, 0, 1)
                        hdf_features[i] = image.flatten()
                        hdf_shapes[i] = image.shape

                        # Cats are 0, Dogs are 1. Any name without 'cat'
                        # in it is labelled 1.
                        hdf_labels[i] = 0 if 'cat' in image_name else 1

                        # Progress is reported relative to the current split
                        i += 1
                        bar.update(i if split == TRAIN else i - train_size)

        # Add the labels
        split_dict = {}
        sources = ['image_features', 'targets']
        for name, slice_ in zip(['train', 'test'],
                                [(0, train_size), (train_size, total_size)]):
            split_dict[name] = dict(zip(sources, [slice_] * len(sources)))
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        h5file.flush()
    finally:
        h5file.close()

    return (output_path,)
hdf_images.dims[1].label = 'height' hdf_images.dims[2].label = 'width' hdf_images.dims[3].label = 'channels' hdf_driver_id.dims[0].label = 'batch' hdf_labels.dims[0].label = 'batch' # Create matching for creating valid and train set validation_set_indexes_list = list(validation_set_indexes) indexes = range(n_examples_train) for ind in validation_set_indexes_list: indexes.remove(ind) indexes = indexes + validation_set_indexes_list assert (len(indexes) == n_examples_train) # build hdf5 train and submit with progress_bar('train', n_examples_train) as bar: for j in range(n_examples_train): hdf_images[j] = numpy.rollaxis(X_train[indexes[j]], 2, 0) hdf_labels[j] = y_train[indexes[j]] hdf_driver_id[j] = int(driver_id[indexes[j]][1:]) bar.update(j) with progress_bar('submit', n_examples_submit) as bar: for j in range(n_examples_submit): hdf_images[n_examples_train + j] = numpy.rollaxis(X_test[j], 2, 0) hdf_driver_id[n_examples_train + j] = int( X_test_id[j].split('_')[1][:-4]) bar.update(j) # Save hdf5 train and submit split_dict = {}
hdf_labels.dims.create_scale(hdf_shapes_labels, 'shape_labels') hdf_labels.dims[0].attach_scale(hdf_shapes_labels) # Add axis annotations hdf_features.dims[0].label = 'batch' hdf_labels.dims[0].label = 'batch' hdf_cases.dims[0].label = 'batch' hdf_cases.dims[1].label = 'index' hdf_mult.dims[0].label = 'batch' hdf_mult.dims[1].label = 'index' ### loading train i = 0 with progress_bar('train_data ', n_train) as bar: for c in cases_train: [d,l], m = get_data(train_contour_path, train_img_path, c) train_images = numpy.array(d) label_images = numpy.array(l) assert(train_images.shape == label_images.shape) hdf_shapes[i] = train_images.shape hdf_features[i] = train_images.flatten().astype(numpy.dtype('uint16')) hdf_mult[i] = m hdf_labels[i] = label_images.flatten().astype(numpy.dtype('uint16')) hdf_cases[i] = i i += 1 bar.update(i) with progress_bar('online_data ', n_online) as bar: for c in cases_online:
def convert_dogs_vs_cats(directory, output_directory,
                         output_filename='dogs_vs_cats.hdf5'):
    """Converts the Dogs vs. Cats dataset to HDF5.

    Converts the Dogs vs. Cats dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.dogs_vs_cats`. The converted dataset is saved as
    'dogs_vs_cats.hdf5'.

    It assumes the existence of the following files:

    * `dogs_vs_cats.train.zip`
    * `dogs_vs_cats.test1.zip`

    Images are stored flattened in a variable-length `image_features`
    dataset, with their (channel, height, width) shapes kept in an
    attached `shapes` dimension scale. The output file is closed even if
    the conversion fails part-way through.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'dogs_vs_cats.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    # Number of examples per split; the HDF5 datasets are sized up front.
    train_size, test_size = 25000, 12500
    total_size = train_size + test_size

    # Prepare output file
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    try:
        dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
        hdf_features = h5file.create_dataset('image_features', (total_size,),
                                             dtype=dtype)
        hdf_shapes = h5file.create_dataset('image_features_shapes',
                                           (total_size, 3), dtype='int32')
        hdf_labels = h5file.create_dataset('targets', (total_size, 1),
                                           dtype='uint8')

        # Attach shape annotations and scales
        hdf_features.dims.create_scale(hdf_shapes, 'shapes')
        hdf_features.dims[0].attach_scale(hdf_shapes)

        hdf_shapes_labels = h5file.create_dataset(
            'image_features_shapes_labels', (3,), dtype='S7')
        hdf_shapes_labels[...] = ['channel'.encode('utf8'),
                                  'height'.encode('utf8'),
                                  'width'.encode('utf8')]
        hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
        hdf_features.dims[0].attach_scale(hdf_shapes_labels)

        # Add axis annotations
        hdf_features.dims[0].label = 'batch'
        hdf_labels.dims[0].label = 'batch'
        hdf_labels.dims[1].label = 'index'

        # Convert; `i` is the running row index across both splits.
        i = 0
        for split, split_size in zip([TRAIN, TEST], [train_size, test_size]):
            filename = os.path.join(directory, split)
            # The context manager closes the archive once the split is done
            # (or if reading an image raises).
            with zipfile.ZipFile(filename, 'r') as zip_file:
                # Discard the directory name
                image_names = zip_file.namelist()[1:]

                # Shuffle the examples (same fixed seed for every split)
                rng = numpy.random.RandomState(123522)
                rng.shuffle(image_names)

                # Convert from JPEG to NumPy arrays
                with progress_bar(filename, split_size) as bar:
                    for image_name in image_names:
                        # (height, width, channel) -> (channel, height, width)
                        image = numpy.array(
                            Image.open(zip_file.open(image_name)))
                        image = image.transpose(2, 0, 1)
                        hdf_features[i] = image.flatten()
                        hdf_shapes[i] = image.shape

                        # Cats are 0, Dogs are 1. Any name without 'cat'
                        # in it is labelled 1.
                        hdf_labels[i] = 0 if 'cat' in image_name else 1

                        # Progress is reported relative to the current split
                        i += 1
                        bar.update(i if split == TRAIN else i - train_size)

        # Add the labels
        split_dict = {}
        sources = ['image_features', 'targets']
        for name, slice_ in zip(['train', 'test'],
                                [(0, train_size), (train_size, total_size)]):
            split_dict[name] = dict(zip(sources, [slice_] * len(sources)))
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        h5file.flush()
    finally:
        h5file.close()

    return (output_path,)
output_path = basepath + 'data_tiff.hdf5' h5file = h5py.File(output_path, mode='w') hdf_features = h5file.create_dataset('features', (nb_test_examples + nb_training_examples, nb_channels, length, width), dtype='int32') hdf_labels = h5file.create_dataset('labels', (nb_test_examples + nb_training_examples, nb_labels), dtype='int8') hdf_names = h5file.create_dataset('image_name', (nb_test_examples + nb_training_examples,), dtype='S20') hdf_features.dims[0].label = 'batch' hdf_features.dims[2].label = 'height' hdf_features.dims[3].label = 'width' hdf_features.dims[1].label = 'channels' hdf_labels.dims[0].label = 'batch' hdf_labels.dims[1].label = 'labels' maximums = np.zeros(4) with progress_bar('train', nb_training_examples) as bar: with open(basepath + 'train_v2.csv', 'rb') as csvfile: spamreader = csv.reader(csvfile) count = 0 for row in spamreader: if row[0].startswith('train_'): train_idx = int(row[0].split('_')[-1]) p = glob.glob(basepath + 'train-tif-v2/train_{0}.tif'.format(train_idx)) hdf_names[count] = 'train_{0}.tif'.format(train_idx) # tiff image img = io.imread(p[0]) rescaleIMG = np.reshape(img[:,:,-1], (-1, 1)) rescaleIMG = scaler.fit_transform(rescaleIMG.astype(np.float32)) img_scaled = (np.reshape(rescaleIMG, img[:,:,-1].shape)).astype(np.uint8) #img = np.moveaxis(img_scaled[:,:,:nb_channels], 2, 0)[np.asarray([2,1,0,3])] # move channel axis + rgb
hdf_features_nocar_int.dims[0].label = 'batch' hdf_features_nocar_int.dims[1].label = 'features_nocar_int' hdf_labels.dims[0].label = 'batch' hdf_labels.dims[1].label = 'labels' hdf_cp.dims[0].label = 'batch' hdf_cp.dims[1].label = 'codepostal' hdf_hascar.dims[0].label = 'batch' hdf_hascar.dims[1].label = 'hascar' missing_codepostaux = [] missing_departements = [] for set_label, data in [('train', data_train), ('submit', data_submit)]: start_i = 0 if set_label == 'train' else len(data_train) with progress_bar(set_label, len(data)) as bar: for i, row in data.iterrows(): # does the dude have a car ? has_car = row['marque'] != 'NR' hdf_hascar[start_i + i] = 1 if has_car else 0 # categorical features feature_onehot_car_cat = numpy.zeros(total_uniques_car) feature_onehot_nocar_cat = numpy.zeros(total_uniques_nocar) for column_name in list_categorical: try: if column_name in list_car: feature_onehot_car_cat[ uniques_startindex_car[column_name] + unique_to_index[column_name][row[column_name]]] = 1 else:
hdf_features.dims[0].label = 'batch' hdf_features.dims[1].label = 'channel' hdf_features.dims[2].label = 'height' hdf_features.dims[3].label = 'width' hdf_labels.dims[0].label = 'batch' hdf_labels.dims[1].label = 'channel' hdf_labels.dims[2].label = 'height' hdf_labels.dims[3].label = 'width' # Create matching for creating valid and train set # range for selecting upper body joints r = range(0, 8) + range(14, 20) + [26] # build hdf5 train and submit with progress_bar('train', n_examples_train) as bar: for j in range(n_examples_train): assert (re.findall(r'\d+', features_train_files[j]) == re.findall( r'\d+', labels_train_files[j])) hdf_features[j] = np.rollaxis(get_im_cv2(features_train_files[j]), 2) hdf_labels[j] = np.rollaxis(loadmat(labels_train_files[j])['map'], 2)[r] bar.update(j) with progress_bar('valid', n_examples_valid) as bar: for j in range(n_examples_valid): assert (re.findall(r'\d+', features_val_files[j]) == re.findall( r'\d+', labels_val_files[j])) hdf_features[n_examples_train + j] = np.rollaxis( get_im_cv2(features_val_files[j]), 2) hdf_labels[n_examples_train + j] = np.rollaxis(
def convert_captcha(directory, output_directory,
                    output_filename='captcha.hdf5'):
    """Converts captcha dataset to HDF5.

    Converts captcha to an HDF5 dataset compatible with
    :class:`fuel.datasets.captcha`. The converted dataset is
    saved as 'captcha.hdf5'.

    The list of examples is obtained by calling `get_example_list` on
    `<directory>/captcha/ans.txt`; it is consumed as a shuffleable sequence
    of `(tag, image_path)` pairs. All examples go into a single split
    named 'all'. The output file is closed even if the conversion fails
    part-way through.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'captcha.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    # Prepare input
    all_example_paths = get_example_list(
        os.path.join(directory, 'captcha', 'ans.txt'))
    split = "all"
    split_size = len(all_example_paths)

    # Prepare output file
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    try:
        dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
        hdf_features = h5file.create_dataset('image_features', (split_size,),
                                             dtype=dtype)
        hdf_shapes = h5file.create_dataset('image_features_shapes',
                                           (split_size, 3), dtype='int32')
        hdf_targets = h5file.create_dataset(
            'targets', (split_size,), dtype=h5py.special_dtype(vlen=bytes))
        hdf_targets_shapes = h5file.create_dataset(
            'hdf_targets_shapes', (split_size, 1), dtype='int32')

        # Attach shape annotations and scales
        hdf_features.dims.create_scale(hdf_shapes, 'shapes')
        hdf_features.dims[0].attach_scale(hdf_shapes)

        hdf_shapes_labels = h5file.create_dataset(
            'image_features_shapes_labels', (3,), dtype='S7')
        hdf_shapes_labels[...] = ['channel'.encode('utf8'),
                                  'height'.encode('utf8'),
                                  'width'.encode('utf8')]
        hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
        hdf_features.dims[0].attach_scale(hdf_shapes_labels)

        hdf_targets.dims.create_scale(hdf_targets_shapes, 'targets_shapes')
        hdf_targets.dims[0].attach_scale(hdf_targets_shapes)

        # Add axis annotations
        hdf_features.dims[0].label = 'batch'
        hdf_targets.dims[0].label = 'batch'

        # Shuffle the examples
        rng = numpy.random.RandomState(123522)
        rng.shuffle(all_example_paths)

        # Convert images to NumPy arrays
        with progress_bar(split, split_size) as bar:
            i = 0
            for tag, image_path in all_example_paths:
                # Save image
                image = numpy.array(Image.open(image_path))
                # (height, width, channel) to (channel, height, width);
                # this requires a 3-D array, so a 2-D image raises here.
                image = image.transpose(2, 0, 1)
                # Diagnostic output, written as calls so the module parses
                # on both Python 2 and Python 3.
                print(image.shape)
                hdf_features[i] = image.flatten()
                hdf_shapes[i] = image.shape
                print(image.shape)

                # Store the target text and its length
                textline = tag
                print(textline)
                hdf_targets[i] = textline
                hdf_targets_shapes[i] = len(textline)

                # Update progress
                i += 1
                bar.update(i)

        # Add the labels
        split_dict = {}
        sources = ['image_features', 'targets']
        split_dict['all'] = dict(zip(sources, [(0, split_size)] * 2))
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        h5file.flush()
    finally:
        h5file.close()

    return (output_path,)
def convert_jpgtgz(target, onlytarget, directory, output_directory,
                   output_filename=None):
    """Converts jpg tar.gz dataset to HDF5.

    Converts a jpg tar.gz dataset to an HDF5 dataset. The input of each
    split ('train', 'test') is looked up under `directory` using the
    module-level `FORMAT_1_FILES` names and may be either a gzipped tar
    archive or a directory; targets are read from the CSV files named by
    `TARGET_FILES`.

    Images whose shape differs from the first image seen in their split,
    and images whose pixel data duplicates an earlier image (in any
    split), are deleted from disk and left out of the dataset.

    Parameters
    ----------
    target : bool
        Also add a `targets` source to the file. The targets are read
        from the train/test target CSV files. Each image receives two
        values: [0] is the target value and [1] is a mask bit saying
        whether a target is defined for the image at all.
    onlytarget : bool
        Same as `target`, but images that do not have a target value
        defined are left out, and no mask is stored.
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'jpg.hdf5'

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    dotarget = target | onlytarget
    # if onlytarget then there is just one value otherwise
    # this will have two values 0-target 1-mask saying if the target
    # should be used
    target_dim = 1 if onlytarget else 2

    if not output_filename:
        output_filename = 'jpg.hdf5'
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    # Bound before the `try` so the cleanup in `finally` never hits an
    # unbound name if creating the temporary directory itself fails.
    TMPDIR = None
    try:
        TMPDIR = tempfile.mkdtemp()

        sources = ('features', 'targets') if dotarget else ('features',)
        source_dtypes = dict([(source, 'uint8') for source in sources])
        # NOTE(review): images are written below exactly as PIL returns
        # them, without a transpose, so these labels may not match the
        # stored axis order -- confirm against the consumer of the file.
        source_axis_labels = {
            'features': ('channel', 'height', 'width'),
            'targets': ('index',),
        }

        splits = ('train', 'test')
        file_paths = dict(
            (split, os.path.join(directory, path))
            for split, path in zip(splits, FORMAT_1_FILES))
        target_paths = dict(
            (split, os.path.join(directory, path))
            for split, path in zip(splits, TARGET_FILES))

        # A split whose target file cannot be read simply has no targets.
        split_targets = {}
        for split in splits:
            try:
                targets = pd.read_csv(target_paths[split], index_col='name')
            except Exception:
                targets = None
            split_targets[split] = targets

        def get_target(image_path, split):
            """Return `(target, mask)`; `(0, 0)` if no target is known."""
            try:
                root_basename = os.path.splitext(
                    os.path.basename(image_path))[0]
                target = split_targets[split].loc[root_basename].target
                mask = 1
            except Exception:
                target = 0
                mask = 0
            return target, mask

        # We first extract the data files in a temporary directory. While
        # doing that, we also count the number of examples for each split.
        # Files are extracted individually, which allows to display a
        # progress bar. Since the splits will be concatenated in the HDF5
        # file, we also compute the start and stop intervals of each split
        # within the concatenated array.
        checksums = set([])

        def extract_tar(split):
            """Collect the usable image paths of `split`.

            Returns `(paths, shape)` where `shape` is the shape of the
            first readable image, or `([], None)` if the input is missing.
            """
            num_examples = 0
            path = file_paths[split]
            if os.path.isfile(path):
                with tarfile.open(path, 'r:gz') as f:
                    members = f.getmembers()
                    progress_bar_context = progress_bar(
                        name='{} file'.format(split), maxval=len(members),
                        prefix='Extracting')
                    with progress_bar_context as bar:
                        for i, member in enumerate(members):
                            if (member.name.endswith('.jpg') and
                                    not os.path.basename(
                                        member.name).startswith('.')):
                                # NOTE(review): member names are not checked
                                # for '..' or absolute paths; only use
                                # trusted archives.
                                f.extract(member,
                                          path=os.path.join(TMPDIR, split))
                                num_examples += 1
                            bar.update(i)
                DIR = TMPDIR
            elif os.path.isdir(path):
                print("reading DIRECTORY %s"%path)
                DIR = path
                for root, dirs, files in os.walk(path):
                    for name in files:
                        if name.endswith('.jpg') and not name.startswith('.'):
                            num_examples += 1
            else:
                print("No file or directory named %s"%path)
                return [], None
            print('#files=%d'%num_examples)

            jpgfiles = []
            progress_bar_context = progress_bar(
                name='{} file'.format(split), maxval=num_examples,
                prefix='Validating')
            num_examples = 0
            bad_examples = 0
            duplicate_examples = 0
            count = 0
            errors = 0
            shape = None  # all images must have the same shape
            with progress_bar_context as bar:
                # NOTE(review): in directory mode this walks
                # `<path>/<split>` while the count above walked `<path>`,
                # and rejected files are removed from the input directory
                # itself -- confirm both are intended.
                for root, dirs, files in os.walk(os.path.join(DIR, split)):
                    for name in files:
                        if not name.endswith('.jpg') or name.startswith('.'):
                            continue
                        image_path = os.path.join(root, name)
                        count += 1
                        try:
                            im = numpy.asarray(Image.open(image_path))
                            # The checksum is taken over the decoded pixels.
                            m = hashlib.md5()
                            m.update(im)
                            h = m.hexdigest()
                            if shape is None:
                                shape = im.shape
                            if im.shape != shape:
                                bad_examples += 1
                                os.remove(image_path)
                            elif h in checksums:
                                duplicate_examples += 1
                                os.remove(image_path)
                            else:
                                checksums.add(h)
                                num_examples += 1
                                jpgfiles.append(image_path)
                        except Exception:
                            # Unreadable images are counted and skipped.
                            errors += 1
                        bar.update(count)
            print('count=%d bad=%d dup=%d good=%d errors=%d'%(
                count, bad_examples, duplicate_examples, num_examples, errors))
            if onlytarget:
                # Keep only images with a defined target. Each path is
                # tested individually (previously the last path visited
                # decided the outcome for every image).
                jpgfiles = [p for p in jpgfiles if get_target(p, split)[1]]
            return jpgfiles, shape

        examples_per_split = OrderedDict(
            [(split, extract_tar(split)) for split in splits])
        cumulative_num_examples = numpy.cumsum(
            [0] + [len(found[0]) for found in examples_per_split.values()])
        num_examples = cumulative_num_examples[-1]
        intervals = zip(cumulative_num_examples[:-1],
                        cumulative_num_examples[1:])
        split_intervals = dict(zip(splits, intervals))

        # The start and stop indices are used to create a split dict that
        # will be parsed into the split array required by the H5PYDataset
        # interface. The split dict is organized as follows:
        #
        #     dict(split -> dict(source -> (start, stop)))
        #
        split_dict = OrderedDict([
            (split, OrderedDict([(s, split_intervals[split])
                                 for s in sources]))
            for split in splits])
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        # We then prepare the HDF5 datasets, one per source, with the
        # example axis first followed by the per-example shape.
        def make_vlen_dataset(source, shape):
            """Create the dataset for `source` and label its axes."""
            dtype = source_dtypes[source]
            shape = (num_examples,)+shape
            print("creating %s %s %s"%(source,str(shape),str(dtype)))
            dataset = h5file.create_dataset(
                source, shape, dtype=dtype)
            # Tag fixed-length axis with its label
            dataset.dims[0].label = 'batch'
            for i, label in enumerate(source_axis_labels[source]):
                dataset.dims[i+1].label = label

        # Materialised as a list so it can be measured and indexed on
        # Python 3 as well; splits without images (shape None) are ignored.
        shapes = [found[1] for found in examples_per_split.values()
                  if found[1]]
        assert len(set(shapes)) == 1, "splits have different image size %s"%shapes
        print('Images shape %s'%str(shapes[0]))
        source_shape = {'features': shapes[0], 'targets': (target_dim,)}
        for source in sources:
            make_vlen_dataset(source, source_shape[source])

        # The final step is to fill the HDF5 file.
        def fill_split(split, bar=None):
            """Write the images (and targets) of `split` into the file."""
            print(split)
            image_count = target_count = 0
            start = split_intervals[split][0]
            for image_number, image_path in enumerate(
                    examples_per_split[split][0]):
                image = numpy.asarray(Image.open(image_path))
                index = image_number + start

                h5file['features'][index] = image
                image_count += 1
                target, mask = get_target(image_path, split)
                if dotarget:
                    if onlytarget:
                        h5file['targets'][index] = numpy.array([target,])
                    else:
                        h5file['targets'][index] = numpy.array([target,mask])
                target_count += mask

                if image_number % 1000 == 0:
                    h5file.flush()
                if bar:
                    bar.update(index)
            print('# targets %d out of %d'%(target_count, image_count))

        with progress_bar('jpgtgz', num_examples) as bar:
            for split in splits:
                fill_split(split, bar=bar)
    finally:
        if TMPDIR is not None and os.path.isdir(TMPDIR):
            shutil.rmtree(TMPDIR)
        h5file.flush()
        h5file.close()

    return (output_path,)
def extract_tar(split):
    """Collect the usable JPEG image paths of one split.

    The input for `split` is `file_paths[split]`. A gzipped tar archive
    has its `.jpg` members (hidden files excluded) extracted under
    `TMPDIR/<split>`; a directory is used in place. The images are then
    validated: an image is dropped (and its file removed) when its shape
    differs from the first readable image of the split, or when the MD5
    of its pixel data is already in `checksums`. Unreadable images are
    counted as errors and skipped.

    Relies on `file_paths`, `TMPDIR`, `checksums`, `onlytarget` and
    `get_target` being defined in the surrounding scope.

    Parameters
    ----------
    split : str
        Name of the split to process; used as a key of `file_paths` and
        as a sub-directory name.

    Returns
    -------
    jpgfiles : list of str
        Paths of the accepted images. When `onlytarget` is set, only
        images for which `get_target` reports a defined target.
    shape : tuple or None
        Shape of the first readable image, `None` if there was none or
        the input path does not exist.

    """
    num_examples = 0
    path = file_paths[split]
    if os.path.isfile(path):
        with tarfile.open(path, 'r:gz') as f:
            members = f.getmembers()
            progress_bar_context = progress_bar(
                name='{} file'.format(split), maxval=len(members),
                prefix='Extracting')
            with progress_bar_context as bar:
                for i, member in enumerate(members):
                    if (member.name.endswith('.jpg') and
                            not os.path.basename(
                                member.name).startswith('.')):
                        # NOTE(review): member names are not checked for
                        # '..' or absolute paths; only use trusted archives.
                        f.extract(member, path=os.path.join(TMPDIR, split))
                        num_examples += 1
                    bar.update(i)
        DIR = TMPDIR
    elif os.path.isdir(path):
        print("reading DIRECTORY %s"%path)
        DIR = path
        for root, dirs, files in os.walk(path):
            for name in files:
                if name.endswith('.jpg') and not name.startswith('.'):
                    num_examples += 1
    else:
        print("No file or directory named %s"%path)
        return [], None
    print('#files=%d'%num_examples)

    jpgfiles = []
    progress_bar_context = progress_bar(
        name='{} file'.format(split), maxval=num_examples,
        prefix='Validating')
    num_examples = 0
    bad_examples = 0
    duplicate_examples = 0
    count = 0
    errors = 0
    shape = None  # all images must have the same shape
    with progress_bar_context as bar:
        # NOTE(review): in directory mode this walks `<path>/<split>` while
        # the count above walked `<path>`, and rejected files are removed
        # from the input directory itself -- confirm both are intended.
        for root, dirs, files in os.walk(os.path.join(DIR, split)):
            for name in files:
                if not name.endswith('.jpg') or name.startswith('.'):
                    continue
                image_path = os.path.join(root, name)
                count += 1
                try:
                    im = numpy.asarray(Image.open(image_path))
                    # The checksum is taken over the decoded pixels.
                    m = hashlib.md5()
                    m.update(im)
                    h = m.hexdigest()
                    if shape is None:
                        shape = im.shape
                    if im.shape != shape:
                        bad_examples += 1
                        os.remove(image_path)
                    elif h in checksums:
                        duplicate_examples += 1
                        os.remove(image_path)
                    else:
                        checksums.add(h)
                        num_examples += 1
                        jpgfiles.append(image_path)
                except Exception:
                    # Unreadable images are counted and skipped.
                    errors += 1
                bar.update(count)
    print('count=%d bad=%d dup=%d good=%d errors=%d'%(
        count, bad_examples, duplicate_examples, num_examples, errors))
    if onlytarget:
        # Keep only images with a defined target. Each path is tested
        # individually (previously the last path visited decided the
        # outcome for every image).
        jpgfiles = [p for p in jpgfiles if get_target(p, split)[1]]
    return jpgfiles, shape
def convert_iam_ondb(directory, output_directory,
                     output_filename='iam_ondb.hdf5'):
    """Converts iam_ondb dataset to HDF5.

    Converts iam_ondb to an HDF5 dataset compatible with
    :class:`fuel.datasets.iam_ondb`. The converted dataset is
    saved as 'iam_ondb.hdf5'.

    It assumes the existence of the directory:
        ./iam_ondb/lineImages
        ./iam_ondb/ascii-all

    The image paths returned by `get_path_list(directory)` are divided
    into a 'train' split (the first three quarters) and a 'test' split
    (the remainder); each split is shuffled with a fixed seed. Only 2-D
    (single channel) images are stored. The output file is closed even if
    the conversion fails part-way through.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'iam_ondb.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    # Prepare input
    all_image_paths = get_path_list(directory)
    total_num = len(all_image_paths)
    test_num = int(total_num / 4.0)
    train_num = total_num - test_num

    # Prepare output file
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    try:
        dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
        hdf_features = h5file.create_dataset('image_features', (total_num,),
                                             dtype=dtype)
        hdf_shapes = h5file.create_dataset('image_features_shapes',
                                           (total_num, 3), dtype='int32')
        hdf_labels = h5file.create_dataset(
            'targets', (total_num,), dtype=h5py.special_dtype(vlen=bytes))
        hdf_labels_shapes = h5file.create_dataset(
            'hdf_labels_shapes', (total_num, 1), dtype='int32')

        # Attach shape annotations and scales
        hdf_features.dims.create_scale(hdf_shapes, 'shapes')
        hdf_features.dims[0].attach_scale(hdf_shapes)

        hdf_shapes_labels = h5file.create_dataset(
            'image_features_shapes_labels', (3,), dtype='S7')
        hdf_shapes_labels[...] = ['channel'.encode('utf8'),
                                  'height'.encode('utf8'),
                                  'width'.encode('utf8')]
        hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
        hdf_features.dims[0].attach_scale(hdf_shapes_labels)

        # The scale name below is kept exactly as it has always been
        # written, so files stay compatible with existing readers.
        hdf_labels.dims.create_scale(hdf_labels_shapes, 'lables_shapes')
        hdf_labels.dims[0].attach_scale(hdf_labels_shapes)

        # Add axis annotations
        hdf_features.dims[0].label = 'batch'
        hdf_labels.dims[0].label = 'batch'

        # Convert; `i` is the running row index across both splits.
        i = 0
        for split, split_size in zip(["train", "test"],
                                     [train_num, test_num]):
            # Shuffle the examples (same fixed seed for every split)
            rng = numpy.random.RandomState(123522)
            if split == "train":
                image_paths = all_image_paths[0:train_num]
            else:
                image_paths = all_image_paths[train_num:]
            rng.shuffle(image_paths)

            # Convert images to NumPy arrays
            with progress_bar(split, split_size) as bar:
                for image_path in image_paths:
                    # Save image
                    image = numpy.array(Image.open(image_path))
                    # Diagnostic output, written as calls so the module
                    # parses on both Python 2 and Python 3.
                    print(image.shape)
                    # NOTE(review): skipped images do not advance `i`, so
                    # every later example moves down one row while the
                    # split boundaries below stay at `train_num`; the last
                    # rows of the datasets are then never written.
                    if image.ndim != 2:
                        continue
                    # Add a channels axis
                    image = image[numpy.newaxis, :, :]
                    hdf_features[i] = image.flatten()
                    hdf_shapes[i] = image.shape
                    print(image.shape)

                    # Store the target text and its length
                    textline = get_target(image_path)
                    print(textline)
                    hdf_labels[i] = textline
                    hdf_labels_shapes[i] = len(textline)

                    # Progress is reported relative to the current split
                    i += 1
                    bar.update(i if split == "train" else i - train_num)

        # Add the labels
        split_dict = {}
        sources = ['image_features', 'targets']
        split_dict['train'] = dict(zip(sources, [(0, train_num)] * 2))
        split_dict['test'] = dict(zip(sources, [(train_num, total_num)] * 2))
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        h5file.flush()
    finally:
        h5file.close()

    return (output_path,)
def convert_iam_ondb(directory, output_directory,
                     output_filename='iam_ondb.hdf5'):
    """Converts iam_ondb dataset to HDF5.

    Converts iam_ondb to an HDF5 dataset compatible with
    :class:`fuel.datasets.iam_ondb`. The converted dataset is
    saved as 'iam_ondb.hdf5'.

    It assumes the existence of the directory:
        ./iam_ondb/lineImages
        ./iam_ondb/ascii-all

    The image paths returned by `get_path_list(directory)` are divided
    into a 'train' split (the first three quarters) and a 'test' split
    (the remainder); each split is shuffled with a fixed seed. Only 2-D
    (single channel) images are stored. The output file is closed even if
    the conversion fails part-way through.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'iam_ondb.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    # Prepare input
    all_image_paths = get_path_list(directory)
    total_num = len(all_image_paths)
    test_num = int(total_num / 4.0)
    train_num = total_num - test_num

    # Prepare output file
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    try:
        dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
        hdf_features = h5file.create_dataset('image_features', (total_num,),
                                             dtype=dtype)
        hdf_shapes = h5file.create_dataset('image_features_shapes',
                                           (total_num, 3), dtype='int32')
        hdf_labels = h5file.create_dataset(
            'targets', (total_num,), dtype=h5py.special_dtype(vlen=bytes))
        hdf_labels_shapes = h5file.create_dataset(
            'hdf_labels_shapes', (total_num, 1), dtype='int32')

        # Attach shape annotations and scales
        hdf_features.dims.create_scale(hdf_shapes, 'shapes')
        hdf_features.dims[0].attach_scale(hdf_shapes)

        hdf_shapes_labels = h5file.create_dataset(
            'image_features_shapes_labels', (3,), dtype='S7')
        hdf_shapes_labels[...] = ['channel'.encode('utf8'),
                                  'height'.encode('utf8'),
                                  'width'.encode('utf8')]
        hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
        hdf_features.dims[0].attach_scale(hdf_shapes_labels)

        # The scale name below is kept exactly as it has always been
        # written, so files stay compatible with existing readers.
        hdf_labels.dims.create_scale(hdf_labels_shapes, 'lables_shapes')
        hdf_labels.dims[0].attach_scale(hdf_labels_shapes)

        # Add axis annotations
        hdf_features.dims[0].label = 'batch'
        hdf_labels.dims[0].label = 'batch'

        # Convert; `i` is the running row index across both splits.
        i = 0
        for split, split_size in zip(["train", "test"],
                                     [train_num, test_num]):
            # Shuffle the examples (same fixed seed for every split)
            rng = numpy.random.RandomState(123522)
            if split == "train":
                image_paths = all_image_paths[0:train_num]
            else:
                image_paths = all_image_paths[train_num:]
            rng.shuffle(image_paths)

            # Convert images to NumPy arrays
            with progress_bar(split, split_size) as bar:
                for image_path in image_paths:
                    # Save image
                    image = numpy.array(Image.open(image_path))
                    # Diagnostic output, written as calls so the module
                    # parses on both Python 2 and Python 3.
                    print(image.shape)
                    # NOTE(review): skipped images do not advance `i`, so
                    # every later example moves down one row while the
                    # split boundaries below stay at `train_num`; the last
                    # rows of the datasets are then never written.
                    if image.ndim != 2:
                        continue
                    # Add a channels axis
                    image = image[numpy.newaxis, :, :]
                    hdf_features[i] = image.flatten()
                    hdf_shapes[i] = image.shape
                    print(image.shape)

                    # Store the target text and its length
                    textline = get_target(image_path)
                    print(textline)
                    hdf_labels[i] = textline
                    hdf_labels_shapes[i] = len(textline)

                    # Progress is reported relative to the current split
                    i += 1
                    bar.update(i if split == "train" else i - train_num)

        # Add the labels
        split_dict = {}
        sources = ['image_features', 'targets']
        split_dict['train'] = dict(zip(sources, [(0, train_num)] * 2))
        split_dict['test'] = dict(zip(sources, [(train_num, total_num)] * 2))
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        h5file.flush()
    finally:
        h5file.close()

    return (output_path,)