import os

import h5py
import numpy

# ``iters``, ``xglob``, ``hdf5.serializers``, ``trace_logger``,
# ``pathHelpers``, ``libtiff``, and the ``get_*_tiff_*`` helpers are
# project-local imports assumed to be available in this module.
# NOTE: this module targets Python 2 (``unicode`` and ``xrange`` are
# builtins here).


def convert_tiffs(new_tiff_filenames,
                  new_hdf5_pathname,
                  axis=0,
                  channel=0,
                  z_index=0,
                  pages_to_channel=1,
                  memmap=False):
    """
        Convert a stack of tiffs to an HDF5 file.

        Args:
            new_tiff_filenames(list or str):    takes a str for a single file
                                                or a list of strs for
                                                filenames to combine (allows
                                                regex).

            new_hdf5_pathname(str):             the HDF5 file and location to
                                                store the dataset.

            axis(int):                          which axis to concatenate
                                                along.

            channel(int):                       which channel to select for
                                                the HDF5 (can only keep one).

            z_index(int):                       which z value to take (the
                                                algorithm is not set up for
                                                3D data yet).

            pages_to_channel(int):              if channels are not normally
                                                stored in the channel
                                                variable, but are stored as
                                                pages, then this will split
                                                neighboring pages into
                                                separate channels.

            memmap(bool):                       allows one to load the array
                                                using a memory-mapped file as
                                                opposed to reading it
                                                directly (defaults to False).
    """

    assert (pages_to_channel > 0)

    # Get the axes that do not change
    static_axes = numpy.array(list(iters.xrange_with_skip(3, to_skip=axis)))

    # if it is only a single str, make it a singleton list
    if isinstance(new_tiff_filenames, (bytes, unicode)):
        new_tiff_filenames = [new_tiff_filenames]

    # Expand any regex in path names
    new_tiff_filenames = xglob.expand_pathname_list(*new_tiff_filenames)

    # Extract the offsets and descriptions for storage.
    new_hdf5_dataset_filenames = list()
    new_hdf5_dataset_offsets = list()

    # Determine the shape and dtype to use for the dataset (so that
    # everything will fit).
    new_hdf5_dataset_shape = numpy.zeros((3,), dtype=int)
    new_hdf5_dataset_dtype = bool
    for each_new_tiff_filename in new_tiff_filenames:
        # Add each filename.
        new_hdf5_dataset_filenames.append(unicode(each_new_tiff_filename))

        # Get all of the offsets.
        new_hdf5_dataset_offsets.append(new_hdf5_dataset_shape[axis])

        # Get the shape and type of each frame.
        each_new_tiff_file_shape, each_new_tiff_file_dtype = list(
            get_multipage_tiff_shape_dtype_transformed(
                each_new_tiff_filename,
                axis_order="cztyx",
                pages_to_channel=pages_to_channel
            ).values()
        )
        each_new_tiff_file_shape = each_new_tiff_file_shape[2:]

        # Find the increase on the merge axis. Find the largest shape for
        # the rest.
        each_new_tiff_file_shape = numpy.array(each_new_tiff_file_shape)
        new_hdf5_dataset_shape[axis] += each_new_tiff_file_shape[axis]
        new_hdf5_dataset_shape[static_axes] = numpy.array([
            new_hdf5_dataset_shape[static_axes],
            each_new_tiff_file_shape[static_axes]
        ]).max(axis=0)

        # Find the best type that everything can be cast to without loss
        # of precision.
        if not numpy.can_cast(each_new_tiff_file_dtype,
                              new_hdf5_dataset_dtype):
            if numpy.can_cast(new_hdf5_dataset_dtype,
                              each_new_tiff_file_dtype):
                new_hdf5_dataset_dtype = each_new_tiff_file_dtype
            else:
                raise Exception(
                    "Cannot find safe conversion between" +
                    " new_hdf5_dataset_dtype = " +
                    repr(new_hdf5_dataset_dtype) +
                    " and each_new_tiff_file_dtype = " +
                    repr(each_new_tiff_file_dtype) + "."
                )

    # Convert to arrays.
    new_hdf5_dataset_filenames = numpy.array(new_hdf5_dataset_filenames)
    new_hdf5_dataset_offsets = numpy.array(new_hdf5_dataset_offsets)

    # Convert to standard forms.
    new_hdf5_dataset_shape = tuple(new_hdf5_dataset_shape)
    new_hdf5_dataset_dtype = numpy.dtype(new_hdf5_dataset_dtype)

    # Get all the needed locations for the HDF5 file and dataset.
    new_hdf5_filename, new_hdf5_dataset_name = \
        hdf5.serializers.split_hdf5_path(new_hdf5_pathname)
    new_hdf5_groupname = os.path.dirname(new_hdf5_dataset_name)

    # Dump all datasets to the file.
    with h5py.File(new_hdf5_filename, "a") as new_hdf5_file:
        new_hdf5_group = new_hdf5_file.require_group(new_hdf5_groupname)

        new_hdf5_dataset = new_hdf5_group.create_dataset(
            new_hdf5_dataset_name,
            new_hdf5_dataset_shape,
            new_hdf5_dataset_dtype,
            chunks=True
        )

        new_hdf5_dataset.attrs.create(
            "filenames",
            new_hdf5_dataset_filenames,
            shape=new_hdf5_dataset_filenames.shape,
            dtype=h5py.special_dtype(vlen=unicode)
        )
        new_hdf5_dataset.attrs["offsets"] = new_hdf5_dataset_offsets

        # Workaround required due to this issue
        # ( https://github.com/h5py/h5py/issues/289 ).
        new_hdf5_descriptions_dataset = new_hdf5_group.create_dataset(
            "_".join([new_hdf5_dataset_name, "descriptions"]),
            shape=new_hdf5_dataset_shape[0:1],
            dtype=h5py.special_dtype(vlen=unicode)
        )
        new_hdf5_dataset.attrs["descriptions"] = (
            new_hdf5_descriptions_dataset.file.filename +
            new_hdf5_descriptions_dataset.name
        )

        new_hdf5_dataset_axis_pos = 0
        for each_new_tiff_filename in new_tiff_filenames:
            # Log the filename in case something goes wrong.
            trace_logger.info(
                "Now appending TIFF: \"" +
                str(each_new_tiff_filename) + "\""
            )

            # Read the data in the format specified.
            each_new_tiff_array, each_new_tiff_description = \
                get_standard_tiff_data(
                    each_new_tiff_filename,
                    axis_order="cztyx",
                    pages_to_channel=pages_to_channel,
                    memmap=memmap
                )

            # Take channel and z selection.
            # TODO: Could we drop the channel constraint?
            # TODO: Want to drop z constraint.
            each_new_tiff_array = each_new_tiff_array[channel, z_index]
            each_new_tiff_description = each_new_tiff_description[channel]

            # Store into the current slice and go to the next one.
            new_hdf5_dataset_axis_pos_next = \
                new_hdf5_dataset_axis_pos + len(each_new_tiff_array)
            new_hdf5_dataset[
                new_hdf5_dataset_axis_pos:new_hdf5_dataset_axis_pos_next
            ] = each_new_tiff_array
            new_hdf5_descriptions_dataset[
                new_hdf5_dataset_axis_pos:new_hdf5_dataset_axis_pos_next
            ] = each_new_tiff_description

            new_hdf5_dataset_axis_pos = new_hdf5_dataset_axis_pos_next
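# A minimal usage sketch for the ``convert_tiffs`` definition above. The TIFF
# filenames and the HDF5 pathname are hypothetical; ``new_hdf5_pathname``
# packs the HDF5 file path and the internal dataset path into one string,
# which ``hdf5.serializers.split_hdf5_path`` separates again inside the
# function.
def _example_convert():
    convert_tiffs(
        ["images_0.tif", "images_1.tif"],  # combined along ``axis``
        "data.h5/images/raw",              # file ``data.h5``, dataset ``/images/raw``
        axis=0,                            # concatenate along the first axis
        channel=0,                         # keep a single channel
        z_index=0,                         # keep a single z-slice
        memmap=True                        # read TIFFs via memory mapping
    )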
# A simpler variant of ``convert_tiffs``: it resolves the HDF5 location with
# ``pathHelpers.PathComponents`` and stores only the pixel data (no
# filenames, offsets, or descriptions).
def convert_tiffs(new_tiff_filenames,
                  new_hdf5_pathname,
                  axis=0,
                  channel=0,
                  z_index=0,
                  pages_to_channel=1):
    """
        Convert a stack of tiffs to an HDF5 file.

        Args:
            new_tiff_filenames(list or str):    takes a str for a single file
                                                or a list of strs for
                                                filenames to combine (allows
                                                regex).

            new_hdf5_pathname(str):             the HDF5 file and location to
                                                store the dataset.

            axis(int):                          which axis to concatenate
                                                along.

            channel(int):                       which channel to select for
                                                the HDF5 (can only keep one).

            z_index(int):                       which z value to take (the
                                                algorithm is not set up for
                                                3D data yet).

            pages_to_channel(int):              if channels are not normally
                                                stored in the channel
                                                variable, but are stored as
                                                pages, then this will split
                                                neighboring pages into
                                                separate channels.
    """

    assert (pages_to_channel > 0)

    # Get the axes that do not change
    static_axes = numpy.array(list(iters.xrange_with_skip(3, to_skip=axis)))

    # if it is only a single str, make it a singleton list
    if isinstance(new_tiff_filenames, str):
        new_tiff_filenames = [new_tiff_filenames]

    # Expand any regex in path names
    new_tiff_filenames = xglob.expand_pathname_list(*new_tiff_filenames)

    # Determine the shape and dtype to use for the dataset (so that
    # everything will fit).
    new_hdf5_dataset_shape = numpy.zeros((3,), dtype=int)
    new_hdf5_dataset_dtype = bool
    for each_new_tiff_filename in new_tiff_filenames:
        each_new_tiff_file_shape, each_new_tiff_file_dtype = \
            get_multipage_tiff_shape_dtype_transformed(
                each_new_tiff_filename,
                axis_order="cztyx",
                pages_to_channel=pages_to_channel
            ).values()
        each_new_tiff_file_shape = each_new_tiff_file_shape[2:]

        # Find the increase on the merge axis. Find the largest shape for
        # the rest.
        each_new_tiff_file_shape = numpy.array(each_new_tiff_file_shape)
        new_hdf5_dataset_shape[axis] += each_new_tiff_file_shape[axis]
        new_hdf5_dataset_shape[static_axes] = numpy.array([
            new_hdf5_dataset_shape[static_axes],
            each_new_tiff_file_shape[static_axes]
        ]).max(axis=0)

        # Find the best type that everything can be cast to without loss
        # of precision.
        if not numpy.can_cast(each_new_tiff_file_dtype,
                              new_hdf5_dataset_dtype):
            if numpy.can_cast(new_hdf5_dataset_dtype,
                              each_new_tiff_file_dtype):
                new_hdf5_dataset_dtype = each_new_tiff_file_dtype
            else:
                raise Exception(
                    "Cannot find safe conversion between" +
                    " new_hdf5_dataset_dtype = " +
                    repr(new_hdf5_dataset_dtype) +
                    " and each_new_tiff_file_dtype = " +
                    repr(each_new_tiff_file_dtype) + "."
                )

    # Convert to standard forms.
    new_hdf5_dataset_shape = tuple(new_hdf5_dataset_shape)
    new_hdf5_dataset_dtype = numpy.dtype(new_hdf5_dataset_dtype)

    # Get all the needed locations for the HDF5 file and dataset.
    new_hdf5_path_components = pathHelpers.PathComponents(new_hdf5_pathname)

    new_hdf5_filename = new_hdf5_path_components.externalPath
    new_hdf5_groupname = new_hdf5_path_components.internalDirectory
    new_hdf5_dataset_name = new_hdf5_path_components.internalPath

    # Dump all datasets to the file.
    with h5py.File(new_hdf5_filename, "a") as new_hdf5_file:
        if new_hdf5_groupname not in new_hdf5_file:
            new_hdf5_file.create_group(new_hdf5_groupname)

        new_hdf5_group = new_hdf5_file[new_hdf5_groupname]
        new_hdf5_dataset = new_hdf5_group.create_dataset(
            new_hdf5_dataset_name,
            new_hdf5_dataset_shape,
            new_hdf5_dataset_dtype,
            chunks=True
        )

        new_hdf5_dataset_axis_pos = 0
        for each_new_tiff_filename in new_tiff_filenames:
            # Read the data in the format specified.
            each_new_tiff_array = get_standard_tiff_array(
                each_new_tiff_filename,
                axis_order="cztyx",
                pages_to_channel=pages_to_channel
            )

            # Take channel and z selection.
            # TODO: Could we drop the channel constraint by saving
            #       different channels to different arrays? Need to think
            #       about it.
            # TODO: Want to drop z constraint, but need to consult with
            #       Ferran about algorithms that work on 3D for the end.
            each_new_tiff_array = each_new_tiff_array[channel, z_index]

            # Store into the current slice and go to the next one.
            new_hdf5_dataset_axis_pos_next = \
                new_hdf5_dataset_axis_pos + len(each_new_tiff_array)
            new_hdf5_dataset[
                new_hdf5_dataset_axis_pos:new_hdf5_dataset_axis_pos_next
            ] = each_new_tiff_array
            new_hdf5_dataset_axis_pos = new_hdf5_dataset_axis_pos_next
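# The dtype-selection loop above implements a small "common safe dtype"
# search: start from ``bool`` (the weakest type) and widen only when a file's
# dtype cannot be cast safely into the current choice. A standalone sketch of
# the same pattern (``find_common_dtype`` is illustrative, not part of this
# module):
def find_common_dtype(dtypes):
    # Start with the most restrictive type; every dtype can absorb bool.
    common = numpy.dtype(bool)
    for each in map(numpy.dtype, dtypes):
        if numpy.can_cast(each, common):
            continue           # ``common`` already holds ``each`` losslessly.
        elif numpy.can_cast(common, each):
            common = each      # Widen to the new type.
        else:
            raise Exception(
                "Cannot find safe conversion between %r and %r." %
                (common, each)
            )
    return common

# For example, ``find_common_dtype(["uint8", "uint16"])`` yields
# ``dtype('uint16')``, while mixing ``int64`` and ``float32`` raises, since
# neither can hold the other without loss.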
# A variant of ``convert_tiffs`` that records the source filenames and
# per-file offsets as attributes and pulls each page's ImageDescription
# directly from the TIFF via ``libtiff``.
def convert_tiffs(new_tiff_filenames,
                  new_hdf5_pathname,
                  axis=0,
                  channel=0,
                  z_index=0,
                  pages_to_channel=1):
    """
        Convert a stack of tiffs to an HDF5 file.

        Args:
            new_tiff_filenames(list or str):    takes a str for a single file
                                                or a list of strs for
                                                filenames to combine (allows
                                                regex).

            new_hdf5_pathname(str):             the HDF5 file and location to
                                                store the dataset.

            axis(int):                          which axis to concatenate
                                                along.

            channel(int):                       which channel to select for
                                                the HDF5 (can only keep one).

            z_index(int):                       which z value to take (the
                                                algorithm is not set up for
                                                3D data yet).

            pages_to_channel(int):              if channels are not normally
                                                stored in the channel
                                                variable, but are stored as
                                                pages, then this will split
                                                neighboring pages into
                                                separate channels.
    """

    assert (pages_to_channel > 0)

    # Get the axes that do not change
    static_axes = numpy.array(list(iters.xrange_with_skip(3, to_skip=axis)))

    # if it is only a single str, make it a singleton list
    if isinstance(new_tiff_filenames, str):
        new_tiff_filenames = [new_tiff_filenames]

    # Expand any regex in path names
    new_tiff_filenames = xglob.expand_pathname_list(*new_tiff_filenames)

    # Extract the offsets and descriptions for storage.
    new_hdf5_dataset_filenames = list()
    new_hdf5_dataset_offsets = list()

    # Determine the shape and dtype to use for the dataset (so that
    # everything will fit).
    new_hdf5_dataset_shape = numpy.zeros((3,), dtype=int)
    new_hdf5_dataset_dtype = bool
    for each_new_tiff_filename in new_tiff_filenames:
        # Add each filename.
        new_hdf5_dataset_filenames.append(each_new_tiff_filename)

        # Get all of the offsets.
        new_hdf5_dataset_offsets.append(new_hdf5_dataset_shape[axis])

        # Get the shape and type of each frame.
        each_new_tiff_file_shape, each_new_tiff_file_dtype = \
            get_multipage_tiff_shape_dtype_transformed(
                each_new_tiff_filename,
                axis_order="cztyx",
                pages_to_channel=pages_to_channel
            ).values()
        each_new_tiff_file_shape = each_new_tiff_file_shape[2:]

        # Find the increase on the merge axis. Find the largest shape for
        # the rest.
        each_new_tiff_file_shape = numpy.array(each_new_tiff_file_shape)
        new_hdf5_dataset_shape[axis] += each_new_tiff_file_shape[axis]
        new_hdf5_dataset_shape[static_axes] = numpy.array([
            new_hdf5_dataset_shape[static_axes],
            each_new_tiff_file_shape[static_axes]
        ]).max(axis=0)

        # Find the best type that everything can be cast to without loss
        # of precision.
        if not numpy.can_cast(each_new_tiff_file_dtype,
                              new_hdf5_dataset_dtype):
            if numpy.can_cast(new_hdf5_dataset_dtype,
                              each_new_tiff_file_dtype):
                new_hdf5_dataset_dtype = each_new_tiff_file_dtype
            else:
                raise Exception(
                    "Cannot find safe conversion between" +
                    " new_hdf5_dataset_dtype = " +
                    repr(new_hdf5_dataset_dtype) +
                    " and each_new_tiff_file_dtype = " +
                    repr(each_new_tiff_file_dtype) + "."
                )

    # Convert to arrays.
    new_hdf5_dataset_filenames = numpy.array(new_hdf5_dataset_filenames)
    new_hdf5_dataset_offsets = numpy.array(new_hdf5_dataset_offsets)

    # Convert to standard forms.
    new_hdf5_dataset_shape = tuple(new_hdf5_dataset_shape)
    new_hdf5_dataset_dtype = numpy.dtype(new_hdf5_dataset_dtype)

    # Get all the needed locations for the HDF5 file and dataset.
    new_hdf5_path_components = pathHelpers.PathComponents(new_hdf5_pathname)

    new_hdf5_filename = new_hdf5_path_components.externalPath
    new_hdf5_groupname = new_hdf5_path_components.internalDirectory
    new_hdf5_dataset_name = new_hdf5_path_components.internalPath

    # Dump all datasets to the file.
    with h5py.File(new_hdf5_filename, "a") as new_hdf5_file:
        new_hdf5_group = new_hdf5_file.require_group(new_hdf5_groupname)

        new_hdf5_dataset = new_hdf5_group.create_dataset(
            new_hdf5_dataset_name,
            new_hdf5_dataset_shape,
            new_hdf5_dataset_dtype,
            chunks=True
        )

        new_hdf5_dataset.attrs["filenames"] = new_hdf5_dataset_filenames
        new_hdf5_dataset.attrs["offsets"] = new_hdf5_dataset_offsets

        # Workaround required due to this issue
        # ( https://github.com/h5py/h5py/issues/289 ).
        new_hdf5_descriptions_dataset = new_hdf5_group.create_dataset(
            "_".join([new_hdf5_dataset_name, "descriptions"]),
            shape=new_hdf5_dataset_shape[0:1],
            dtype=h5py.special_dtype(vlen=unicode)
        )
        new_hdf5_dataset.attrs["descriptions"] = (
            new_hdf5_descriptions_dataset.file.filename +
            new_hdf5_descriptions_dataset.name
        )

        new_hdf5_dataset_axis_pos = 0
        for each_new_tiff_filename in new_tiff_filenames:
            # Read the data in the format specified.
            each_new_tiff_array = get_standard_tiff_array(
                each_new_tiff_filename,
                axis_order="cztyx",
                pages_to_channel=pages_to_channel
            )

            # Extract the descriptions from each page's metadata.
            each_new_tiff_description = []
            each_new_tiff_file = None
            try:
                each_new_tiff_file = libtiff.TiffFile(
                    each_new_tiff_filename, 'r'
                )
                for i in xrange(each_new_tiff_file.get_depth()):
                    metadata_i = each_new_tiff_file.IFD[i].entries_dict

                    desc_i = u""
                    try:
                        desc_i = unicode(
                            metadata_i["ImageDescription"].human()
                        )
                    except KeyError:
                        pass

                    each_new_tiff_description.append(desc_i)
            finally:
                if each_new_tiff_file:
                    each_new_tiff_file.close()
                    each_new_tiff_file = None

            each_new_tiff_description = numpy.array(each_new_tiff_description)

            # Take channel and z selection.
            # TODO: Could we drop the channel constraint?
            # TODO: Want to drop z constraint.
            each_new_tiff_array = each_new_tiff_array[channel, z_index]

            # Store into the current slice and go to the next one.
            new_hdf5_dataset_axis_pos_next = \
                new_hdf5_dataset_axis_pos + len(each_new_tiff_array)
            new_hdf5_dataset[
                new_hdf5_dataset_axis_pos:new_hdf5_dataset_axis_pos_next
            ] = each_new_tiff_array
            new_hdf5_descriptions_dataset[
                new_hdf5_dataset_axis_pos:new_hdf5_dataset_axis_pos_next
            ] = each_new_tiff_description

            new_hdf5_dataset_axis_pos = new_hdf5_dataset_axis_pos_next
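# Reading the output back (a minimal sketch; ``data.h5`` and the dataset
# paths are hypothetical and follow the usage example above). Filenames and
# offsets are stored as dataset attributes, while the descriptions live in a
# sibling ``*_descriptions`` dataset as a workaround for the h5py issue
# referenced in the code.
def _example_readback():
    with h5py.File("data.h5", "r") as new_hdf5_file:
        images = new_hdf5_file["/images/raw"][...]
        offsets = new_hdf5_file["/images/raw"].attrs["offsets"]
        filenames = new_hdf5_file["/images/raw"].attrs["filenames"]
        descriptions = new_hdf5_file["/images/raw_descriptions"][...]
        return images, offsets, filenames, descriptions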