def saveAsBinarySeries(self, outputDirPath, overwrite=False):
    """Write this object's data to disk in the binary Series format.

    Subclasses are *not* expected to override this method.

    The data returned by self.toBinarySeries() is handed, record by record,
    to a parallel writer chosen for outputDirPath; a Series config
    describing the key and value layout is written afterwards.

    Parameters
    ----------
    outputDirPath : string path or URI to directory to be created
        Output files will be written underneath outputDirPath. This directory
        must not yet exist (unless overwrite is True), and must be no more
        than one level beneath an existing directory. It will be created as a
        result of this call.

    overwrite : bool
        If true, outputDirPath and all its contents will be deleted and
        recreated as part of this call. If false, the path is first checked
        with raiseErrorIfPathExists.
    """
    from thunder.rdds.fileio.writers import getParallelWriterForPath
    from thunder.rdds.fileio.seriesloader import writeSeriesConfig

    if overwrite:
        allowOverwrite = overwrite
    else:
        from thunder.utils.common import raiseErrorIfPathExists
        raiseErrorIfPathExists(outputDirPath)
        # The path has just been verified, so downstream components are told
        # to overwrite and need not repeat the existence check.
        allowOverwrite = True

    writerClass = getParallelWriterForPath(outputDirPath)
    writer = writerClass(outputDirPath, overwrite=allowOverwrite)

    # One output file is written per record of the binary-series RDD.
    self.toBinarySeries().foreach(writer.writerFcn)

    # Keys have one entry per image dimension; each value has nimages entries.
    writeSeriesConfig(outputDirPath, len(self.dims), self.nimages,
                      keyType='int16', valueType=self.dtype,
                      overwrite=allowOverwrite)
def _checkOverwrite(self, outputDirPath):
    """Fail if outputDirPath already exists.

    AWS credentials are obtained from the context attached to self.rdd and
    forwarded to raiseErrorIfPathExists, which performs the existence check
    and raises ValueError when the path is already present.

    Parameters
    ----------
    outputDirPath : string
        Path or URI that is about to be written to.
    """
    from thunder.utils.aws import AWSCredentials
    from thunder.utils.common import raiseErrorIfPathExists

    raiseErrorIfPathExists(
        outputDirPath,
        awsCredentialsOverride=AWSCredentials.fromContext(self.rdd.ctx))
def saveAsBinarySeries(self, outputDirPath, blockSizeSpec="150M", units="pixels", overwrite=False):
    """Write this Images object to disk as binary Series data.

    Equivalent to
    images.toBlocks(blockSizeSpec, units=units).saveAsBinarySeries(outputDirPath, overwrite),
    except that the check for an existing output directory is made here,
    before the images are converted to blocks.

    Parameters
    ----------
    outputDirPath : string path or URI to directory to be created
        Output files will be written underneath outputDirPath. This directory
        must not yet exist (unless overwrite is True), and must be no more
        than one level beneath an existing directory. It will be created as a
        result of this call.

    blockSizeSpec : string memory size, tuple of positive int, or instance of BlockingStrategy
        A string is interpreted as a memory size (e.g. "64M"); the resulting
        Series data files are generated by a SimpleBlockingStrategy to be
        close to the requested size. A tuple of positive ints is interpreted
        as either "pixels per dimension" (default) or "splits per dimension",
        depending on `units`; its length must match the dimensionality of
        this Images object. A BlockingStrategy instance is used as given.

    units : string, either "pixels" or "splits" (or unique prefix of each, such as "s"), default "pixels"
        Units used to interpret a tuple passed as blockSizeSpec. Has no
        effect if blockSizeSpec is a string or a BlockingStrategy instance.

    overwrite : bool
        If true, outputDirPath and all its contents will be deleted and
        recreated as part of this call.

    Returns
    -------
    no return value
    """
    if overwrite:
        allowOverwrite = overwrite
    else:
        from thunder.utils.common import raiseErrorIfPathExists
        raiseErrorIfPathExists(outputDirPath)
        # Already verified here; tell the blocks writer not to check again.
        allowOverwrite = True

    blocks = self.toBlocks(blockSizeSpec, units=units)
    blocks.saveAsBinarySeries(outputDirPath, overwrite=allowOverwrite)
def convertImagesToSeries(self, dataPath, outputDirPath, dims=None, inputFormat='stack', ext=None,
                          dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None,
                          shuffle=True, overwrite=False, recursive=False, nplanes=None, npartitions=None,
                          renumber=False, confFilename='conf.json'):
    """
    Write out Images data as Series data, saved in a flat binary format.

    The resulting files may subsequently be read in using
    ThunderContext.loadSeries(). Loading Series data directly will likely be
    faster than converting image data to a Series object through
    loadImagesAsSeries().

    Parameters
    ----------
    dataPath: string
        Path to data files or directory, as either a local filesystem path or
        a URI. May include a single '*' wildcard in the filename. Examples of
        valid dataPaths include "local/directory/*.stack",
        "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

    outputDirPath: string
        Path to directory to write Series file output. May be either a path
        on the local file system or a URI-like format, such as
        "local/directory", "s3n:///my-s3-bucket/data/", or
        "file:///mnt/another/directory/". If overwrite is False the path is
        checked with raiseErrorIfPathExists before any work is done. If the
        directory exists and overwrite is True, the existing directory and
        all its contents will be deleted and overwritten.

    dims: tuple of positive int, optional
        Image dimensions for binary stack input. Stack data is interpreted as
        a multidimensional array with the given dimensions, stored in
        column-major order (Fortran or Matlab convention), where the first
        dimension changes most rapidly. Not used for tif input, where
        dimensions are read from the image file headers. This method does
        not itself validate dims; it is passed through to the loader.
        NOTE(review): presumably dims may instead come from the conf file
        named by confFilename - confirm against ImagesLoader.fromStack.

    inputFormat: str, optional, default = 'stack'
        Expected format of the input data: 'stack', 'tif', or 'tif-stack'
        (the only values admitted by the checkParams call). 'stack' indicates
        flat binary stacks; 'tif' and 'tif-stack' are handled identically and
        indicate tif images. Pages of a multipage tif file will be extended
        along the third dimension. Separate files are interpreted as distinct
        records, with ordering given by lexicographic sorting of file names.

    ext: string, optional, default = None
        File extension. If not given, it is looked up in DEFAULT_EXTENSIONS
        using the lower-cased inputFormat.

    dtype: string or numpy dtype. optional, default 'int16'
        Data type of the image files to be loaded, specified as a numpy
        "dtype" string. Only passed on for 'stack' input; for tif input the
        data type is inferred from the image files.

    blockSize: string or positive int, optional, default "150M"
        Requested size of blocks (e.g "64M", "512k", "2G"). If shuffle=True,
        can also be a tuple of int specifying the number of pixels or splits
        per dimension. Indirectly controls the number of Spark partitions,
        with one partition per block.

    blockSizeUnits: string, either "pixels" or "splits", default "pixels"
        Units for interpreting a tuple passed as blockSize. Only used when
        shuffle=True.

    startIdx: nonnegative int, optional, default = None
        Convenience parameter to read only a subset of input files. Uses
        python slice conventions (zero-based indexing with exclusive final
        position). startIdx and stopIdx give the starting and final index
        after lexicographic sorting.

    stopIdx: nonnegative int, optional, default = None
        See startIdx.

    shuffle: boolean, optional, default = True
        Controls whether the conversion from Images to Series formats will
        make use of a Spark shuffle-based method.

    overwrite: boolean, optional, default False
        If true, the directory specified by outputDirPath will be deleted
        (recursively) if it already exists. (Use with caution.)

    recursive: boolean, optional, default = False
        If true, will recursively descend directories rooted at dataPath,
        loading all files in the tree with an appropriate extension.

    nplanes: positive integer, optional, default = None
        Subdivide individual image files. Every `nplanes` from each file will
        be considered a new record. With nplanes=None (the default), a single
        file will be considered as representing a single record. If the
        number of records per file is not the same across all files, then
        `renumber` should be set to True to ensure consistent keys.
        Only supported with shuffle=True.

    npartitions: positive int, optional, default = None
        Specify number of partitions for the RDD; if unspecified will use 1
        partition per image. Only supported with shuffle=True.

    renumber: boolean, optional, default = False
        Recalculate keys for records after images are loaded. Only necessary
        if different files contain different numbers of records (e.g. due to
        specifying nplanes). Only acted upon when shuffle=True. See
        Images.renumber().

    confFilename : string, optional, default = 'conf.json'
        Name of conf file if using one to specify parameters for binary stack
        data. Only passed on for 'stack' input with shuffle=True.

    Raises
    ------
    NotImplementedError
        If shuffle is False and either nplanes or npartitions is given.
    """
    checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])
    # Lower-case once; every later comparison and lookup uses this form.
    fmt = inputFormat.lower()

    if not overwrite:
        raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=self._credentials)
        overwrite = True  # prevent additional downstream checks for this path

    if not ext:
        ext = DEFAULT_EXTENSIONS.get(fmt, None)

    if shuffle:
        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)
        if fmt == 'stack':
            images = loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, startIdx=startIdx, stopIdx=stopIdx,
                                      recursive=recursive, nplanes=nplanes, npartitions=npartitions,
                                      confFilename=confFilename)
        else:
            # 'tif' or 'tif-stack'
            images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                    recursive=recursive, nplanes=nplanes, npartitions=npartitions)
        if renumber:
            images = images.renumber()
        images.toBlocks(blockSize, units=blockSizeUnits).saveAsBinarySeries(outputDirPath, overwrite=overwrite)
    else:
        from thunder.rdds.fileio.seriesloader import SeriesLoader
        if nplanes is not None:
            raise NotImplementedError("nplanes is not supported with shuffle=False")
        if npartitions is not None:
            raise NotImplementedError("npartitions is not supported with shuffle=False")

        loader = SeriesLoader(self._sc)
        if fmt == 'stack':
            loader.saveFromStack(dataPath, outputDirPath, dims, ext=ext, dtype=dtype,
                                 blockSize=blockSize, overwrite=overwrite, startIdx=startIdx,
                                 stopIdx=stopIdx, recursive=recursive)
        else:
            # 'tif' or 'tif-stack'
            loader.saveFromTif(dataPath, outputDirPath, ext=ext, blockSize=blockSize,
                               startIdx=startIdx, stopIdx=stopIdx, overwrite=overwrite,
                               recursive=recursive)
def _checkOverwrite(self, outputDirPath):
    """Run the path-exists check on outputDirPath.

    Delegates to raiseErrorIfPathExists, supplying this object's
    awsCredentialsOverride attribute as the credentials to use.

    Parameters
    ----------
    outputDirPath : string
        Path or URI that is about to be written to.
    """
    from thunder.utils.common import raiseErrorIfPathExists

    credentials = self.awsCredentialsOverride
    raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=credentials)
def convertImagesToSeries(self, dataPath, outputDirPath, dims=None, inputFormat='stack', ext=None,
                          dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None,
                          shuffle=True, overwrite=False, recursive=False, nplanes=None, npartitions=None,
                          renumber=False):
    """
    Convert image files into Series data stored in the flat binary Series format.

    Files written here can later be read back with loadSeries(); the
    resulting Series object will be equivalent to that produced by
    loadImagesAsSeries(). Reading the binary Series format directly with
    loadSeries() is expected to be faster than converting images through
    loadImagesAsSeries().

    Parameters
    ----------
    dataPath: string
        Data files or directory to read, given either as a local filesystem
        path or in a URI-like format including scheme. A single '*' wildcard
        may appear in the filename. Examples: "a/local/relative/directory/*.stack",
        "s3n:///my-s3-bucket/data/mydatafile.tif",
        "/mnt/my/absolute/data/directory/", or
        "file:///mnt/another/data/directory/".

    outputDirPath: string
        Directory into which the Series files are written, as a local path or
        URI-like string, e.g. "a/relative/directory/",
        "s3n:///my-s3-bucket/data/myoutput/", or "file:///mnt/a/new/directory/".
        If the directory already exists and overwrite is False, a ValueError
        is thrown. If it exists and overwrite is True, the directory and all
        its contents are deleted and overwritten.

    dims: tuple of positive int, optional (but required if inputFormat is 'stack')
        Dimensions of the input image data, for instance (1024, 1024, 48).
        The first entry is the dimension that changes most rapidly on disk
        (column-major ordering). Given dims of (x, y, z), a binary stack file
        is ordered as [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0),
        (x0, y1, z0), ..., (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)].
        numpy by default lists the fastest-changing dimension last
        (row-major ordering), so an array `ary` with
        `ary.shape == (z, y, x)` written by `ary.tofile("myarray.stack")`
        corresponds to dims of (x, y, z). For tif input dims is ignored;
        dimensions are read from the tif file headers.

    inputFormat: {'stack', 'tif', 'tif-stack'}. optional, default 'stack'
        Expected format of the input data. 'stack' indicates flat files of
        raw binary data; 'tif' (and 'tif-stack', which is handled
        identically) indicates greyscale / luminance TIF images. Each page of
        a multipage tif file is interpreted as a separate z-plane. Separate
        files are interpreted as distinct time points, ordered by
        lexicographic sorting of file names.

    ext: string, optional, default None
        Extension required on data files to be loaded. If not given, it is
        looked up in DEFAULT_EXTENSIONS using the lower-cased inputFormat.

    dtype: string or numpy dtype. optional, default 'int16'
        Data type of the image files, as a numpy "dtype" string. Ignored for
        tif input, where the data type is read from the tif headers.

    blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int, tuple of positive int,
        or instance of BlockingStrategy. optional, default "150M"
        Requested size of individual output files in bytes (or kilobytes,
        megabytes, gigabytes). May also be a BlockingStrategy instance, or a
        tuple of int giving the number of pixels or of splits per dimension
        (see blockSizeUnits). Indirectly controls the number of Spark
        partitions, with one partition used per block created.

    blockSizeUnits: string, either "pixels" or "splits" (or unique prefix of each, such as "s"), default "pixels"
        Units used to interpret a tuple passed as blockSize when
        shuffle=True. Has no effect if blockSize is a string or a
        BlockingStrategy instance, or if shuffle=False.

    startIdx: nonnegative int, optional
        startIdx and stopIdx allow only a subset of input files to be read.
        They give the starting index (inclusive) and final index (exclusive)
        into the lexicographically sorted list of files matching dataPath,
        following python slice conventions. For example, startIdx=None and
        stopIdx=10 reads only the first 10 files; startIdx=2 and stopIdx=3
        reads only the third file.

    stopIdx: nonnegative int, optional
        See startIdx.

    shuffle: boolean, optional, default True
        Controls whether the conversion from Images to Series formats will
        make use of a Spark shuffle-based method.

    overwrite: boolean, optional, default False
        If true, the directory specified by outputDirPath will first be
        deleted, along with all its contents, if it already exists. (Use with
        caution.) If false, a ValueError will be thrown if outputDirPath is
        found to already exist.

    recursive: boolean, default False
        If true, will recursively descend directories rooted at dataPath,
        loading all files in the tree that have an appropriate extension.
        Recursive loading is currently only implemented for local filesystems
        (not s3), and only with shuffle=True.

    nplanes: positive integer, default None
        If passed, a single image file is subdivided into multiple records:
        every `nplanes` z-planes (or multipage tif pages) form a new record,
        numbered consecutively through the first file, then the second, and
        so on. With nplanes=None (the default) each file is a single record.
        Keys are calculated assuming all files contain the same number of
        records; otherwise set `renumber` to True to ensure consistent keys.
        nplanes is only supported for shuffle=True (the default).

    npartitions: positive int, optional
        If specified, request a certain number of partitions for the
        underlying Spark RDD. Default is 1 partition per image file. Only
        applies when shuffle=True.

    renumber: boolean, optional, default False
        If true, the keys for each record are explicitly recalculated after
        all images are loaded. This should only be necessary when different
        files contain different numbers of records. renumber is only
        supported for shuffle=True (the default). See Images.renumber().
    """
    checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])
    fmt = inputFormat.lower()
    fromStack = fmt == 'stack'

    if fromStack and not dims:
        raise ValueError("Dimensions ('dims' parameter) must be specified if loading from binary image stack" +
                         " ('stack' value for 'inputFormat' parameter)")

    if not overwrite:
        raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=self._credentials)
        # Verified once here, so downstream writers may skip their own checks.
        overwrite = True

    ext = ext or DEFAULT_EXTENSIONS.get(fmt, None)

    if not shuffle:
        # Direct conversion without a Spark shuffle; a reduced option set applies.
        from thunder.rdds.fileio.seriesloader import SeriesLoader
        if nplanes is not None:
            raise NotImplementedError("nplanes is not supported with shuffle=False")
        if npartitions is not None:
            raise NotImplementedError("npartitions is not supported with shuffle=False")

        seriesLoader = SeriesLoader(self._sc)
        if fromStack:
            seriesLoader.saveFromStack(dataPath, outputDirPath, dims, ext=ext, dtype=dtype,
                                       blockSize=blockSize, overwrite=overwrite,
                                       startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        else:
            # 'tif' or 'tif-stack'
            seriesLoader.saveFromTif(dataPath, outputDirPath, ext=ext, blockSize=blockSize,
                                     startIdx=startIdx, stopIdx=stopIdx, overwrite=overwrite,
                                     recursive=recursive)
        return

    # Shuffle-based conversion: load as Images, cut into blocks, write the blocks.
    from thunder.rdds.fileio.imagesloader import ImagesLoader
    imagesLoader = ImagesLoader(self._sc)
    if fromStack:
        images = imagesLoader.fromStack(dataPath, dims, ext=ext, dtype=dtype,
                                        startIdx=startIdx, stopIdx=stopIdx, recursive=recursive,
                                        nplanes=nplanes, npartitions=npartitions)
    else:
        # 'tif' or 'tif-stack'
        images = imagesLoader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                      recursive=recursive, nplanes=nplanes, npartitions=npartitions)
    if renumber:
        images = images.renumber()

    blocks = images.toBlocks(blockSize, units=blockSizeUnits)
    blocks.saveAsBinarySeries(outputDirPath, overwrite=overwrite)
def saveAsBinarySeries(self, outputdirname, overwrite=False):
    """Write this Series object to disk as binary Series files.

    Files are produced using the current partitioning of this Series object:
    each non-empty partition becomes one output file (so if
    mySeries.rdd.getNumPartitions() == 5 and no partition is empty, 5 files
    are written). Records are not resorted; the file name for a partition is
    derived from the key of the first record in that partition.

    If the Series object is already sorted and no records have been removed
    by filtering, the output should be equivalent to what one would get from
    calling myImages.saveAsBinarySeries().

    If all one wishes to do is to save out Images data in a binary series
    format, then tsc.convertImagesToSeries() will likely be more efficient
    than tsc.loadImages().toSeries().saveAsBinarySeries().

    Parameters
    ----------
    outputdirname : string path or URI to directory to be created
        Output files will be written underneath outputdirname. This directory
        must not yet exist (unless overwrite is True), and must be no more
        than one level beneath an existing directory. It will be created as a
        result of this call.

    overwrite : bool
        If true, outputdirname and all its contents will be deleted and
        recreated as part of this call.
    """
    import cStringIO as StringIO
    import struct
    from thunder.rdds.imgblocks.blocks import SimpleBlocks
    from thunder.rdds.fileio.writers import getParallelWriterForPath
    from thunder.rdds.fileio.seriesloader import writeSeriesConfig

    if not overwrite:
        from thunder.utils.common import raiseErrorIfPathExists
        raiseErrorIfPathExists(outputdirname)
        # Checked once here, so downstream writers need not check again.
        overwrite = True

    def partitionToBinarySeries(kvIter):
        """Concatenate every record of one partition into a single (filename, bytes) record."""
        packKey = None
        leadKey = None
        out = StringIO.StringIO()
        for key, values in kvIter:
            if packKey is None:
                # The packer is built lazily from the first key seen: one
                # struct 'h' field per key element.
                packKey = struct.Struct('h' * len(key)).pack
                leadKey = key
            out.write(packKey(*key))
            out.write(values.tostring())
        payload = out.getvalue()
        out.close()

        # An empty partition never sets leadKey and contributes no file.
        if leadKey is None:
            return iter([])
        fileName = SimpleBlocks.getBinarySeriesNameForKey(leadKey) + ".bin"
        return iter([(fileName, payload)])

    writerClass = getParallelWriterForPath(outputdirname)
    writer = writerClass(outputdirname, overwrite=overwrite)

    packedRdd = self.rdd.mapPartitions(partitionToBinarySeries)
    packedRdd.foreach(writer.writerFcn)

    # TODO: only the number of keys and number of values are needed here; these could in principle
    # be cached in _nkeys and _nvals attributes, removing the need for this .first() call in most cases.
    sampleKey, sampleVal = self.first()
    writeSeriesConfig(outputdirname, len(sampleKey), len(sampleVal),
                      keyType='int16', valueType=self.dtype, overwrite=overwrite)
def convertImagesToSeries(self, dataPath, outputDirPath, dims=None, inputFormat='stack', ext=None,
                          dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None,
                          shuffle=True, overwrite=False, recursive=False, nplanes=None, npartitions=None,
                          renumber=False):
    """
    Write out Images data as Series data, saved in a flat binary format.

    The Series files produced may be read back using loadSeries(), yielding a
    Series object equivalent to the one loadImagesAsSeries() would generate.
    Loading directly from the flat binary Series format with loadSeries() is
    expected to be faster than converting image data through
    loadImagesAsSeries().

    Parameters
    ----------
    dataPath: string
        Path to data files or directory, either a local filesystem path or a
        URI-like string including scheme. May include a single '*' wildcard
        character in the filename. Valid examples include
        "a/local/relative/directory/*.stack",
        "s3n:///my-s3-bucket/data/mydatafile.tif",
        "/mnt/my/absolute/data/directory/", or
        "file:///mnt/another/data/directory/".

    outputDirPath: string
        Path to the directory that will receive Series file output; a local
        path or a URI-like format, as for dataPath. Valid examples include
        "a/relative/directory/", "s3n:///my-s3-bucket/data/myoutput/", or
        "file:///mnt/a/new/directory/". If this directory already exists and
        overwrite is False, this method throws a ValueError. If it exists and
        overwrite is True, the existing directory and all its contents are
        deleted and overwritten.

    dims: tuple of positive int, optional (but required if inputFormat is 'stack')
        Dimensions of input image data, for instance (1024, 1024, 48). Binary
        stack data is interpreted as a multidimensional array of these
        dimensions, with the first dimension changing most rapidly on disk
        (column-major ordering): for dims of (x, y, z) the file holds
        [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), ...,
        (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. numpy's default
        layout is the opposite (row-major, fastest-changing dimension listed
        last), so a numpy array `ary` with `ary.shape == (z, y, x)` written
        by `ary.tofile("myarray.stack")` needs dims of (x, y, z). Ignored for
        tif input, where dimensions are read from the tif file headers.

    inputFormat: {'stack', 'tif', 'tif-stack'}. optional, default 'stack'
        Expected format of the input data. 'stack' indicates flat files of
        raw binary data; 'tif' indicates greyscale / luminance TIF images,
        and 'tif-stack' is accepted and treated the same way. Each page of a
        multipage tif file is interpreted as a separate z-plane. Separate
        files are interpreted as distinct time points, ordered by
        lexicographic sorting of file names.

    ext: string, optional, default None
        Extension required on data files to be loaded. When omitted, the
        value is taken from DEFAULT_EXTENSIONS for the lower-cased
        inputFormat.

    dtype: string or numpy dtype. optional, default 'int16'
        Data type of the image files to be loaded, as a numpy "dtype" string.
        Ignored for tif input; the data type is read from the tif headers.

    blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int, tuple of positive int,
        or instance of BlockingStrategy. optional, default "150M"
        Requested size of individual output files in bytes (or kilobytes,
        megabytes, gigabytes). Can also be a BlockingStrategy instance, or a
        tuple of int specifying either the number of pixels or of splits per
        dimension, depending on blockSizeUnits. Also indirectly controls the
        number of Spark partitions, with one partition per block created.

    blockSizeUnits: string, either "pixels" or "splits" (or unique prefix of each, such as "s"), default "pixels"
        Units used in interpreting a tuple passed as blockSize when
        shuffle=True. No effect if blockSize is a string or BlockingStrategy
        instance, or if shuffle=False.

    startIdx: nonnegative int, optional
        Together with stopIdx, selects a subset of input files. The two give
        the starting (inclusive) and final (exclusive) indices into the
        lexicographically sorted list of files matching dataPath, using
        python slice conventions. E.g. startIdx=None with stopIdx=10 reads
        the first 10 files; startIdx=2 with stopIdx=3 reads only the third.

    stopIdx: nonnegative int, optional
        See startIdx.

    shuffle: boolean, optional, default True
        Controls whether the conversion from Images to Series formats will
        make use of a Spark shuffle-based method.

    overwrite: boolean, optional, default False
        If true, the directory specified by outputDirPath will first be
        deleted, along with all its contents, if it already exists. (Use with
        caution.) If false, a ValueError is thrown if outputDirPath already
        exists.

    recursive: boolean, default False
        If true, recursively descend directories rooted at dataPath, loading
        all files in the tree that have an appropriate extension. Currently
        only implemented for local filesystems (not s3), and only with
        shuffle=True.

    nplanes: positive integer, default None
        If passed, each image file is subdivided into multiple records, one
        per `nplanes` z-planes (or multipage tif pages), numbered
        consecutively within a file and continuing across files. With
        nplanes=None (the default) a file is a single record. Keys assume all
        files contain the same number of records; if they do not, set
        `renumber` to True. Only supported for shuffle=True (the default).

    npartitions: positive int, optional
        If specified, request a certain number of partitions for the
        underlying Spark RDD. Default is 1 partition per image file. Only
        applies when shuffle=True.

    renumber: boolean, optional, default False
        If true, record keys are explicitly recalculated after all images are
        loaded. Should only be necessary when different files contain
        different numbers of records. Only supported for shuffle=True (the
        default). See Images.renumber().
    """
    checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])
    formatName = inputFormat.lower()
    isStack = (formatName == 'stack')

    if isStack and not dims:
        raise ValueError("Dimensions ('dims' parameter) must be specified if loading from binary image stack" +
                         " ('stack' value for 'inputFormat' parameter)")

    if not overwrite:
        raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=self.awsCredentials)
        overwrite = True  # prevent additional downstream checks for this path

    if not ext:
        ext = DEFAULT_EXTENSIONS.get(formatName, None)

    if shuffle:
        from thunder.rdds.fileio.imagesloader import ImagesLoader
        # Keyword arguments common to both image-loading entry points.
        loadArgs = dict(ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive,
                        nplanes=nplanes, npartitions=npartitions)
        imagesLoader = ImagesLoader(self._sc)
        if isStack:
            images = imagesLoader.fromStack(dataPath, dims, dtype=dtype, **loadArgs)
        else:
            # 'tif' or 'tif-stack'
            images = imagesLoader.fromTif(dataPath, **loadArgs)
        if renumber:
            images = images.renumber()
        images.toBlocks(blockSize, units=blockSizeUnits).saveAsBinarySeries(outputDirPath, overwrite=overwrite)
    else:
        from thunder.rdds.fileio.seriesloader import SeriesLoader
        if nplanes is not None:
            raise NotImplementedError("nplanes is not supported with shuffle=False")
        if npartitions is not None:
            raise NotImplementedError("npartitions is not supported with shuffle=False")

        # Keyword arguments common to both direct-save entry points.
        saveArgs = dict(ext=ext, blockSize=blockSize, overwrite=overwrite,
                        startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        seriesLoader = SeriesLoader(self._sc)
        if isStack:
            seriesLoader.saveFromStack(dataPath, outputDirPath, dims, dtype=dtype, **saveArgs)
        else:
            # 'tif' or 'tif-stack'
            seriesLoader.saveFromTif(dataPath, outputDirPath, **saveArgs)