def loadImagesAsSeries(self, dataPath, dims=None, inputFormat='stack', ext=None, dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None, recursive=False, nplanes=None, npartitions=None, renumber=False, confFilename='conf.json'): """ Load Images data as Series data. Parameters ---------- dataPath: string Path to data files or directory, as either a local filesystem path or a URI. May include a single '*' wildcard in the filename. Examples of valid dataPaths include 'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/". dims: tuple of positive int, optional (required if inputFormat is 'stack') Image dimensions. Binary stack data will be interpreted as a multidimensional array with the given dimensions, and should be stored in row-major order (Fortran or Matlab convention), where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions will be read from the image file headers. inputFormat: str, optional, default = 'stack' Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks. 'png' or 'tif' indicate image formats. Page of a multipage tif file will be extend along the third dimension. Separate files interpreted as distinct records, with ordering given by lexicographic sorting of file names. ext: string, optional, default = None File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif', and 'png' for inputFormat=="png". dtype: string or numpy dtype. optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. Ignored for 'tif' or 'png' (data will be inferred from image formats). blockSize: string or positive int, optional, default "150M" Requested size of blocks (e.g "64M", "512k", "2G"). If shuffle=True, can also be a tuple of int specifying the number of pixels or splits per dimension. Indirectly controls the number of Spark partitions, with one partition per block. blockSizeUnits: string, either "pixels" or "splits", default "pixels" Units for interpreting a tuple passed as blockSize when shuffle=True. startIdx: nonnegative int, optional, default = None Convenience parameters to read only a subset of input files. Uses python slice conventions (zero-based indexing with exclusive final position). These parameters give the starting and final index after lexicographic sorting. stopIdx: nonnegative int, optional, default = None See startIdx. recursive: boolean, optional, default = False If true, will recursively descend directories rooted at dataPath, loading all files in the tree with an appropriate extension. nplanes: positive integer, optional, default = None Subdivide individual image files. Every `nplanes` from each file will be considered a new record. With nplanes=None (the default), a single file will be considered as representing a single record. If the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. npartitions: positive int, optional, default = None Specify number of partitions for the RDD, if unspecified will use 1 partition per image. renumber: boolean, optional, default = False Recalculate keys for records after images are loading. Only necessary if different files contain different number of records (e.g. due to specifying nplanes). See Images.renumber(). confFilename : string, optional, default = 'conf.json' Name of conf file if using to specify parameters for binary stack data Returns ------- data: lambdaimage.rdds.Series A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs. Keys will be n-tuples of int, with n given by dimensionality of the images, and correspond to indexes into the image arrays. Value will have length equal to the number of image files. With each image contributing one point to this value array, with ordering given by the lexicographic ordering of image file names. """ checkParams(inputFormat, ['stack', 'tif', 'tif-stack']) if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) from lambdaimage.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if inputFormat.lower() == 'stack': images = loader.fromStack(dataPath, dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions, confFilename=confFilename) else: # tif / tif stack images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) if renumber: images = images.renumber() return images.toBlocks(blockSize, units=blockSizeUnits).toSeries()
def convertImagesToSeries(self, dataPath, outputDirPath, dims=None, inputFormat='stack', ext=None, dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None, overwrite=False, recursive=False, nplanes=None, npartitions=None, renumber=False, confFilename='conf.json'): """ Write out Images data as Series data, saved in a flat binary format. The resulting files may subsequently be read in using lambdaimageContext.loadSeries(). Loading Series data directly will likely be faster than converting image data to a Series object through loadImagesAsSeries(). Parameters ---------- dataPath: string Path to data files or directory, as either a local filesystem path or a URI. May include a single '*' wildcard in the filename. Examples of valid dataPaths include 'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/". outputDirPath: string Path to directory to write Series file output. May be either a path on the local file system or a URI-like format, such as "local/directory", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/". If the directory exists and 'overwrite' is True, the existing directory and all its contents will be deleted and overwritten. dims: tuple of positive int, optional (required if inputFormat is 'stack') Image dimensions. Binary stack data will be interpreted as a multidimensional array with the given dimensions, and should be stored in row-major order (Fortran or Matlab convention), where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions will be read from the image file headers. inputFormat: str, optional, default = 'stack' Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks. 'png' or 'tif' indicate image formats. Page of a multipage tif file will be extend along the third dimension. Separate files interpreted as distinct records, with ordering given by lexicographic sorting of file names. ext: string, optional, default = None File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif', and 'png' for inputFormat=="png". dtype: string or numpy dtype. optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. Ignored for 'tif' or 'png' (data will be inferred from image formats). blockSize: string or positive int, optional, default "150M" Requested size of blocks (e.g "64M", "512k", "2G"). If shuffle=True, can also be a tuple of int specifying the number of pixels or splits per dimension. Indirectly controls the number of Spark partitions, with one partition per block. blockSizeUnits: string, either "pixels" or "splits", default "pixels" Units for interpreting a tuple passed as blockSize when shuffle=True. startIdx: nonnegative int, optional, default = None Convenience parameters to read only a subset of input files. Uses python slice conventions (zero-based indexing with exclusive final position). These parameters give the starting and final index after lexicographic sorting. stopIdx: nonnegative int, optional, default = None See startIdx. overwrite: boolean, optional, default False If true, the directory specified by outputDirPath will be deleted (recursively) if it already exists. (Use with caution.) recursive: boolean, optional, default = False If true, will recursively descend directories rooted at dataPath, loading all files in the tree with an appropriate extension. nplanes: positive integer, optional, default = None Subdivide individual image files. Every `nplanes` from each file will be considered a new record. With nplanes=None (the default), a single file will be considered as representing a single record. If the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. npartitions: positive int, optional, default = None Specify number of partitions for the RDD, if unspecified will use 1 partition per image. renumber: boolean, optional, default = False Recalculate keys for records after images are loading. Only necessary if different files contain different number of records (e.g. due to specifying nplanes). See Images.renumber(). confFilename : string, optional, default = 'conf.json' Name of conf file if using to specify parameters for binary stack data """ checkParams(inputFormat, ['stack', 'tif', 'tif-stack']) if not overwrite: raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=self._credentials) overwrite = True # prevent additional downstream checks for this path if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) from lambdaimage.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if inputFormat.lower() == 'stack': images = loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions, confFilename=confFilename) else: # 'tif' or 'tif-stack' images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) if renumber: images = images.renumber() images.toBlocks(blockSize, units=blockSizeUnits).saveAsBinarySeries(outputDirPath, overwrite=overwrite)
def loadImages(self, dataPath, dims=None, dtype=None, inputFormat='stack', ext=None, startIdx=None, stopIdx=None, recursive=False, nplanes=None, npartitions=None, renumber=False, confFilename='conf.json'): """ Loads an Images object from data stored as a binary image stack, tif, or png files. Supports single files or multiple files, stored on a local file system, a networked file sytem (mounted and available on all nodes), Amazon S3, or Google Storage. HDFS is not currently supported for image file data. Parameters ---------- dataPath: string Path to data files or directory, as either a local filesystem path or a URI. May include a single '*' wildcard in the filename. Examples of valid dataPaths include 'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/". dims: tuple of positive int, optional (required if inputFormat is 'stack') Image dimensions. Binary stack data will be interpreted as a multidimensional array with the given dimensions, and should be stored in row-major order (Fortran or Matlab convention), where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions will be read from the image file headers. inputFormat: str, optional, default = 'stack' Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks. 'png' or 'tif' indicate image format. Page of a multipage tif file will be extend along the third dimension. Separate files interpreted as distinct records, with ordering given by lexicographic sorting of file names. ext: string, optional, default = None File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif', and 'png' for inputFormat=="png". dtype: string or numpy dtype, optional, default = 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. Ignored for 'tif' or 'png' (data will be inferred from image formats). startIdx: nonnegative int, optional, default = None Convenience parameters to read only a subset of input files. Uses python slice conventions (zero-based indexing with exclusive final position). These parameters give the starting and final index after lexicographic sorting. stopIdx: nonnegative int, optional, default = None See startIdx. recursive: boolean, optional, default = False If true, will recursively descend directories rooted at dataPath, loading all files in the tree with an appropriate extension. nplanes: positive integer, optional, default = None Subdivide individual image files. Every `nplanes` from each file will be considered a new record. With nplanes=None (the default), a single file will be considered as representing a single record. If the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. npartitions: positive int, optional, default = None Specify number of partitions for the RDD, if unspecified will use as many partitions as available cores renumber: boolean, optional, default = False Recalculate keys for records after images are loading. Only necessary if different files contain different number of records (e.g. due to specifying nplanes). See Images.renumber(). confFilename : string, optional, default = 'conf.json' Name of conf file if using to specify parameters for binary stack data Returns ------- data: lambdaimage.rdds.Images An Images object, wrapping an RDD of with (int) : (numpy array) pairs """ checkParams(inputFormat, ['stack', 'png', 'tif', 'tif-stack']) from lambdaimage.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if npartitions is None: npartitions = self._sc.defaultParallelism # Checking StartIdx is smaller or equal to StopIdx if startIdx is not None and stopIdx is not None and startIdx > stopIdx: raise Exception("Error. startIdx {} is larger than stopIdx {}".inputFormat(startIdx, stopIdx)) if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if inputFormat.lower() == 'stack': data = loader.fromStack(dataPath, dims=dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions, confFilename=confFilename) elif inputFormat.lower().startswith('tif'): data = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) else: if nplanes: raise NotImplementedError("nplanes argument is not supported for png files") data = loader.fromPng(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, npartitions=npartitions) if not renumber: return data else: return data.renumber()